1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.deri.any23.extractor.xpath;
19
20 import org.deri.any23.extractor.ExtractionContext;
21 import org.deri.any23.extractor.ExtractionException;
22 import org.deri.any23.extractor.ExtractionParameters;
23 import org.deri.any23.extractor.ExtractionResult;
24 import org.deri.any23.extractor.Extractor;
25 import org.deri.any23.extractor.ExtractorDescription;
26 import org.deri.any23.extractor.ExtractorFactory;
27 import org.deri.any23.extractor.SimpleExtractorFactory;
28 import org.openrdf.model.URI;
29 import org.w3c.dom.Document;
30
31 import java.io.IOException;
32 import java.util.ArrayList;
33 import java.util.Arrays;
34 import java.util.List;
35
36
37
38
39
40
41
42
43 public class XPathExtractor implements Extractor.TagSoupDOMExtractor {
44
45 public final static String NAME = "html-xpath";
46
47 public final static ExtractorFactory<XPathExtractor> factory =
48 SimpleExtractorFactory.create(
49 NAME,
50 null,
51 Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
52 null,
53 XPathExtractor.class
54 );
55
56 private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<XPathExtractionRule>();
57
58 public XPathExtractor(List<XPathExtractionRule> rules) {
59 xPathExtractionRules.addAll(rules);
60 }
61
62 public void add(XPathExtractionRule rule) {
63 xPathExtractionRules.add(rule);
64 }
65
66 public void remove(XPathExtractionRule rule) {
67 xPathExtractionRules.remove(rule);
68 }
69
70 public boolean contains(XPathExtractionRule rule) {
71 return xPathExtractionRules.contains(rule);
72 }
73
74 public void run(
75 ExtractionParameters extractionParameters,
76 ExtractionContext extractionContext,
77 Document in,
78 ExtractionResult out
79 )
80 throws IOException, ExtractionException {
81 final URI documentURI = extractionContext.getDocumentURI();
82 for(XPathExtractionRule rule : xPathExtractionRules) {
83 if(rule.acceptURI(documentURI)) {
84 rule.process(in, out);
85 }
86 }
87 }
88
89 public ExtractorDescription getDescription() {
90 return factory;
91 }
92
93 }