1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.html;
18
19 import org.deri.any23.extractor.ExtractionContext;
20 import org.deri.any23.extractor.ExtractionException;
21 import org.deri.any23.extractor.ExtractionParameters;
22 import org.deri.any23.extractor.ExtractionResult;
23 import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
24 import org.deri.any23.extractor.ExtractorDescription;
25 import org.deri.any23.extractor.ExtractorFactory;
26 import org.deri.any23.extractor.SimpleExtractorFactory;
27 import org.deri.any23.rdf.PopularPrefixes;
28 import org.deri.any23.vocab.XHTML;
29 import org.openrdf.model.URI;
30 import org.openrdf.model.ValueFactory;
31 import org.openrdf.model.impl.ValueFactoryImpl;
32 import org.w3c.dom.Document;
33 import org.w3c.dom.Node;
34
35 import java.io.IOException;
36 import java.util.Arrays;
37 import java.util.List;
38
39
40
41
42
43 public class HeadLinkExtractor implements TagSoupDOMExtractor {
44
45 public void run(
46 ExtractionParameters extractionParameters,
47 ExtractionContext extractionContext,
48 Document in,
49 ExtractionResult out
50 ) throws IOException, ExtractionException {
51 HTMLDocument html = new HTMLDocument(in);
52 ValueFactory vf = ValueFactoryImpl.getInstance();
53
54 final List<Node> headLinkNodes = DomUtils.findAll(
55 in,
56 "/HTML/HEAD/LINK[(" +
57 "@type='application/rdf+xml' or " +
58 "@type='text/rdf' or " +
59 "@type='application/x-turtle' or " +
60 "@type='application/turtle' or " +
61 "@type='text/turtle' or " +
62 "@type='text/rdf+n3'" +
63 ") and @href and @rel]"
64 );
65 for (Node node : headLinkNodes) {
66 final URI href = html.resolveURI(DomUtils.find(node, "@href"));
67 final String rel = DomUtils.find(node, "@rel");
68 out.writeTriple(
69 extractionContext.getDocumentURI(),
70 vf.createURI(XHTML.NS + rel),
71 href
72 );
73 final String title = DomUtils.find(node, "@title");
74 if (title != null && !"".equals(title)) {
75 out.writeTriple(
76 href,
77 factory.getPrefixes().expand("dcterms:title"),
78 vf.createLiteral(title)
79 );
80 }
81 final String type = DomUtils.find(node, "@type");
82 if (type != null && !"".equals(type)) {
83 out.writeTriple(
84 href,
85 factory.getPrefixes().expand("dcterms:format"),
86 vf.createLiteral(type)
87 );
88 }
89 }
90 }
91
92 public ExtractorDescription getDescription() {
93 return factory;
94 }
95
96 public final static ExtractorFactory<HeadLinkExtractor> factory =
97 SimpleExtractorFactory.create(
98 "html-head-links",
99 PopularPrefixes.createSubset("xhtml", "dcterms"),
100 Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
101 null,
102 HeadLinkExtractor.class);
103 }