1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.deri.any23.extractor.html;
19
20 import org.deri.any23.extractor.ErrorReporter;
21 import org.deri.any23.extractor.ExtractionContext;
22 import org.deri.any23.extractor.ExtractionException;
23 import org.deri.any23.extractor.ExtractionParameters;
24 import org.deri.any23.extractor.ExtractionResult;
25 import org.deri.any23.extractor.Extractor;
26 import org.deri.any23.extractor.ExtractorDescription;
27 import org.deri.any23.extractor.ExtractorFactory;
28 import org.deri.any23.extractor.SimpleExtractorFactory;
29 import org.deri.any23.extractor.rdf.RDFParserFactory;
30 import org.deri.any23.rdf.PopularPrefixes;
31 import org.openrdf.model.URI;
32 import org.openrdf.rio.RDFParseException;
33 import org.openrdf.rio.turtle.TurtleParser;
34 import org.w3c.dom.Document;
35 import org.w3c.dom.Node;
36
37 import java.io.IOException;
38 import java.io.StringReader;
39 import java.util.Arrays;
40 import java.util.List;
41
42
43
44
45
46
47
48
49
50 public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
51
52 public final static String NAME = "html-script-turtle";
53
54 public final static ExtractorFactory<TurtleHTMLExtractor> factory =
55 SimpleExtractorFactory.create(
56 NAME,
57 PopularPrefixes.get(),
58 Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
59 null,
60 TurtleHTMLExtractor.class
61 );
62
63 private TurtleParser turtleParser;
64
65 public void run(
66 ExtractionParameters extractionParameters,
67 ExtractionContext extractionContext,
68 Document in,
69 ExtractionResult out
70 ) throws IOException, ExtractionException {
71 List<Node> scriptNodes;
72 HTMLDocument htmlDocument = new HTMLDocument(in);
73 final URI documentURI = extractionContext.getDocumentURI();
74
75 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
76 processScriptNodes(documentURI, extractionContext, out, scriptNodes);
77
78 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
79 processScriptNodes(documentURI, extractionContext, out, scriptNodes);
80
81 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
82 processScriptNodes(documentURI, extractionContext,out, scriptNodes);
83 }
84
85 public ExtractorDescription getDescription() {
86 return factory;
87 }
88
89
90
91
92
93
94
95
96 private void processScriptNodes(URI documentURI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
97 if(ns.size() > 0 && turtleParser == null) {
98 turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
99 }
100 for(Node n : ns) {
101 processScriptNode(turtleParser, documentURI, n, er);
102 }
103 }
104
105
106
107
108
109
110
111
112
113 private void processScriptNode(TurtleParser turtleParser, URI documentURI, Node n, ExtractionResult er) {
114 final Node idAttribute = n.getAttributes().getNamedItem("id");
115 final String graphName =
116 documentURI.stringValue() +
117 ( idAttribute == null ? "" : "#" + idAttribute.getTextContent() );
118 try {
119 turtleParser.parse( new StringReader(n.getTextContent()), graphName );
120 } catch (RDFParseException rdfpe) {
121 er.notifyError(
122 ErrorReporter.ErrorLevel.ERROR,
123 String.format(
124 "An error occurred while parsing turtle content within script node: %s",
125 Arrays.toString( DomUtils.getXPathListForNode(n) )
126 ),
127 rdfpe.getLineNumber(), rdfpe.getColumnNumber()
128 );
129 } catch (Exception e) {
130 er.notifyError(ErrorReporter.ErrorLevel.ERROR, "An error occurred while processing RDF data.", -1 , -1);
131 }
132 }
133
134 }