View Javadoc

1   /**
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  
18  package org.deri.any23.extractor.html;
19  
20  import org.deri.any23.extractor.ErrorReporter;
21  import org.deri.any23.extractor.ExtractionContext;
22  import org.deri.any23.extractor.ExtractionException;
23  import org.deri.any23.extractor.ExtractionParameters;
24  import org.deri.any23.extractor.ExtractionResult;
25  import org.deri.any23.extractor.Extractor;
26  import org.deri.any23.extractor.ExtractorDescription;
27  import org.deri.any23.extractor.ExtractorFactory;
28  import org.deri.any23.extractor.SimpleExtractorFactory;
29  import org.deri.any23.extractor.rdf.RDFParserFactory;
30  import org.deri.any23.rdf.PopularPrefixes;
31  import org.openrdf.model.URI;
32  import org.openrdf.rio.RDFParseException;
33  import org.openrdf.rio.turtle.TurtleParser;
34  import org.w3c.dom.Document;
35  import org.w3c.dom.Node;
36  
37  import java.io.IOException;
38  import java.io.StringReader;
39  import java.util.Arrays;
40  import java.util.List;
41  
42  /**
43   * Extractor for <i>Turtle/N3</i> format embedded within <i>HTML</i>
44   * <i>script</i> tags.
45   *
46   * See specification draft <a href="http://esw.w3.org/N3inHTML">here</a>. 
47   *
48   * @author Michele Mostarda (mostarda@fbk.eu)
49   */
50  public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
51  
52      public final static String NAME = "html-script-turtle";
53  
54      public final static ExtractorFactory<TurtleHTMLExtractor> factory =
55              SimpleExtractorFactory.create(
56                      NAME,
57                      PopularPrefixes.get(),
58                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
59                      null,
60                      TurtleHTMLExtractor.class
61              );
62  
63      private TurtleParser turtleParser;
64  
65      public void run(
66              ExtractionParameters extractionParameters,
67              ExtractionContext extractionContext,
68              Document in,
69              ExtractionResult out
70      ) throws IOException, ExtractionException {
71          List<Node> scriptNodes;
72          HTMLDocument htmlDocument = new HTMLDocument(in);
73          final URI documentURI = extractionContext.getDocumentURI();
74  
75          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
76          processScriptNodes(documentURI, extractionContext, out, scriptNodes);
77  
78          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
79          processScriptNodes(documentURI, extractionContext, out, scriptNodes);
80  
81          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
82          processScriptNodes(documentURI, extractionContext,out, scriptNodes);
83      }
84  
85      public ExtractorDescription getDescription() {
86          return factory;
87      }
88  
89      /**
90       * Processes a list of <i>html script</i> nodes retrieving the N3 / Turtle content.
91       *
92       * @param documentURI the URI of the original HTML document.
93       * @param er the extraction result used to store triples.
94       * @param ns the list of script nodes.
95       */
96      private void processScriptNodes(URI documentURI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
97          if(ns.size() > 0 && turtleParser == null) {
98              turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
99          }
100         for(Node n : ns) {
101             processScriptNode(turtleParser, documentURI, n, er);
102         }
103     }
104 
105     /**
106      * Processes a single <i>html script</i> node.
107      *
108      * @param turtleParser the parser used to digest node content.
109      * @param documentURI the URI of the original HTML document.
110      * @param n the script node.
111      * @param er the extraction result used to store triples.
112      */
113     private void processScriptNode(TurtleParser turtleParser, URI documentURI, Node n, ExtractionResult er) {
114         final Node idAttribute = n.getAttributes().getNamedItem("id");
115         final String graphName =
116                 documentURI.stringValue() +
117                 ( idAttribute == null ? "" : "#" +   idAttribute.getTextContent() ); 
118         try {
119             turtleParser.parse( new StringReader(n.getTextContent()), graphName );
120         } catch (RDFParseException rdfpe) {
121             er.notifyError(
122                     ErrorReporter.ErrorLevel.ERROR,
123                     String.format(
124                             "An error occurred while parsing turtle content within script node: %s",
125                             Arrays.toString( DomUtils.getXPathListForNode(n) )
126                     ),
127                     rdfpe.getLineNumber(), rdfpe.getColumnNumber()
128             );
129         } catch (Exception e) {
130             er.notifyError(ErrorReporter.ErrorLevel.ERROR, "An error occurred while processing RDF data.", -1 , -1);
131         }
132     }
133 
134 }