View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.rdfa;
18  
19  import org.deri.any23.configuration.DefaultConfiguration;
20  import org.deri.any23.extractor.ExtractionContext;
21  import org.deri.any23.extractor.ExtractionException;
22  import org.deri.any23.extractor.ExtractionParameters;
23  import org.deri.any23.extractor.ExtractionResult;
24  import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
25  import org.deri.any23.extractor.ExtractorDescription;
26  import org.deri.any23.extractor.ExtractorFactory;
27  import org.deri.any23.extractor.SimpleExtractorFactory;
28  import org.deri.any23.extractor.rdf.RDFParserFactory;
29  import org.openrdf.rio.RDFHandlerException;
30  import org.openrdf.rio.RDFParseException;
31  import org.openrdf.rio.RDFParser;
32  import org.w3c.dom.Document;
33  
34  import java.io.IOException;
35  import java.io.InputStream;
36  import java.io.StringReader;
37  import java.io.StringWriter;
38  import java.util.Arrays;
39  
40  /**
41   * Extractor for RDFa in HTML, based on Fabien Gadon's XSLT transform, found
42   * <a href="http://ns.inria.fr/grddl/rdfa/">here</a>. It works by first
43   * parsing the HTML using a tagsoup parser, then applies the XSLT to the
44   * DOM tree, then parses the resulting RDF/XML.
45   *
46   * @author Gabriele Renzi
47   * @author Richard Cyganiak (richard@cyganiak.de)
48   */
49  public class RDFaExtractor implements TagSoupDOMExtractor {
50  
51      public final static String NAME = "html-rdfa";
52  
53      public final static String xsltFilename =
54              DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");
55  
56      private static XSLTStylesheet xslt = null;
57  
58      public final static ExtractorFactory<RDFaExtractor> factory =
59          SimpleExtractorFactory.create(
60                  NAME,
61                  null,
62                  Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
63                  null,
64                  RDFaExtractor.class
65          );
66  
67      /**
68       * Returns a {@link org.deri.any23.extractor.rdfa.XSLTStylesheet} able to distill RDFa from
69       * HTML pages.
70       *
71       * @return returns a not <code>null</code> XSLT instance.
72       */
73      public static synchronized XSLTStylesheet getXSLT() {
74          // Lazily initialized static instance, so we don't parse
75          // the XSLT unless really necessary, and only once
76          if (xslt == null) {
77              InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
78              if (in == null) {
79                  throw new RuntimeException("Couldn't load '" + xsltFilename +
80                          "', maybe the file is not bundled in the jar?");
81              }
82              xslt = new XSLTStylesheet(in);
83          }
84          return xslt;
85      }
86  
87      private boolean verifyDataType;
88  
89      private boolean stopAtFirstError;
90  
91      /**
92       * Constructor, allows to specify the validation and error handling policies.
93       *
94       * @param verifyDataType if <code>true</code> the data types will be verified,
95       *         if <code>false</code> will be ignored.
96       * @param stopAtFirstError if <code>true</code> the parser will stop at first parsing error,
97       *        if <code>false</code> will ignore non blocking errors.
98       */
99      public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
100         this.verifyDataType   = verifyDataType;
101         this.stopAtFirstError = stopAtFirstError;
102     }
103 
104     /**
105      * Default constructor, with no verification of data types and not stop at first error.
106      */    
107     public RDFaExtractor() {
108         this(false, false);
109     }
110 
111     public boolean isVerifyDataType() {
112         return verifyDataType;
113     }
114 
115     public void setVerifyDataType(boolean verifyDataType) {
116         this.verifyDataType = verifyDataType;
117     }
118 
119     public boolean isStopAtFirstError() {
120         return stopAtFirstError;
121     }
122 
123     public void setStopAtFirstError(boolean stopAtFirstError) {
124         this.stopAtFirstError = stopAtFirstError;
125     }
126 
127     public void run(
128             ExtractionParameters extractionParameters,
129             ExtractionContext extractionContext,
130             Document in,
131             ExtractionResult out
132     ) throws IOException, ExtractionException {
133 
134         StringWriter buffer = new StringWriter();
135         try {
136             getXSLT().applyTo(in, buffer);
137         } catch (XSLTStylesheetException xslte) {
138             throw new ExtractionException("An error occurred during the XSLT application.", xslte);
139         }
140 
141         try {
142             RDFParser parser
143                     = RDFParserFactory.getInstance().getRDFXMLParser(
144                         verifyDataType, stopAtFirstError, extractionContext, out
145                     );
146             parser.parse(
147                     new StringReader(buffer.getBuffer().toString()),
148                     extractionContext.getDocumentURI().stringValue()
149             );
150         } catch (RDFHandlerException ex) {
151             throw new IllegalStateException(
152                     "Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
153             );
154         } catch (RDFParseException ex) {
155             throw new ExtractionException(
156                     "Invalid RDF/XML produced by RDFa transform.", ex, out
157             );
158         }
159     }
160 
161     private String getDocType(Document in) {
162         return in.getDoctype().getPublicId();
163     }
164 
165     /**
166      * @return the {@link org.deri.any23.extractor.ExtractorDescription} of this extractor
167      */
168     public ExtractorDescription getDescription() {
169         return factory;
170     }
171 
172 }