1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.rdfa;
18
19 import org.deri.any23.configuration.DefaultConfiguration;
20 import org.deri.any23.extractor.ExtractionContext;
21 import org.deri.any23.extractor.ExtractionException;
22 import org.deri.any23.extractor.ExtractionParameters;
23 import org.deri.any23.extractor.ExtractionResult;
24 import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
25 import org.deri.any23.extractor.ExtractorDescription;
26 import org.deri.any23.extractor.ExtractorFactory;
27 import org.deri.any23.extractor.SimpleExtractorFactory;
28 import org.deri.any23.extractor.rdf.RDFParserFactory;
29 import org.openrdf.rio.RDFHandlerException;
30 import org.openrdf.rio.RDFParseException;
31 import org.openrdf.rio.RDFParser;
32 import org.w3c.dom.Document;
33
34 import java.io.IOException;
35 import java.io.InputStream;
36 import java.io.StringReader;
37 import java.io.StringWriter;
38 import java.util.Arrays;
39
40
41
42
43
44
45
46
47
48
49 public class RDFaExtractor implements TagSoupDOMExtractor {
50
51 public final static String NAME = "html-rdfa";
52
53 public final static String xsltFilename =
54 DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");
55
56 private static XSLTStylesheet xslt = null;
57
58 public final static ExtractorFactory<RDFaExtractor> factory =
59 SimpleExtractorFactory.create(
60 NAME,
61 null,
62 Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
63 null,
64 RDFaExtractor.class
65 );
66
67
68
69
70
71
72
73 public static synchronized XSLTStylesheet getXSLT() {
74
75
76 if (xslt == null) {
77 InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
78 if (in == null) {
79 throw new RuntimeException("Couldn't load '" + xsltFilename +
80 "', maybe the file is not bundled in the jar?");
81 }
82 xslt = new XSLTStylesheet(in);
83 }
84 return xslt;
85 }
86
87 private boolean verifyDataType;
88
89 private boolean stopAtFirstError;
90
91
92
93
94
95
96
97
98
99 public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
100 this.verifyDataType = verifyDataType;
101 this.stopAtFirstError = stopAtFirstError;
102 }
103
104
105
106
107 public RDFaExtractor() {
108 this(false, false);
109 }
110
111 public boolean isVerifyDataType() {
112 return verifyDataType;
113 }
114
115 public void setVerifyDataType(boolean verifyDataType) {
116 this.verifyDataType = verifyDataType;
117 }
118
119 public boolean isStopAtFirstError() {
120 return stopAtFirstError;
121 }
122
123 public void setStopAtFirstError(boolean stopAtFirstError) {
124 this.stopAtFirstError = stopAtFirstError;
125 }
126
127 public void run(
128 ExtractionParameters extractionParameters,
129 ExtractionContext extractionContext,
130 Document in,
131 ExtractionResult out
132 ) throws IOException, ExtractionException {
133
134 StringWriter buffer = new StringWriter();
135 try {
136 getXSLT().applyTo(in, buffer);
137 } catch (XSLTStylesheetException xslte) {
138 throw new ExtractionException("An error occurred during the XSLT application.", xslte);
139 }
140
141 try {
142 RDFParser parser
143 = RDFParserFactory.getInstance().getRDFXMLParser(
144 verifyDataType, stopAtFirstError, extractionContext, out
145 );
146 parser.parse(
147 new StringReader(buffer.getBuffer().toString()),
148 extractionContext.getDocumentURI().stringValue()
149 );
150 } catch (RDFHandlerException ex) {
151 throw new IllegalStateException(
152 "Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
153 );
154 } catch (RDFParseException ex) {
155 throw new ExtractionException(
156 "Invalid RDF/XML produced by RDFa transform.", ex, out
157 );
158 }
159 }
160
161 private String getDocType(Document in) {
162 return in.getDoctype().getPublicId();
163 }
164
165
166
167
168 public ExtractorDescription getDescription() {
169 return factory;
170 }
171
172 }