1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.microdata;
18
19 import org.deri.any23.extractor.ErrorReporter;
20 import org.deri.any23.extractor.ExtractionContext;
21 import org.deri.any23.extractor.ExtractionException;
22 import org.deri.any23.extractor.ExtractionParameters;
23 import org.deri.any23.extractor.ExtractionResult;
24 import org.deri.any23.extractor.Extractor;
25 import org.deri.any23.extractor.ExtractorDescription;
26 import org.deri.any23.extractor.ExtractorFactory;
27 import org.deri.any23.extractor.SimpleExtractorFactory;
28 import org.deri.any23.extractor.html.DomUtils;
29 import org.deri.any23.rdf.PopularPrefixes;
30 import org.deri.any23.rdf.RDFUtils;
31 import org.deri.any23.vocab.DCTERMS;
32 import org.deri.any23.vocab.XHTML;
33 import org.openrdf.model.Literal;
34 import org.openrdf.model.Resource;
35 import org.openrdf.model.URI;
36 import org.openrdf.model.Value;
37 import org.openrdf.model.vocabulary.RDF;
38 import org.openrdf.model.vocabulary.XMLSchema;
39 import org.w3c.dom.Document;
40 import org.w3c.dom.Node;
41 import org.w3c.dom.NodeList;
42
43 import java.io.IOException;
44 import java.net.MalformedURLException;
45 import java.net.URL;
46 import java.util.Arrays;
47 import java.util.Date;
48 import java.util.HashMap;
49 import java.util.HashSet;
50 import java.util.List;
51 import java.util.Map;
52 import java.util.Set;
53
54
55
56
57
58
59
60
61 public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
62
63 private static final URI MICRODATA_ITEM
64 = RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
65
66 public final static ExtractorFactory<MicrodataExtractor> factory =
67 SimpleExtractorFactory.create(
68 "html-microdata",
69 PopularPrefixes.createSubset("rdf", "doac", "foaf"),
70 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
71 null,
72 MicrodataExtractor.class
73 );
74
75 private String documentLanguage;
76
77 private boolean isStrict;
78
79 private String defaultNamespace;
80
81 public ExtractorDescription getDescription() {
82 return factory;
83 }
84
85
86
87
88
89
90
91
92 public void run(
93 ExtractionParameters extractionParameters,
94 ExtractionContext extractionContext,
95 Document in,
96 ExtractionResult out
97 ) throws IOException, ExtractionException {
98
99 final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
100 if(parserReport.getErrors().length > 0) {
101 notifyError(parserReport.getErrors(), out);
102 }
103 final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
104 if (itemScopes.length == 0) {
105 return;
106 }
107
108 isStrict = extractionParameters.getFlag("any23.microdata.strict");
109 if (!isStrict) {
110 defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
111 }
112
113 documentLanguage = getDocumentLanguage(in);
114
115
116
117
118 final URI documentURI = extractionContext.getDocumentURI();
119 final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
120 for (ItemScope itemScope : itemScopes) {
121 Resource subject = processType(itemScope, documentURI, out, mappings);
122 out.writeTriple(
123 documentURI,
124 MICRODATA_ITEM,
125 subject
126 );
127 }
128
129
130
131
132 processTitle(in, documentURI, out);
133
134
135
136 processHREFElements(in, documentURI, out);
137
138
139
140 processMetaElements(in, documentURI, out);
141
142
143
144
145 processCiteElements(in, documentURI, out);
146 }
147
148
149
150
151
152
153
154 private String getDocumentLanguage(Document in) {
155 String lang = DomUtils.find(in, "string(/HTML/@lang)");
156 if (lang.equals("")) {
157 return null;
158 }
159 return lang;
160 }
161
162
163
164
165
166
167
168
169 private String getLanguage(Node node) {
170 Node nodeLang = node.getAttributes().getNamedItem("lang");
171 if (nodeLang == null) {
172
173 return documentLanguage;
174 }
175 return nodeLang.getTextContent();
176 }
177
178
179
180
181
182
183
184
185
186 private void processTitle(Document in, URI documentURI, ExtractionResult out) {
187 NodeList titles = in.getElementsByTagName("title");
188
189 if (titles.getLength() == 1) {
190 Node title = titles.item(0);
191 String titleValue = title.getTextContent();
192 Literal object;
193 String lang = getLanguage(title);
194 if (lang == null) {
195
196 object = RDFUtils.literal(titleValue);
197 } else {
198 object = RDFUtils.literal(titleValue, lang);
199 }
200 out.writeTriple(
201 documentURI,
202 DCTERMS.getInstance().title,
203 object
204 );
205 }
206 }
207
208
209
210
211
212
213
214
215
216 private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
217 NodeList anchors = in.getElementsByTagName("a");
218 for (int i = 0; i < anchors.getLength(); i++) {
219 processHREFElement(anchors.item(i), documentURI, out);
220 }
221 NodeList areas = in.getElementsByTagName("area");
222 for (int i = 0; i < areas.getLength(); i++) {
223 processHREFElement(areas.item(i), documentURI, out);
224 }
225 NodeList links = in.getElementsByTagName("link");
226 for (int i = 0; i < links.getLength(); i++) {
227 processHREFElement(links.item(i), documentURI, out);
228 }
229 }
230
231
232
233
234
235
236
237
238
239 private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
240 Node rel = item.getAttributes().getNamedItem("rel");
241 if (rel == null) {
242 return;
243 }
244 Node href = item.getAttributes().getNamedItem("href");
245 if (href == null) {
246 return;
247 }
248 URL absoluteURL;
249 if (!isAbsoluteURL(href.getTextContent())) {
250 try {
251 absoluteURL = toAbsoluteURL(
252 documentURI.toString(),
253 href.getTextContent(),
254 '/'
255 );
256 } catch (MalformedURLException e) {
257
258 return;
259 }
260 } else {
261 try {
262 absoluteURL = new URL(href.getTextContent());
263 } catch (MalformedURLException e) {
264
265 return;
266 }
267 }
268 String[] relTokens = rel.getTextContent().split(" ");
269 Set<String> tokensWithNoDuplicates = new HashSet<String>();
270 for (String relToken : relTokens) {
271 if (relToken.contains(":")) {
272
273 continue;
274 }
275 if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
276 tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
277 continue;
278 }
279 tokensWithNoDuplicates.add(relToken.toLowerCase());
280 }
281 for (String token : tokensWithNoDuplicates) {
282 URI predicate;
283 if (isAbsoluteURL(token)) {
284 predicate = RDFUtils.uri(token);
285 } else {
286 predicate = RDFUtils.uri(XHTML.NS + token);
287 }
288 out.writeTriple(
289 documentURI,
290 predicate,
291 RDFUtils.uri(absoluteURL.toString())
292 );
293 }
294 }
295
296
297
298
299
300
301
302
303
304 private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
305 NodeList metas = in.getElementsByTagName("meta");
306 for (int i = 0; i < metas.getLength(); i++) {
307 Node meta = metas.item(i);
308 String name = DomUtils.readAttribute(meta, "name" , null);
309 String content = DomUtils.readAttribute(meta, "content", null);
310 if (name != null && content != null) {
311 if (isAbsoluteURL(name)) {
312 processMetaElement(
313 RDFUtils.uri(name),
314 content,
315 getLanguage(meta),
316 documentURI,
317 out
318 );
319 } else {
320 processMetaElement(
321 name,
322 content,
323 getLanguage(meta),
324 documentURI,
325 out
326 );
327 }
328 }
329 }
330 }
331
332
333
334
335
336
337
338
339
340
341
342 private void processMetaElement(
343 URI uri,
344 String content,
345 String language,
346 URI documentURI,
347 ExtractionResult out
348 ) {
349 if (content.contains(":")) {
350
351 return;
352 }
353 Literal subject;
354 if (language == null) {
355
356 subject = RDFUtils.literal(content);
357 } else {
358 subject = RDFUtils.literal(content, language);
359 }
360 out.writeTriple(
361 documentURI,
362 uri,
363 subject
364 );
365 }
366
367
368
369
370
371
372
373
374
375
376
377 private void processMetaElement(
378 String name,
379 String content,
380 String language,
381 URI documentURI,
382 ExtractionResult out) {
383 Literal subject;
384 if (language == null) {
385
386 subject = RDFUtils.literal(content);
387 } else {
388 subject = RDFUtils.literal(content, language);
389 }
390 out.writeTriple(
391 documentURI,
392 RDFUtils.uri(XHTML.NS + name.toLowerCase()),
393 subject
394 );
395 }
396
397
398
399
400
401
402
403
404
405 private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
406 NodeList blockQuotes = in.getElementsByTagName("blockquote");
407 for (int i = 0; i < blockQuotes.getLength(); i++) {
408 processCiteElement(blockQuotes.item(i), documentURI, out);
409 }
410 NodeList quotes = in.getElementsByTagName("q");
411 for (int i = 0; i < quotes.getLength(); i++) {
412 processCiteElement(quotes.item(i), documentURI, out);
413 }
414 }
415
416 private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
417 if (item.getAttributes().getNamedItem("cite") != null) {
418 out.writeTriple(
419 documentURI,
420 DCTERMS.getInstance().source,
421 RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
422 );
423 }
424 }
425
426
427
428
429
430
431
432
433
434
435
436
437
438 private Resource processType(
439 ItemScope itemScope,
440 URI documentURI, ExtractionResult out,
441 Map<ItemScope, Resource> mappings
442 ) throws ExtractionException {
443 Resource subject;
444 if (mappings.containsKey(itemScope)) {
445 subject = mappings.get(itemScope);
446 } else if (isAbsoluteURL(itemScope.getItemId())) {
447 subject = RDFUtils.uri(itemScope.getItemId());
448 } else {
449 subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
450 }
451 mappings.put(itemScope, subject);
452
453
454 String itemScopeType = "";
455 if (itemScope.getType() != null) {
456 String itemType;
457 itemType = itemScope.getType().toString();
458 out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
459 itemScopeType = itemScope.getType().toString();
460 }
461 for (String propName : itemScope.getProperties().keySet()) {
462 List<ItemProp> itemProps = itemScope.getProperties().get(propName);
463 for (ItemProp itemProp : itemProps) {
464 try {
465 processProperty(
466 subject,
467 propName,
468 itemProp,
469 itemScopeType,
470 documentURI,
471 mappings,
472 out
473 );
474 } catch (MalformedURLException e) {
475 throw new ExtractionException(
476 "Error while processing on subject '" + subject +
477 "' the itemProp: '" + itemProp + "' "
478 );
479 }
480 }
481 }
482 return subject;
483 }
484
485 private void processProperty(
486 Resource subject,
487 String propName,
488 ItemProp itemProp,
489 String itemScopeType,
490 URI documentURI,
491 Map<ItemScope, Resource> mappings,
492 ExtractionResult out
493 ) throws MalformedURLException, ExtractionException {
494 URI predicate;
495 if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
496 return;
497 } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
498 predicate = RDFUtils.uri(
499 toAbsoluteURL(
500 defaultNamespace,
501 propName,
502 '/'
503 ).toString()
504 );
505 } else {
506 predicate = RDFUtils.uri(
507 toAbsoluteURL(
508 itemScopeType,
509 propName,
510 '/'
511 ).toString());
512 }
513 Value value;
514 Object propValue = itemProp.getValue().getContent();
515 ItemPropValue.Type propType = itemProp.getValue().getType();
516 if (propType.equals(ItemPropValue.Type.Nested)) {
517 value = processType((ItemScope) propValue, documentURI, out, mappings);
518 } else if (propType.equals(ItemPropValue.Type.Plain)) {
519 value = RDFUtils.literal((String) propValue, documentLanguage);
520 } else if (propType.equals(ItemPropValue.Type.Link)) {
521 value = RDFUtils.uri(
522 toAbsoluteURL(
523 documentURI.toString(),
524 (String) propValue,
525 '/'
526 ).toString()
527 );
528 } else if (propType.equals(ItemPropValue.Type.Date)) {
529 value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
530 } else {
531 throw new RuntimeException("Invalid Type '" +
532 propType + "' for ItemPropValue with name: '" + propName + "'");
533 }
534 out.writeTriple(subject, predicate, value);
535 }
536
537 private boolean isAbsoluteURL(String urlString) {
538 boolean result = false;
539 try {
540 URL url = new URL(urlString);
541 String protocol = url.getProtocol();
542 if (protocol != null && protocol.trim().length() > 0)
543 result = true;
544 } catch (MalformedURLException e) {
545 return false;
546 }
547 return result;
548 }
549
550 private URL toAbsoluteURL(String ns, String part, char trailing)
551 throws MalformedURLException {
552 if (isAbsoluteURL(part)) {
553 return new URL(part);
554 }
555 char lastChar = ns.charAt(ns.length() - 1);
556 if (lastChar == '#' || lastChar == '/')
557 return new URL(ns + part);
558 return new URL(ns + trailing + part);
559 }
560
561 private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
562 for(MicrodataParserException mpe : errors) {
563 out.notifyError(
564 ErrorReporter.ErrorLevel.ERROR,
565 mpe.toJSON(),
566 mpe.getErrorLocationBeginRow() ,
567 mpe.getErrorLocationBeginCol()
568 );
569 }
570 }
571
572 }