View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.microdata;
18  
19  import org.deri.any23.extractor.ErrorReporter;
20  import org.deri.any23.extractor.ExtractionContext;
21  import org.deri.any23.extractor.ExtractionException;
22  import org.deri.any23.extractor.ExtractionParameters;
23  import org.deri.any23.extractor.ExtractionResult;
24  import org.deri.any23.extractor.Extractor;
25  import org.deri.any23.extractor.ExtractorDescription;
26  import org.deri.any23.extractor.ExtractorFactory;
27  import org.deri.any23.extractor.SimpleExtractorFactory;
28  import org.deri.any23.extractor.html.DomUtils;
29  import org.deri.any23.rdf.PopularPrefixes;
30  import org.deri.any23.rdf.RDFUtils;
31  import org.deri.any23.vocab.DCTERMS;
32  import org.deri.any23.vocab.XHTML;
33  import org.openrdf.model.Literal;
34  import org.openrdf.model.Resource;
35  import org.openrdf.model.URI;
36  import org.openrdf.model.Value;
37  import org.openrdf.model.vocabulary.RDF;
38  import org.openrdf.model.vocabulary.XMLSchema;
39  import org.w3c.dom.Document;
40  import org.w3c.dom.Node;
41  import org.w3c.dom.NodeList;
42  
43  import java.io.IOException;
44  import java.net.MalformedURLException;
45  import java.net.URL;
46  import java.util.Arrays;
47  import java.util.Date;
48  import java.util.HashMap;
49  import java.util.HashSet;
50  import java.util.List;
51  import java.util.Map;
52  import java.util.Set;
53  
54  /**
55   * Default implementation of <a href="http://www.w3.org/TR/microdata/">Microdata</a> extractor,
56   * based on {@link TagSoupDOMExtractor}.
57   *
58   * @author Michele Mostarda (mostarda@fbk.eu)
59   * @author Davide Palmisano ( dpalmisano@gmail.com )
60   */
61  public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
62  
63      private static final URI MICRODATA_ITEM
64              = RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
65  
66      public final static ExtractorFactory<MicrodataExtractor> factory =
67              SimpleExtractorFactory.create(
68                      "html-microdata",
69                      PopularPrefixes.createSubset("rdf", "doac", "foaf"),
70                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
71                      null,
72                      MicrodataExtractor.class
73              );
74  
75      private String documentLanguage;
76  
77      private boolean isStrict;
78  
79      private String defaultNamespace;
80  
81      public ExtractorDescription getDescription() {
82          return factory;
83      }
84  
85      /**
86       * This extraction performs the
87       * <a href="http://www.w3.org/TR/microdata/#rdf">Microdata to RDF conversion algorithm</a>.
88       * A slight modification of the specification algorithm has been introduced
89       * to avoid performing actions 5.2.1, 5.2.2, 5.2.3, 5.2.4 if step 5.2.6 doesn't detect any
90       * Microdata.
91       */
92      public void run(
93              ExtractionParameters extractionParameters,
94              ExtractionContext extractionContext,
95              Document in,
96              ExtractionResult out
97      ) throws IOException, ExtractionException {
98  
99          final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
100         if(parserReport.getErrors().length > 0) {
101             notifyError(parserReport.getErrors(), out);
102         }
103         final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
104         if (itemScopes.length == 0) {
105             return;
106         }
107 
108         isStrict = extractionParameters.getFlag("any23.microdata.strict");
109         if (!isStrict) {
110             defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
111         }
112 
113         documentLanguage = getDocumentLanguage(in);
114 
115         /**
116          * 5.2.6
117          */
118         final URI documentURI = extractionContext.getDocumentURI();
119         final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
120         for (ItemScope itemScope : itemScopes) {
121             Resource subject = processType(itemScope, documentURI, out, mappings);
122             out.writeTriple(
123                     documentURI,
124                     MICRODATA_ITEM,
125                     subject
126             );
127         }
128 
129         /**
130          * 5.2.1
131          */
132         processTitle(in, documentURI, out);
133         /**
134          * 5.2.2
135          */
136         processHREFElements(in, documentURI, out);
137         /**
138          * 5.2.3
139          */
140         processMetaElements(in, documentURI, out);
141 
142         /**
143          * 5.2.4
144          */
145         processCiteElements(in, documentURI, out);
146     }
147 
148     /**
149      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
150      *
151      * @param in a instance of {@link Document}.
152      * @return the language declared, could be <code>null</code>.
153      */
154     private String getDocumentLanguage(Document in) {
155         String lang = DomUtils.find(in, "string(/HTML/@lang)");
156         if (lang.equals("")) {
157             return null;
158         }
159         return lang;
160     }
161 
162     /**
163      * Returns the {@link Node} language if declared, or the {@link Document} one
164      * if not defined.
165      *
166      * @param node a {@link Node} instance.
167      * @return the {@link Node} language or the {@link Document} one. Could be <code>null</code>
168      */
169     private String getLanguage(Node node) {
170         Node nodeLang = node.getAttributes().getNamedItem("lang");
171         if (nodeLang == null) {
172             // if the element does not specify a lang, use the document one
173             return documentLanguage;
174         }
175         return nodeLang.getTextContent();
176     }
177 
178     /**
179      * Implements step 5.2.1 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
180      * extraction algorithm.
181      *
182      * @param in          {@link Document} to be processed.
183      * @param documentURI Document current {@link URI}.
184      * @param out         a valid not <code>null</code> {@link ExtractionResult}
185      */
186     private void processTitle(Document in, URI documentURI, ExtractionResult out) {
187         NodeList titles = in.getElementsByTagName("title");
188         // just one title is allowed.
189         if (titles.getLength() == 1) {
190             Node title = titles.item(0);
191             String titleValue = title.getTextContent();
192             Literal object;
193             String lang = getLanguage(title);
194             if (lang == null) {
195                 // unable to decide the language, leave it unknown
196                 object = RDFUtils.literal(titleValue);
197             } else {
198                 object = RDFUtils.literal(titleValue, lang);
199             }
200             out.writeTriple(
201                     documentURI,
202                     DCTERMS.getInstance().title,
203                     object
204             );
205         }
206     }
207 
208     /**
209      * Implements step 5.2.2 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
210      * extraction algorithm.
211      *
212      * @param in          {@link Document} to be processed.
213      * @param documentURI Document current {@link URI}.
214      * @param out         a valid not <code>null</code> {@link ExtractionResult}
215      */
216     private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
217         NodeList anchors = in.getElementsByTagName("a");
218         for (int i = 0; i < anchors.getLength(); i++) {
219             processHREFElement(anchors.item(i), documentURI, out);
220         }
221         NodeList areas = in.getElementsByTagName("area");
222         for (int i = 0; i < areas.getLength(); i++) {
223             processHREFElement(areas.item(i), documentURI, out);
224         }
225         NodeList links = in.getElementsByTagName("link");
226         for (int i = 0; i < links.getLength(); i++) {
227             processHREFElement(links.item(i), documentURI, out);
228         }
229     }
230 
231     /**
232      * Implements sub-step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
233      * extraction algorithm.
234      *
235      * @param item        {@link Node} to be processed.
236      * @param documentURI Document current {@link URI}.
237      * @param out         a valid not <code>null</code> {@link ExtractionResult}
238      */
239     private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
240         Node rel = item.getAttributes().getNamedItem("rel");
241         if (rel == null) {
242             return;
243         }
244         Node href = item.getAttributes().getNamedItem("href");
245         if (href == null) {
246             return;
247         }
248         URL absoluteURL;
249         if (!isAbsoluteURL(href.getTextContent())) {
250             try {
251                 absoluteURL = toAbsoluteURL(
252                         documentURI.toString(),
253                         href.getTextContent(),
254                         '/'
255                 );
256             } catch (MalformedURLException e) {
257                 // okay, it's not an absolute URL, return
258                 return;
259             }
260         } else {
261             try {
262                 absoluteURL = new URL(href.getTextContent());
263             } catch (MalformedURLException e) {
264                 // cannot happen
265                 return;
266             }
267         }
268         String[] relTokens = rel.getTextContent().split(" ");
269         Set<String> tokensWithNoDuplicates = new HashSet<String>();
270         for (String relToken : relTokens) {
271             if (relToken.contains(":")) {
272                 // if contain semi-colon, skip
273                 continue;
274             }
275             if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
276                 tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
277                 continue;
278             }
279             tokensWithNoDuplicates.add(relToken.toLowerCase());
280         }
281         for (String token : tokensWithNoDuplicates) {
282             URI predicate;
283             if (isAbsoluteURL(token)) {
284                 predicate = RDFUtils.uri(token);
285             } else {
286                 predicate = RDFUtils.uri(XHTML.NS + token);
287             }
288             out.writeTriple(
289                     documentURI,
290                     predicate,
291                     RDFUtils.uri(absoluteURL.toString())
292             );
293         }
294     }
295 
296     /**
297      * Implements step 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
298      * extraction algorithm.
299      *
300      * @param in          {@link Document} to be processed.
301      * @param documentURI Document current {@link URI}.
302      * @param out         a valid not <code>null</code> {@link ExtractionResult}
303      */
304     private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
305         NodeList metas = in.getElementsByTagName("meta");
306         for (int i = 0; i < metas.getLength(); i++) {
307             Node meta = metas.item(i);
308             String name    = DomUtils.readAttribute(meta, "name"   , null);
309             String content = DomUtils.readAttribute(meta, "content", null);
310             if (name != null && content != null) {
311                 if (isAbsoluteURL(name)) {
312                     processMetaElement(
313                             RDFUtils.uri(name),
314                             content,
315                             getLanguage(meta),
316                             documentURI,
317                             out
318                     );
319                 } else {
320                     processMetaElement(
321                             name,
322                             content,
323                             getLanguage(meta),
324                             documentURI,
325                             out
326                     );
327                 }
328             }
329         }
330     }
331 
332     /**
333      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
334      * extraction algorithm.
335      *
336      * @param uri
337      * @param content
338      * @param language
339      * @param documentURI
340      * @param out
341      */
342     private void processMetaElement(
343             URI uri,
344             String content,
345             String language,
346             URI documentURI,
347             ExtractionResult out
348     ) {
349         if (content.contains(":")) {
350             // if it contains U+003A COLON, exit
351             return;
352         }
353         Literal subject;
354         if (language == null) {
355             // ok, we don't know the language
356             subject = RDFUtils.literal(content);
357         } else {
358             subject = RDFUtils.literal(content, language);
359         }
360         out.writeTriple(
361                 documentURI,
362                 uri,
363                 subject
364         );
365     }
366 
367     /**
368      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
369      * extraction algorithm.
370      *
371      * @param name
372      * @param content
373      * @param language
374      * @param documentURI
375      * @param out
376      */
377     private void processMetaElement(
378             String name,
379             String content,
380             String language,
381             URI documentURI,
382             ExtractionResult out) {
383         Literal subject;
384         if (language == null) {
385             // ok, we don't know the language
386             subject = RDFUtils.literal(content);
387         } else {
388             subject = RDFUtils.literal(content, language);
389         }
390         out.writeTriple(
391                 documentURI,
392                 RDFUtils.uri(XHTML.NS + name.toLowerCase()),
393                 subject
394         );
395     }
396 
397     /**
398      * Implements sub step for 5.2.4 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
399      * extraction algorithm.
400      *
401      * @param in
402      * @param documentURI
403      * @param out
404      */
405     private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
406         NodeList blockQuotes = in.getElementsByTagName("blockquote");
407         for (int i = 0; i < blockQuotes.getLength(); i++) {
408             processCiteElement(blockQuotes.item(i), documentURI, out);
409         }
410         NodeList quotes = in.getElementsByTagName("q");
411         for (int i = 0; i < quotes.getLength(); i++) {
412             processCiteElement(quotes.item(i), documentURI, out);
413         }
414     }
415 
416     private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
417         if (item.getAttributes().getNamedItem("cite") != null) {
418             out.writeTriple(
419                     documentURI,
420                     DCTERMS.getInstance().source,
421                     RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
422             );
423         }
424     }
425 
426     /**
427      * Recursive method implementing 5.2.6.1 "generate the triple for the item" of
428      * <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
429      * extraction algorithm.
430      *
431      * @param itemScope
432      * @param documentURI
433      * @param out
434      * @param mappings
435      * @return
436      * @throws ExtractionException
437      */
438     private Resource processType(
439             ItemScope itemScope,
440             URI documentURI, ExtractionResult out,
441             Map<ItemScope, Resource> mappings
442     ) throws ExtractionException {
443         Resource subject;
444         if (mappings.containsKey(itemScope)) {
445             subject = mappings.get(itemScope);
446         } else if (isAbsoluteURL(itemScope.getItemId())) {
447             subject = RDFUtils.uri(itemScope.getItemId());
448         } else {
449             subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
450         }
451         mappings.put(itemScope, subject);
452 
453         // ItemScope.type could be null, but surely it's a valid URL
454         String itemScopeType = "";
455         if (itemScope.getType() != null) {
456             String itemType;
457             itemType = itemScope.getType().toString();
458             out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
459             itemScopeType = itemScope.getType().toString();
460         }
461         for (String propName : itemScope.getProperties().keySet()) {
462             List<ItemProp> itemProps = itemScope.getProperties().get(propName);
463             for (ItemProp itemProp : itemProps) {
464                 try {
465                     processProperty(
466                             subject,
467                             propName,
468                             itemProp,
469                             itemScopeType,
470                             documentURI,
471                             mappings,
472                             out
473                     );
474                 } catch (MalformedURLException e) {
475                     throw new ExtractionException(
476                             "Error while processing on subject '" + subject +
477                                     "' the itemProp: '" + itemProp + "' "
478                     );
479                 }
480             }
481         }
482         return subject;
483     }
484 
485     private void processProperty(
486             Resource subject,
487             String propName,
488             ItemProp itemProp,
489             String itemScopeType,
490             URI documentURI,
491             Map<ItemScope, Resource> mappings,
492             ExtractionResult out
493     ) throws MalformedURLException, ExtractionException {
494         URI predicate;
495         if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
496             return;
497         } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
498             predicate = RDFUtils.uri(
499                     toAbsoluteURL(
500                             defaultNamespace,
501                             propName,
502                             '/'
503                     ).toString()
504             );
505         } else {
506             predicate = RDFUtils.uri(
507                     toAbsoluteURL(
508                             itemScopeType,
509                             propName,
510                             '/'
511                     ).toString());
512         }
513         Value value;
514         Object propValue = itemProp.getValue().getContent();
515         ItemPropValue.Type propType = itemProp.getValue().getType();
516         if (propType.equals(ItemPropValue.Type.Nested)) {
517             value = processType((ItemScope) propValue, documentURI, out, mappings);
518         } else if (propType.equals(ItemPropValue.Type.Plain)) {
519             value = RDFUtils.literal((String) propValue, documentLanguage);
520         } else if (propType.equals(ItemPropValue.Type.Link)) {
521             value = RDFUtils.uri(
522                     toAbsoluteURL(
523                             documentURI.toString(),
524                             (String) propValue,
525                             '/'
526                     ).toString()
527             );
528         } else if (propType.equals(ItemPropValue.Type.Date)) {
529             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
530         } else {
531             throw new RuntimeException("Invalid Type '" +
532                     propType + "' for ItemPropValue with name: '" + propName + "'");
533         }
534         out.writeTriple(subject, predicate, value);
535     }
536 
537     private boolean isAbsoluteURL(String urlString) {
538         boolean result = false;
539         try {
540             URL url = new URL(urlString);
541             String protocol = url.getProtocol();
542             if (protocol != null && protocol.trim().length() > 0)
543                 result = true;
544         } catch (MalformedURLException e) {
545             return false;
546         }
547         return result;
548     }
549 
550     private URL toAbsoluteURL(String ns, String part, char trailing)
551             throws MalformedURLException {
552         if (isAbsoluteURL(part)) {
553             return new URL(part);
554         }
555         char lastChar = ns.charAt(ns.length() - 1);
556         if (lastChar == '#' || lastChar == '/')
557             return new URL(ns + part);
558         return new URL(ns + trailing + part);
559     }
560 
561     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
562         for(MicrodataParserException mpe : errors) {
563             out.notifyError(
564                     ErrorReporter.ErrorLevel.ERROR,
565                     mpe.toJSON(),
566                     mpe.getErrorLocationBeginRow() ,
567                     mpe.getErrorLocationBeginCol()
568             );
569         }
570     }
571 
572 }