View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.html;
18  
19  import org.deri.any23.extractor.ExtractionException;
20  import org.deri.any23.extractor.ExtractionResult;
21  import org.deri.any23.extractor.ExtractorDescription;
22  import org.deri.any23.extractor.ExtractorFactory;
23  import org.deri.any23.extractor.SimpleExtractorFactory;
24  import org.deri.any23.extractor.TagSoupExtractionResult;
25  import org.deri.any23.rdf.PopularPrefixes;
26  import org.deri.any23.vocab.DCTERMS;
27  import org.deri.any23.vocab.REVIEW;
28  import org.deri.any23.vocab.VCARD;
29  import org.openrdf.model.BNode;
30  import org.openrdf.model.Resource;
31  import org.openrdf.model.vocabulary.RDF;
32  import org.w3c.dom.Node;
33  
34  import java.util.Arrays;
35  import java.util.List;
36  
37  import static org.deri.any23.extractor.html.HTMLDocument.TextField;
38  
39  /**
40   * Extractor for the <a href="http://microformats.org/wiki/hreview">hReview</a>
41   * microformat.
42   *
43   * @author Gabriele Renzi
44   */
45  public class HReviewExtractor extends EntityBasedMicroformatExtractor {
46  
47      private static final REVIEW  vREVIEW  = REVIEW.getInstance();
48      private static final VCARD   vVCARD   = VCARD.getInstance();
49      private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
50  
51      public final static ExtractorFactory<HReviewExtractor> factory =
52              SimpleExtractorFactory.create(
53                      "html-mf-hreview",
54                      PopularPrefixes.createSubset("rdf", "vcard", "rev"),
55                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
56                      null,
57                      HReviewExtractor.class
58              );
59  
60      public ExtractorDescription getDescription() {
61          return factory;
62      }
63  
64      protected String getBaseClassName() {
65          return "hreview";
66      }
67  
68      @Override
69      protected void resetExtractor() {
70          // Empty.
71      }
72  
73      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
74          BNode rev = getBlankNodeFor(node);
75          out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
76          final HTMLDocument fragment = new HTMLDocument(node);
77          addRating(fragment, rev);
78          addSummary(fragment, rev);
79          addTime(fragment, rev);
80          addType(fragment, rev);
81          addDescription(fragment, rev);
82          addItem(fragment, rev);
83          addReviewer(fragment, rev);
84  
85          final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
86          tser.addResourceRoot(
87                  DomUtils.getXPathListForNode(node),
88                  rev,
89                  this.getClass()
90          );
91  
92          return true;
93      }
94  
95      private void addType(HTMLDocument doc, Resource rev) {
96          TextField value = doc.getSingularTextField("type");
97          conditionallyAddStringProperty(
98                  value.source(),
99                  rev, vREVIEW.type, value.value()
100         );
101     }
102 
103     private void addReviewer(HTMLDocument doc, Resource rev) {
104         List<Node> nodes = doc.findAllByClassName("reviewer");
105         if (nodes.size() > 0) {
106             Node node0 = nodes.get(0);
107             addBNodeProperty(
108                     node0,
109                     rev, vREVIEW.reviewer, getBlankNodeFor(node0)
110             );
111         }
112     }
113 
114     private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
115         List<Node> nodes = root.findAllByClassName("item");
116         for (Node node : nodes) {
117             Resource item = findDummy(new HTMLDocument(node));
118             addBNodeProperty(
119                     node,
120                     item, vREVIEW.hasReview, rev
121             );
122         }
123     }
124 
125     private Resource findDummy(HTMLDocument item) throws ExtractionException {
126         Resource blank = getBlankNodeFor(item.getDocument());
127         TextField val = item.getSingularTextField("fn");
128         conditionallyAddStringProperty(
129                 val.source(),
130                 blank, vVCARD.fn, val.value()
131         );
132         final TextField url = item.getSingularUrlField("url");
133         conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveURI(url.value()));
134         TextField pics[] = item.getPluralUrlField("photo");
135         for (TextField pic : pics) {
136             addURIProperty(blank, vVCARD.photo, getHTMLDocument().resolveURI(pic.value()));
137         }
138         return blank;
139     }
140 
141     private void addRating(HTMLDocument doc, Resource rev) {
142         HTMLDocument.TextField value = doc.getSingularTextField("rating");
143         conditionallyAddStringProperty(
144                 value.source(), rev, vREVIEW.rating, value.value()
145         );
146     }
147 
148     private void addSummary(HTMLDocument doc, Resource rev) {
149         TextField value = doc.getSingularTextField("summary");
150         conditionallyAddStringProperty(
151                 value.source(),
152                 rev, vREVIEW.title, value.value()
153         );
154     }
155 
156     private void addTime(HTMLDocument doc, Resource rev) {
157         TextField value = doc.getSingularTextField("dtreviewed");
158         conditionallyAddStringProperty(
159                 value.source(),
160                 rev, vDCTERMS.date, value.value()
161         );
162     }
163 
164     private void addDescription(HTMLDocument doc, Resource rev) {
165         TextField value = doc.getSingularTextField("description");
166         conditionallyAddStringProperty(
167                 value.source(),
168                 rev, vREVIEW.text, value.value()
169         );
170     }
171 
172 }