1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.html;
18
19 import org.deri.any23.extractor.ExtractionException;
20 import org.deri.any23.extractor.ExtractionResult;
21 import org.deri.any23.extractor.ExtractorDescription;
22 import org.deri.any23.extractor.ExtractorFactory;
23 import org.deri.any23.extractor.SimpleExtractorFactory;
24 import org.deri.any23.extractor.TagSoupExtractionResult;
25 import org.deri.any23.rdf.PopularPrefixes;
26 import org.deri.any23.vocab.DCTERMS;
27 import org.deri.any23.vocab.REVIEW;
28 import org.deri.any23.vocab.VCARD;
29 import org.openrdf.model.BNode;
30 import org.openrdf.model.Resource;
31 import org.openrdf.model.vocabulary.RDF;
32 import org.w3c.dom.Node;
33
34 import java.util.Arrays;
35 import java.util.List;
36
37 import static org.deri.any23.extractor.html.HTMLDocument.TextField;
38
39
40
41
42
43
44
45 public class HReviewExtractor extends EntityBasedMicroformatExtractor {
46
47 private static final REVIEW vREVIEW = REVIEW.getInstance();
48 private static final VCARD vVCARD = VCARD.getInstance();
49 private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
50
51 public final static ExtractorFactory<HReviewExtractor> factory =
52 SimpleExtractorFactory.create(
53 "html-mf-hreview",
54 PopularPrefixes.createSubset("rdf", "vcard", "rev"),
55 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
56 null,
57 HReviewExtractor.class
58 );
59
60 public ExtractorDescription getDescription() {
61 return factory;
62 }
63
64 protected String getBaseClassName() {
65 return "hreview";
66 }
67
68 @Override
69 protected void resetExtractor() {
70
71 }
72
73 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
74 BNode rev = getBlankNodeFor(node);
75 out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
76 final HTMLDocument fragment = new HTMLDocument(node);
77 addRating(fragment, rev);
78 addSummary(fragment, rev);
79 addTime(fragment, rev);
80 addType(fragment, rev);
81 addDescription(fragment, rev);
82 addItem(fragment, rev);
83 addReviewer(fragment, rev);
84
85 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
86 tser.addResourceRoot(
87 DomUtils.getXPathListForNode(node),
88 rev,
89 this.getClass()
90 );
91
92 return true;
93 }
94
95 private void addType(HTMLDocument doc, Resource rev) {
96 TextField value = doc.getSingularTextField("type");
97 conditionallyAddStringProperty(
98 value.source(),
99 rev, vREVIEW.type, value.value()
100 );
101 }
102
103 private void addReviewer(HTMLDocument doc, Resource rev) {
104 List<Node> nodes = doc.findAllByClassName("reviewer");
105 if (nodes.size() > 0) {
106 Node node0 = nodes.get(0);
107 addBNodeProperty(
108 node0,
109 rev, vREVIEW.reviewer, getBlankNodeFor(node0)
110 );
111 }
112 }
113
114 private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
115 List<Node> nodes = root.findAllByClassName("item");
116 for (Node node : nodes) {
117 Resource item = findDummy(new HTMLDocument(node));
118 addBNodeProperty(
119 node,
120 item, vREVIEW.hasReview, rev
121 );
122 }
123 }
124
125 private Resource findDummy(HTMLDocument item) throws ExtractionException {
126 Resource blank = getBlankNodeFor(item.getDocument());
127 TextField val = item.getSingularTextField("fn");
128 conditionallyAddStringProperty(
129 val.source(),
130 blank, vVCARD.fn, val.value()
131 );
132 final TextField url = item.getSingularUrlField("url");
133 conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveURI(url.value()));
134 TextField pics[] = item.getPluralUrlField("photo");
135 for (TextField pic : pics) {
136 addURIProperty(blank, vVCARD.photo, getHTMLDocument().resolveURI(pic.value()));
137 }
138 return blank;
139 }
140
141 private void addRating(HTMLDocument doc, Resource rev) {
142 HTMLDocument.TextField value = doc.getSingularTextField("rating");
143 conditionallyAddStringProperty(
144 value.source(), rev, vREVIEW.rating, value.value()
145 );
146 }
147
148 private void addSummary(HTMLDocument doc, Resource rev) {
149 TextField value = doc.getSingularTextField("summary");
150 conditionallyAddStringProperty(
151 value.source(),
152 rev, vREVIEW.title, value.value()
153 );
154 }
155
156 private void addTime(HTMLDocument doc, Resource rev) {
157 TextField value = doc.getSingularTextField("dtreviewed");
158 conditionallyAddStringProperty(
159 value.source(),
160 rev, vDCTERMS.date, value.value()
161 );
162 }
163
164 private void addDescription(HTMLDocument doc, Resource rev) {
165 TextField value = doc.getSingularTextField("description");
166 conditionallyAddStringProperty(
167 value.source(),
168 rev, vREVIEW.text, value.value()
169 );
170 }
171
172 }