1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.html;
18
19 import org.deri.any23.extractor.ExtractionException;
20 import org.deri.any23.extractor.ExtractionResult;
21 import org.deri.any23.extractor.ExtractorDescription;
22 import org.deri.any23.extractor.ExtractorFactory;
23 import org.deri.any23.extractor.SimpleExtractorFactory;
24 import org.deri.any23.extractor.TagSoupExtractionResult;
25 import org.deri.any23.rdf.PopularPrefixes;
26 import org.deri.any23.vocab.FOAF;
27 import org.deri.any23.vocab.HLISTING;
28 import org.openrdf.model.BNode;
29 import org.openrdf.model.Resource;
30 import org.openrdf.model.URI;
31 import org.openrdf.model.vocabulary.RDF;
32 import org.w3c.dom.Node;
33
34 import java.util.ArrayList;
35 import java.util.Arrays;
36 import java.util.HashSet;
37 import java.util.List;
38 import java.util.Set;
39
40 import static org.deri.any23.extractor.html.HTMLDocument.TextField;
41
42
43
44
45
46
47
48
49 public class HListingExtractor extends EntityBasedMicroformatExtractor {
50
51 private static final HLISTING hLISTING = HLISTING.getInstance();
52 private static final FOAF foaf = FOAF.getInstance();
53
54 private static final Set<String> ActionClasses = new HashSet<String>() {
55 {
56 add("sell" );
57 add("rent" );
58 add("trade" );
59 add("meet" );
60 add("announce");
61 add("offer" );
62 add("wanted" );
63 add("event" );
64 add("service" );
65 }
66 };
67
68 private static final List<String> validClassesForAddress = Arrays.asList(
69 "post-office-box",
70 "extended-address",
71 "street-address",
72 "locality",
73 "region",
74 "postal-code",
75 "country-name"
76 );
77
78 private HTMLDocument fragment;
79
80 public final static ExtractorFactory<HListingExtractor> factory =
81 SimpleExtractorFactory.create(
82 "html-mf-hlisting",
83 PopularPrefixes.createSubset("rdf", "hlisting"),
84 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
85 null,
86 HListingExtractor.class
87 );
88
89 public ExtractorDescription getDescription() {
90 return factory;
91 }
92
93 protected String getBaseClassName() {
94 return "hlisting";
95 }
96
97 @Override
98 protected void resetExtractor() {
99
100 }
101
102 @Override
103 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
104 this.fragment = new HTMLDocument(node);
105 BNode listing = getBlankNodeFor(node);
106 out.writeTriple(listing, RDF.TYPE, hLISTING.Listing);
107
108 for (String action : findActions(fragment)) {
109 out.writeTriple(listing, hLISTING.action, hLISTING.getResource(action));
110 }
111 out.writeTriple(listing, hLISTING.lister, addLister() );
112 addItem(listing);
113 addDateTimes(listing);
114 addPrice(listing);
115 addDescription(listing);
116 addSummary(listing);
117 addPermalink(listing);
118
119 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
120 tser.addResourceRoot(
121 DomUtils.getXPathListForNode(node),
122 listing,
123 this.getClass()
124 );
125
126 return true;
127 }
128
129 private void addItem(Resource listing) throws ExtractionException {
130 Node node = fragment.findMicroformattedObjectNode("*", "item");
131 if (null == node) return;
132 BNode blankItem = valueFactory.createBNode();
133 addBNodeProperty(
134 node,
135 listing, hLISTING.item, blankItem
136 );
137 addURIProperty(blankItem, RDF.TYPE, hLISTING.Item);
138
139 HTMLDocument item = new HTMLDocument(node);
140
141 addItemName(item, blankItem);
142 addItemUrl(item, blankItem);
143
144 addItemPhoto(fragment, blankItem);
145 addItemAddresses(fragment, blankItem);
146 }
147
148 private void addItemAddresses(HTMLDocument doc, Resource blankItem) {
149 final String extractorName = getDescription().getExtractorName();
150 for (Node node : doc.findAll(".//*[contains(@class,'adr')]//*[@class]")) {
151 String[] klasses = node.getAttributes().getNamedItem("class").getNodeValue().split("\\s+");
152 for (String klass : klasses)
153 if (validClassesForAddress.contains(klass)) {
154 String value = node.getNodeValue();
155
156 if (!(null == value || "".equals(value))) {
157 URI property = hLISTING.getPropertyCamelized(klass);
158 conditionallyAddLiteralProperty(
159 node,
160 blankItem, property, valueFactory.createLiteral(value)
161 );
162 }
163 }
164 }
165 }
166
167 private void addPermalink(Resource listing) {
168 String link = fragment.find(".//A[contains(@rel,'self') and contains(@rel,'bookmark')]/@href");
169 conditionallyAddStringProperty(
170 fragment.getDocument(),
171 listing, hLISTING.permalink, link
172 );
173 }
174
175 private void addPrice(Resource listing) {
176 TextField price = fragment.getSingularTextField("price");
177 conditionallyAddStringProperty(
178 price.source(),
179 listing, hLISTING.price, price.value()
180 );
181 }
182
183 private void addDescription(Resource listing) {
184 TextField description = fragment.getSingularTextField("description");
185 conditionallyAddStringProperty(
186 description.source(),
187 listing, hLISTING.description, description.value()
188 );
189 }
190
191 private void addSummary(Resource listing) {
192 TextField summary = fragment.getSingularTextField("summary");
193 conditionallyAddStringProperty(
194 summary.source(),
195 listing, hLISTING.summary, summary.value()
196 );
197 }
198
199 private void addDateTimes(Resource listing) {
200 TextField listed = fragment.getSingularTextField("dtlisted");
201 conditionallyAddStringProperty(
202 listed.source(),
203 listing, hLISTING.dtlisted, listed.value()
204 );
205 HTMLDocument.TextField expired = fragment.getSingularTextField("dtexpired");
206 conditionallyAddStringProperty(
207 expired.source(),
208 listing, hLISTING.dtexpired, expired.value()
209 );
210 }
211
212 private Resource addLister() throws ExtractionException {
213 Resource blankLister = valueFactory.createBNode();
214 addURIProperty(blankLister, RDF.TYPE, hLISTING.Lister);
215 Node node = fragment.findMicroformattedObjectNode("*", "lister");
216 if (null == node)
217 return blankLister;
218 HTMLDocument listerNode = new HTMLDocument(node);
219 addListerFn(listerNode, blankLister);
220 addListerOrg(listerNode, blankLister);
221 addListerEmail(listerNode, blankLister);
222 addListerUrl(listerNode, blankLister);
223 addListerTel(listerNode, blankLister);
224 addListerLogo(listerNode, blankLister);
225 return blankLister;
226 }
227
228 private void addListerTel(HTMLDocument doc, Resource blankLister) {
229 HTMLDocument.TextField tel = doc.getSingularTextField("tel");
230 conditionallyAddStringProperty(
231 tel.source(),
232 blankLister, hLISTING.tel, tel.value()
233 );
234 }
235
236 private void addListerUrl(HTMLDocument doc, Resource blankLister) throws ExtractionException {
237 TextField url = doc.getSingularUrlField("url");
238 conditionallyAddResourceProperty(blankLister, hLISTING.listerUrl, getHTMLDocument().resolveURI(url.value()));
239 }
240
241 private void addListerEmail(HTMLDocument doc, Resource blankLister) {
242 TextField email = doc.getSingularUrlField("email");
243 conditionallyAddResourceProperty(blankLister, foaf.mbox, fixLink(email.value(), "mailto"));
244 }
245
246 private void addListerFn(HTMLDocument doc, Resource blankLister) {
247 TextField fn = doc.getSingularTextField("fn");
248 conditionallyAddStringProperty(
249 fn.source(),
250 blankLister, hLISTING.listerName, fn.value()
251 );
252 }
253
254 private void addListerLogo(HTMLDocument doc, Resource blankLister) throws ExtractionException {
255 TextField logo = doc.getSingularUrlField("logo");
256 conditionallyAddResourceProperty(blankLister, hLISTING.listerLogo, getHTMLDocument().resolveURI(logo.value()));
257 }
258
259 private void addListerOrg(HTMLDocument doc, Resource blankLister) {
260 TextField org = doc.getSingularTextField("org");
261 conditionallyAddStringProperty(
262 org.source(),
263 blankLister, hLISTING.listerOrg, org.value()
264 );
265 }
266
267 private void addItemName(HTMLDocument item, Resource blankItem) {
268 HTMLDocument.TextField fn = item.getSingularTextField("fn");
269 conditionallyAddStringProperty(
270 fn.source(),
271 blankItem, hLISTING.itemName, fn.value()
272 );
273 }
274
275 private void addItemUrl(HTMLDocument item, Resource blankItem) throws ExtractionException {
276 TextField url = item.getSingularUrlField("url");
277 conditionallyAddResourceProperty(blankItem, hLISTING.itemUrl, getHTMLDocument().resolveURI(url.value()));
278 }
279
280 private void addItemPhoto(HTMLDocument doc, Resource blankLister) throws ExtractionException {
281
282 String url = doc.findMicroformattedValue("*", "item", "A", "photo", "@href");
283 conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
284 url = doc.findMicroformattedValue("*", "item", "IMG", "photo", "@src");
285 conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
286
287 url = doc.findMicroformattedValue("*", "photo", "IMG", "", "@src");
288 conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
289 }
290
291 private List<String> findActions(HTMLDocument doc) {
292 List<String> actions = new ArrayList<String>(0);
293
294 String[] classes = doc.readAttribute("class").split("\\s+");
295 for (String klass : classes) {
296 if (ActionClasses.contains(klass))
297 actions.add(klass);
298 }
299
300 for (Node action : doc.findAll("./*[@class]/@class")) {
301 for (String substring : action.getNodeValue().split("\\s+")) {
302 if (ActionClasses.contains(substring))
303 actions.add(substring);
304 }
305 }
306 return actions;
307 }
308
309 }