View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.html;
18  
19  import org.deri.any23.extractor.ExtractionException;
20  import org.deri.any23.extractor.ExtractionResult;
21  import org.deri.any23.extractor.ExtractorDescription;
22  import org.deri.any23.extractor.ExtractorFactory;
23  import org.deri.any23.extractor.SimpleExtractorFactory;
24  import org.deri.any23.extractor.TagSoupExtractionResult;
25  import org.deri.any23.rdf.PopularPrefixes;
26  import org.deri.any23.vocab.FOAF;
27  import org.deri.any23.vocab.HLISTING;
28  import org.openrdf.model.BNode;
29  import org.openrdf.model.Resource;
30  import org.openrdf.model.URI;
31  import org.openrdf.model.vocabulary.RDF;
32  import org.w3c.dom.Node;
33  
34  import java.util.ArrayList;
35  import java.util.Arrays;
36  import java.util.HashSet;
37  import java.util.List;
38  import java.util.Set;
39  
40  import static org.deri.any23.extractor.html.HTMLDocument.TextField;
41  
42  
43  /**
44   * Extractor for the <a href="http://microformats.org/wiki/hlisting">hListing</a>
45   * microformat.
46   *
47   * @author Gabriele Renzi
48   */
49  public class HListingExtractor extends EntityBasedMicroformatExtractor {
50  
51      private static final HLISTING hLISTING = HLISTING.getInstance();
52      private static final FOAF     foaf     = FOAF.getInstance();
53  
54      private static final Set<String> ActionClasses = new HashSet<String>() {
55          {
56              add("sell"    );
57              add("rent"    );
58              add("trade"   );
59              add("meet"    );
60              add("announce");
61              add("offer"   );
62              add("wanted"  );
63              add("event"   );
64              add("service" );
65          }
66      };
67  
68      private static final List<String> validClassesForAddress = Arrays.asList(
69              "post-office-box",
70              "extended-address",
71              "street-address",
72              "locality",
73              "region",
74              "postal-code",
75              "country-name"
76      );
77  
78      private HTMLDocument fragment;
79  
80      public final static ExtractorFactory<HListingExtractor> factory =
81              SimpleExtractorFactory.create(
82                      "html-mf-hlisting",
83                      PopularPrefixes.createSubset("rdf", "hlisting"),
84                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
85                      null,
86                      HListingExtractor.class
87              );
88  
89      public ExtractorDescription getDescription() {
90          return factory;
91      }
92  
93      protected String getBaseClassName() {
94          return "hlisting";
95      }
96  
97      @Override
98      protected void resetExtractor() {
99          // Empty.
100     }
101 
102     @Override
103     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
104         this.fragment = new HTMLDocument(node);
105         BNode listing = getBlankNodeFor(node);
106         out.writeTriple(listing, RDF.TYPE, hLISTING.Listing);
107 
108         for (String action : findActions(fragment)) {
109             out.writeTriple(listing, hLISTING.action, hLISTING.getResource(action));
110         }
111         out.writeTriple(listing, hLISTING.lister, addLister() );
112         addItem(listing);
113         addDateTimes(listing);
114         addPrice(listing);
115         addDescription(listing);
116         addSummary(listing);
117         addPermalink(listing);
118 
119         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
120         tser.addResourceRoot(
121                 DomUtils.getXPathListForNode(node),
122                 listing,
123                 this.getClass()
124         );
125 
126         return true;
127     }
128 
129     private void addItem(Resource listing) throws ExtractionException {
130         Node node = fragment.findMicroformattedObjectNode("*", "item");
131         if (null == node) return;
132         BNode blankItem = valueFactory.createBNode();
133         addBNodeProperty(
134                 node,
135                 listing, hLISTING.item, blankItem
136         );
137         addURIProperty(blankItem, RDF.TYPE, hLISTING.Item);
138 
139         HTMLDocument item = new HTMLDocument(node);
140 
141         addItemName(item, blankItem);
142         addItemUrl(item, blankItem);
143         // the format is specified with photo into item, but kelkoo has it into the top level
144         addItemPhoto(fragment, blankItem);
145         addItemAddresses(fragment, blankItem);
146     }
147 
148     private void addItemAddresses(HTMLDocument doc, Resource blankItem) {
149         final String extractorName = getDescription().getExtractorName();
150         for (Node node : doc.findAll(".//*[contains(@class,'adr')]//*[@class]")) {
151             String[] klasses = node.getAttributes().getNamedItem("class").getNodeValue().split("\\s+");
152             for (String klass : klasses)
153                 if (validClassesForAddress.contains(klass)) {
154                     String value = node.getNodeValue();
155                     // do not use conditionallyAdd, it won't work cause of evaluation rules
156                     if (!(null == value || "".equals(value))) {
157                         URI property = hLISTING.getPropertyCamelized(klass);
158                         conditionallyAddLiteralProperty(
159                                 node,
160                                 blankItem, property, valueFactory.createLiteral(value)
161                         );
162                     }
163                 }
164         }
165     }
166 
167     private void addPermalink(Resource listing) {
168         String link = fragment.find(".//A[contains(@rel,'self') and contains(@rel,'bookmark')]/@href");
169         conditionallyAddStringProperty(
170                 fragment.getDocument(),
171                 listing, hLISTING.permalink, link
172         );
173     }
174 
175     private void addPrice(Resource listing) {
176         TextField price = fragment.getSingularTextField("price");
177         conditionallyAddStringProperty(
178                 price.source(),
179                 listing, hLISTING.price, price.value()
180         );
181     }
182 
183     private void addDescription(Resource listing) {
184         TextField description = fragment.getSingularTextField("description");
185         conditionallyAddStringProperty(
186                 description.source(),
187                 listing, hLISTING.description, description.value()
188         );
189     }
190 
191     private void addSummary(Resource listing) {
192         TextField summary = fragment.getSingularTextField("summary");
193         conditionallyAddStringProperty(
194                 summary.source(),
195                 listing, hLISTING.summary, summary.value()
196         );
197     }
198 
199     private void addDateTimes(Resource listing) {
200         TextField listed = fragment.getSingularTextField("dtlisted");
201         conditionallyAddStringProperty(
202                 listed.source(),
203                 listing, hLISTING.dtlisted, listed.value()
204         );
205         HTMLDocument.TextField expired = fragment.getSingularTextField("dtexpired");
206         conditionallyAddStringProperty(
207                 expired.source(),
208                 listing, hLISTING.dtexpired, expired.value()
209         );
210     }
211 
212     private Resource addLister() throws ExtractionException {
213         Resource blankLister = valueFactory.createBNode();
214         addURIProperty(blankLister, RDF.TYPE, hLISTING.Lister);
215         Node node = fragment.findMicroformattedObjectNode("*", "lister");
216         if (null == node)
217             return blankLister;
218         HTMLDocument listerNode = new HTMLDocument(node);
219         addListerFn(listerNode, blankLister);
220         addListerOrg(listerNode, blankLister);
221         addListerEmail(listerNode, blankLister);
222         addListerUrl(listerNode, blankLister);
223         addListerTel(listerNode, blankLister);
224         addListerLogo(listerNode, blankLister);
225         return blankLister;
226     }
227 
228     private void addListerTel(HTMLDocument doc, Resource blankLister) {
229         HTMLDocument.TextField tel = doc.getSingularTextField("tel");
230         conditionallyAddStringProperty(
231                 tel.source(),
232                 blankLister, hLISTING.tel, tel.value()
233         );
234     }
235 
236     private void addListerUrl(HTMLDocument doc, Resource blankLister) throws ExtractionException {
237         TextField url = doc.getSingularUrlField("url");
238         conditionallyAddResourceProperty(blankLister, hLISTING.listerUrl, getHTMLDocument().resolveURI(url.value()));
239     }
240 
241     private void addListerEmail(HTMLDocument doc, Resource blankLister) {
242         TextField email = doc.getSingularUrlField("email");
243         conditionallyAddResourceProperty(blankLister, foaf.mbox, fixLink(email.value(), "mailto"));
244     }
245 
246     private void addListerFn(HTMLDocument doc, Resource blankLister) {
247         TextField fn = doc.getSingularTextField("fn");
248         conditionallyAddStringProperty(
249                 fn.source(),
250                 blankLister, hLISTING.listerName, fn.value()
251         );
252     }
253 
254     private void addListerLogo(HTMLDocument doc, Resource blankLister) throws ExtractionException {
255         TextField logo = doc.getSingularUrlField("logo");
256         conditionallyAddResourceProperty(blankLister, hLISTING.listerLogo, getHTMLDocument().resolveURI(logo.value()));
257     }
258 
259     private void addListerOrg(HTMLDocument doc, Resource blankLister) {
260         TextField org = doc.getSingularTextField("org");
261         conditionallyAddStringProperty(
262                 org.source(),
263                 blankLister, hLISTING.listerOrg, org.value()
264         );
265     }
266 
267     private void addItemName(HTMLDocument item, Resource blankItem) {
268         HTMLDocument.TextField fn = item.getSingularTextField("fn");
269         conditionallyAddStringProperty(
270                 fn.source(),
271                 blankItem, hLISTING.itemName, fn.value()
272         );
273     }
274 
275     private void addItemUrl(HTMLDocument item, Resource blankItem) throws ExtractionException {
276         TextField url = item.getSingularUrlField("url");
277         conditionallyAddResourceProperty(blankItem, hLISTING.itemUrl, getHTMLDocument().resolveURI(url.value()));
278     }
279 
280     private void addItemPhoto(HTMLDocument doc, Resource blankLister) throws ExtractionException {
281         // as per spec
282         String url = doc.findMicroformattedValue("*", "item", "A", "photo", "@href");
283         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
284         url = doc.findMicroformattedValue("*", "item", "IMG", "photo", "@src");
285         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
286         // as per kelkoo. Remember that contains(foo,'') is true in xpath
287         url = doc.findMicroformattedValue("*", "photo", "IMG", "", "@src");
288         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
289     }
290 
291     private List<String> findActions(HTMLDocument doc) {
292         List<String> actions = new ArrayList<String>(0);
293         // first check if values are inlined
294         String[] classes = doc.readAttribute("class").split("\\s+");
295         for (String klass : classes) {
296             if (ActionClasses.contains(klass))
297                 actions.add(klass);
298         }
299 
300         for (Node action : doc.findAll("./*[@class]/@class")) {
301             for (String substring : action.getNodeValue().split("\\s+")) {
302                 if (ActionClasses.contains(substring))
303                     actions.add(substring);
304             }
305         }
306         return actions;
307     }
308 
309 }