View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.html;
18  
19  import org.apache.commons.lang.StringUtils;
20  import org.deri.any23.extractor.ExtractionException;
21  import org.deri.any23.extractor.ExtractionResult;
22  import org.deri.any23.extractor.ExtractorDescription;
23  import org.deri.any23.extractor.ExtractorFactory;
24  import org.deri.any23.extractor.SimpleExtractorFactory;
25  import org.deri.any23.extractor.TagSoupExtractionResult;
26  import org.deri.any23.extractor.html.annotations.Includes;
27  import org.deri.any23.rdf.PopularPrefixes;
28  import org.deri.any23.vocab.VCARD;
29  import org.openrdf.model.BNode;
30  import org.openrdf.model.Resource;
31  import org.openrdf.model.URI;
32  import org.openrdf.model.vocabulary.RDF;
33  import org.w3c.dom.NamedNodeMap;
34  import org.w3c.dom.Node;
35  
36  import java.util.ArrayList;
37  import java.util.Arrays;
38  import java.util.Collection;
39  import java.util.List;
40  
41  import static org.deri.any23.extractor.html.HTMLDocument.TextField;
42  
43  
44  /**
45   * Extractor for the <a href="http://microformats.org/wiki/hcard">hCard</a>
46   * microformat.
47   *
48   * @author Gabriele Renzi
49   */
50  @Includes( extractors = AdrExtractor.class )
51  public class HCardExtractor extends EntityBasedMicroformatExtractor {
52  
53      private static final VCARD vCARD = VCARD.getInstance();
54  
55      private HCardName name = new HCardName();
56      
57      private HTMLDocument fragment;
58  
59      public final static ExtractorFactory<HCardExtractor> factory =
60              SimpleExtractorFactory.create(
61                      "html-mf-hcard",
62                      PopularPrefixes.createSubset("rdf", "vcard"),
63                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
64                      null,
65                      HCardExtractor.class
66              );
67  
68      public ExtractorDescription getDescription() {
69          return factory;
70      }
71  
72      @Override
73      protected String getBaseClassName() {
74          return "vcard";
75      }
76  
77      @Override
78      protected void resetExtractor() {
79          name.reset(); // Cleanup of the HCardName content.
80      }
81  
82      private void fixIncludes(HTMLDocument document, Node node) {
83          NamedNodeMap attributes = node.getAttributes();
84          // header case test 32
85          if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
86              String id = attributes.getNamedItem("headers").getNodeValue();
87              Node header = document.findNodeById(id);
88              if (null != header) {
89                  node.appendChild(header.cloneNode(true));
90                  attributes.removeNamedItem("headers");
91              }
92          }
93          // include pattern, test 31
94  
95          for (Node current : document.findAll("//*[@class]")) {
96              if (!DomUtils.hasClassName(current, "include")) continue;
97              // we have to remove the field soon to avoid infinite loops
98              // no null check, we know it's there or we won't be in the loop
99              current.getAttributes().removeNamedItem("class");
100             ArrayList<TextField> res = new ArrayList<TextField>();
101             HTMLDocument.readUrlField(res, current);
102             TextField id = res.get(0);
103             if (null == id)
104                 continue;
105             id = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
106             Node included = document.findNodeById(id.value());
107             if (null == included)
108                 continue;
109             current.appendChild(included.cloneNode(true));
110         }
111     }
112 
113     @Override
114     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
115         this.fragment = new HTMLDocument(node);
116         fixIncludes(getHTMLDocument(), node);
117         final BNode card = getBlankNodeFor(node);
118         boolean foundSomething = false;
119 
120         readFn();
121         readNames();
122         readOrganization();
123         foundSomething |= addFn(card);
124         foundSomething |= addNames(card);
125         foundSomething |= addOrganizationName(card);
126         foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
127         foundSomething |= addUrl(card);
128         foundSomething |= addEmail(card);
129         foundSomething |= addPhoto(card);
130         foundSomething |= addLogo(card);
131         foundSomething |= addUid(card);
132         foundSomething |= addClass(card);
133         foundSomething |= addStringProperty("bday", card, vCARD.bday);
134         foundSomething |= addStringProperty("rev", card, vCARD.rev);
135         foundSomething |= addStringProperty("tz", card, vCARD.tz);
136         foundSomething |= addCategory(card);
137         foundSomething |= addStringProperty("card", card, vCARD.class_);
138         foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
139         foundSomething |= addTelephones(card);
140         foundSomething |= addStringProperty("title", card, vCARD.title);
141         foundSomething |= addStringProperty("role", card, vCARD.role);
142         foundSomething |= addStringMultiProperty("note", card, vCARD.note);
143         foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
144 
145         if (!foundSomething) return false;
146         out.writeTriple(card, RDF.TYPE, vCARD.VCard);
147 
148         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
149         tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
150 
151         return true;
152     }
153 
154     private boolean addTelephones(Resource card) {
155         boolean found = false;
156         for (Node node : fragment.findAll(".//*[contains(@class,'tel')]")) {
157             HTMLDocument telFragment = new HTMLDocument(node);
158             TextField[] values = telFragment.getPluralUrlField("value");
159             if (values.length == 0) {
160                 //no sub values
161                 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
162                 //modem:goo fax:foo tel:bar
163                 if (typeAndValue.length > 1) {
164                     found |= addTel(card, "tel", typeAndValue[1]);
165                 } else {
166                     found |= addTel(card, "tel", typeAndValue[0]);
167                 }
168             } else {
169                 final String[] valuesStr = new String[values.length];
170                 for(int i = 0; i < values.length; i++) {
171                     valuesStr[i] = values[i].value();
172                 }
173                 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
174                 if (types.length == 0) {
175                     found |= addTel(card, "tel", StringUtils.join(valuesStr));
176                 }
177                 for (HTMLDocument.TextField type : types) {
178                     found |= addTel(card, type.value(), StringUtils.join(valuesStr));
179                 }
180             }
181         }
182         return found;
183     }
184 
185     private boolean addTel(Resource card, String type, String value) {
186         URI tel = super.fixLink(value, "tel");
187         URI composed = vCARD.getProperty(type + "Tel", null);
188         if (composed == null) {
189             URI simple = vCARD.getProperty(type, null);
190             if (simple == null) {
191                 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
192             }
193             return conditionallyAddResourceProperty(card, simple, tel);
194         }
195         return conditionallyAddResourceProperty(card, composed, tel);
196     }
197 
198     private boolean addSubMicroformat(String className, Resource resource, URI property) {
199         List<Node> nodes = fragment.findAllByClassName(className);
200         if (nodes.isEmpty()) return false;
201         for (Node node : nodes) {
202             addBNodeProperty(
203                     node,
204                     resource, property, getBlankNodeFor(node)
205             );
206         }
207         return true;
208     }
209 
210     private boolean addStringProperty(String className, Resource resource, URI property) {
211         final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
212         return conditionallyAddStringProperty(
213                 textField.source(),
214                 resource, property, textField.value()
215         );
216     }
217 
218     /**
219      * Adds a property that can be associated to multiple values.
220      *
221      * @param className
222      * @param resource
223      * @param property
224      * @return <code>true</code> if the multi property has been added, <code>false</code> otherwise.
225      */
226     private boolean addStringMultiProperty(String className, Resource resource, URI property) {
227         HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
228         boolean found = false;
229         final String extractorName = getDescription().getExtractorName();
230         for(HTMLDocument.TextField field : fields) {
231             found |= conditionallyAddStringProperty(
232                     field.source(),
233                     resource, property, field.value()
234             );
235         }
236         return found;
237     }
238 
239     private boolean addCategory(Resource card) {
240         HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
241         boolean found = false;
242         for (HTMLDocument.TextField category : categories) {
243             found |= conditionallyAddStringProperty(
244                     category.source(),
245                     card, vCARD.category, category.value()
246             );
247         }
248         return found;
249     }
250 
251     private boolean addUid(Resource card) {
252         TextField uid = fragment.getSingularUrlField("uid");
253         return conditionallyAddStringProperty(
254                 fragment.getDocument(),
255                 card, vCARD.uid, uid.value()
256         );
257     }
258 
259     private boolean addClass(Resource card) {
260         TextField class_ = fragment.getSingularUrlField("class");
261         return conditionallyAddStringProperty(
262                 fragment.getDocument(),
263                 card, vCARD.class_, class_.value()
264         );
265     }
266 
267     private boolean addLogo(Resource card) throws ExtractionException {
268         TextField[] links = fragment.getPluralUrlField("logo");
269         boolean found = false;
270         for (TextField link : links) {
271             found |= conditionallyAddResourceProperty(
272                     card, vCARD.logo, getHTMLDocument().resolveURI(link.value())
273             );
274         }
275         return found;
276     }
277 
278     private boolean addPhoto(Resource card) throws ExtractionException {
279         TextField[] links = fragment.getPluralUrlField("photo");
280         boolean found = false;
281         for (TextField link : links) {
282             found |= conditionallyAddResourceProperty(
283                     card, vCARD.photo, getHTMLDocument().resolveURI(link.value())
284             );
285         }
286         return found;
287     }
288 
289     private boolean addEmail(Resource card) {
290         String email = dropSubject(fragment.getSingularUrlField("email").value());
291         return conditionallyAddResourceProperty(
292                 card,
293                 vCARD.email,
294                 fixLink(email, "mailto")
295         );
296     }
297 
298     private String dropSubject(String mail) {
299         if (mail == null) return null;
300         return mail.split("\\?")[0];
301     }
302 
303     private void readNames() {
304         for (String field : HCardName.FIELDS) {
305             HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
306             for (HTMLDocument.TextField text : values) {
307                 if ("".equals(text.value())) continue;
308                 name.setField(field, text);
309             }
310         }
311     }
312 
313     private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
314         conditionallyAddLiteralProperty(
315                 n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
316         );
317     }
318 
319     private boolean addNames(Resource card) {
320         BNode n = valueFactory.createBNode();
321         addBNodeProperty(
322                 this.fragment.getDocument(),
323                 card, vCARD.n, n
324         );
325         addURIProperty(n, RDF.TYPE, vCARD.Name);
326 
327         for (String fieldName : HCardName.FIELDS) {
328             if (!name.containsField(fieldName)) {
329                 continue;
330             }
331             if (name.isMultiField(fieldName)) {
332                 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
333                 for(TextField value : values) {
334                     addFieldTriple(
335                             value.source(),
336                             n, fieldName, value.value()
337                     );
338                 }
339             } else {
340                 TextField value =  name.getField(fieldName);
341                 if(value == null) { continue; }
342                 addFieldTriple(
343                         value.source(),
344                         n, fieldName, value.value()
345                 );
346             }
347         }
348         return true;
349     }
350 
351     private void readFn() {
352         name.setFullName(fragment.getSingularTextField("fn"));
353     }
354 
355     private boolean addFn(Resource card) {
356         final TextField fullNameTextField = name.getFullName();
357         if(fullNameTextField == null) {
358             return false;
359         }
360         return conditionallyAddStringProperty(
361                 fullNameTextField.source(),
362                 card, vCARD.fn, fullNameTextField.value()
363         );
364     }
365 
366     private void readOrganization() {
367         Node node = fragment.findMicroformattedObjectNode("*", "org");
368         if (node == null) return;
369         HTMLDocument doc = new HTMLDocument(node);
370         String nodeText = doc.getText();
371         if(nodeText != null) {
372             name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
373         }
374         nodeText = doc.getSingularTextField("organization-name").value();
375         if(nodeText == null || "".equals(nodeText) ) {
376             nodeText = HTMLDocument.readTextField(node).value();
377         }
378         name.setOrganization( new TextField(nodeText, node) );
379 
380         name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
381     }
382 
383     private boolean addOrganizationName(Resource card) {
384         if (name.getOrganization() == null) return false;
385         BNode org = valueFactory.createBNode();
386         final String extractorName =  getDescription().getExtractorName();
387         addBNodeProperty(
388                 this.fragment.getDocument(),
389                 card, vCARD.org, org
390         );
391         addURIProperty(org, RDF.TYPE, vCARD.Organization);
392         final TextField organizationTextField = name.getOrganization();
393         conditionallyAddLiteralProperty(
394                 organizationTextField.source(),
395                 org, vCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
396         );
397         final TextField organizationUnitTextField = name.getOrganizationUnit();
398         if(organizationUnitTextField != null) {
399             conditionallyAddStringProperty(
400                     organizationUnitTextField.source(),
401                     org, vCARD.organization_unit, organizationUnitTextField.value()
402             );
403         }
404         return true;
405     }
406 
407     private boolean addUrl(Resource card) throws ExtractionException {
408         TextField[] links = fragment.getPluralUrlField("url");
409         boolean found = false;
410         for (TextField link : links) {
411             found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveURI(link.value()));
412         }
413         return found;
414     }
415 
416 }