1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.html;
18
19 import org.apache.commons.lang.StringUtils;
20 import org.deri.any23.extractor.ExtractionException;
21 import org.deri.any23.extractor.ExtractionResult;
22 import org.deri.any23.extractor.ExtractorDescription;
23 import org.deri.any23.extractor.ExtractorFactory;
24 import org.deri.any23.extractor.SimpleExtractorFactory;
25 import org.deri.any23.extractor.TagSoupExtractionResult;
26 import org.deri.any23.extractor.html.annotations.Includes;
27 import org.deri.any23.rdf.PopularPrefixes;
28 import org.deri.any23.vocab.VCARD;
29 import org.openrdf.model.BNode;
30 import org.openrdf.model.Resource;
31 import org.openrdf.model.URI;
32 import org.openrdf.model.vocabulary.RDF;
33 import org.w3c.dom.NamedNodeMap;
34 import org.w3c.dom.Node;
35
36 import java.util.ArrayList;
37 import java.util.Arrays;
38 import java.util.Collection;
39 import java.util.List;
40
41 import static org.deri.any23.extractor.html.HTMLDocument.TextField;
42
43
44
45
46
47
48
49
50 @Includes( extractors = AdrExtractor.class )
51 public class HCardExtractor extends EntityBasedMicroformatExtractor {
52
53 private static final VCARD vCARD = VCARD.getInstance();
54
55 private HCardName name = new HCardName();
56
57 private HTMLDocument fragment;
58
59 public final static ExtractorFactory<HCardExtractor> factory =
60 SimpleExtractorFactory.create(
61 "html-mf-hcard",
62 PopularPrefixes.createSubset("rdf", "vcard"),
63 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
64 null,
65 HCardExtractor.class
66 );
67
68 public ExtractorDescription getDescription() {
69 return factory;
70 }
71
72 @Override
73 protected String getBaseClassName() {
74 return "vcard";
75 }
76
77 @Override
78 protected void resetExtractor() {
79 name.reset();
80 }
81
82 private void fixIncludes(HTMLDocument document, Node node) {
83 NamedNodeMap attributes = node.getAttributes();
84
85 if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
86 String id = attributes.getNamedItem("headers").getNodeValue();
87 Node header = document.findNodeById(id);
88 if (null != header) {
89 node.appendChild(header.cloneNode(true));
90 attributes.removeNamedItem("headers");
91 }
92 }
93
94
95 for (Node current : document.findAll("//*[@class]")) {
96 if (!DomUtils.hasClassName(current, "include")) continue;
97
98
99 current.getAttributes().removeNamedItem("class");
100 ArrayList<TextField> res = new ArrayList<TextField>();
101 HTMLDocument.readUrlField(res, current);
102 TextField id = res.get(0);
103 if (null == id)
104 continue;
105 id = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
106 Node included = document.findNodeById(id.value());
107 if (null == included)
108 continue;
109 current.appendChild(included.cloneNode(true));
110 }
111 }
112
113 @Override
114 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
115 this.fragment = new HTMLDocument(node);
116 fixIncludes(getHTMLDocument(), node);
117 final BNode card = getBlankNodeFor(node);
118 boolean foundSomething = false;
119
120 readFn();
121 readNames();
122 readOrganization();
123 foundSomething |= addFn(card);
124 foundSomething |= addNames(card);
125 foundSomething |= addOrganizationName(card);
126 foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
127 foundSomething |= addUrl(card);
128 foundSomething |= addEmail(card);
129 foundSomething |= addPhoto(card);
130 foundSomething |= addLogo(card);
131 foundSomething |= addUid(card);
132 foundSomething |= addClass(card);
133 foundSomething |= addStringProperty("bday", card, vCARD.bday);
134 foundSomething |= addStringProperty("rev", card, vCARD.rev);
135 foundSomething |= addStringProperty("tz", card, vCARD.tz);
136 foundSomething |= addCategory(card);
137 foundSomething |= addStringProperty("card", card, vCARD.class_);
138 foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
139 foundSomething |= addTelephones(card);
140 foundSomething |= addStringProperty("title", card, vCARD.title);
141 foundSomething |= addStringProperty("role", card, vCARD.role);
142 foundSomething |= addStringMultiProperty("note", card, vCARD.note);
143 foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
144
145 if (!foundSomething) return false;
146 out.writeTriple(card, RDF.TYPE, vCARD.VCard);
147
148 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
149 tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
150
151 return true;
152 }
153
154 private boolean addTelephones(Resource card) {
155 boolean found = false;
156 for (Node node : fragment.findAll(".//*[contains(@class,'tel')]")) {
157 HTMLDocument telFragment = new HTMLDocument(node);
158 TextField[] values = telFragment.getPluralUrlField("value");
159 if (values.length == 0) {
160
161 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
162
163 if (typeAndValue.length > 1) {
164 found |= addTel(card, "tel", typeAndValue[1]);
165 } else {
166 found |= addTel(card, "tel", typeAndValue[0]);
167 }
168 } else {
169 final String[] valuesStr = new String[values.length];
170 for(int i = 0; i < values.length; i++) {
171 valuesStr[i] = values[i].value();
172 }
173 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
174 if (types.length == 0) {
175 found |= addTel(card, "tel", StringUtils.join(valuesStr));
176 }
177 for (HTMLDocument.TextField type : types) {
178 found |= addTel(card, type.value(), StringUtils.join(valuesStr));
179 }
180 }
181 }
182 return found;
183 }
184
185 private boolean addTel(Resource card, String type, String value) {
186 URI tel = super.fixLink(value, "tel");
187 URI composed = vCARD.getProperty(type + "Tel", null);
188 if (composed == null) {
189 URI simple = vCARD.getProperty(type, null);
190 if (simple == null) {
191 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
192 }
193 return conditionallyAddResourceProperty(card, simple, tel);
194 }
195 return conditionallyAddResourceProperty(card, composed, tel);
196 }
197
198 private boolean addSubMicroformat(String className, Resource resource, URI property) {
199 List<Node> nodes = fragment.findAllByClassName(className);
200 if (nodes.isEmpty()) return false;
201 for (Node node : nodes) {
202 addBNodeProperty(
203 node,
204 resource, property, getBlankNodeFor(node)
205 );
206 }
207 return true;
208 }
209
210 private boolean addStringProperty(String className, Resource resource, URI property) {
211 final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
212 return conditionallyAddStringProperty(
213 textField.source(),
214 resource, property, textField.value()
215 );
216 }
217
218
219
220
221
222
223
224
225
226 private boolean addStringMultiProperty(String className, Resource resource, URI property) {
227 HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
228 boolean found = false;
229 final String extractorName = getDescription().getExtractorName();
230 for(HTMLDocument.TextField field : fields) {
231 found |= conditionallyAddStringProperty(
232 field.source(),
233 resource, property, field.value()
234 );
235 }
236 return found;
237 }
238
239 private boolean addCategory(Resource card) {
240 HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
241 boolean found = false;
242 for (HTMLDocument.TextField category : categories) {
243 found |= conditionallyAddStringProperty(
244 category.source(),
245 card, vCARD.category, category.value()
246 );
247 }
248 return found;
249 }
250
251 private boolean addUid(Resource card) {
252 TextField uid = fragment.getSingularUrlField("uid");
253 return conditionallyAddStringProperty(
254 fragment.getDocument(),
255 card, vCARD.uid, uid.value()
256 );
257 }
258
259 private boolean addClass(Resource card) {
260 TextField class_ = fragment.getSingularUrlField("class");
261 return conditionallyAddStringProperty(
262 fragment.getDocument(),
263 card, vCARD.class_, class_.value()
264 );
265 }
266
267 private boolean addLogo(Resource card) throws ExtractionException {
268 TextField[] links = fragment.getPluralUrlField("logo");
269 boolean found = false;
270 for (TextField link : links) {
271 found |= conditionallyAddResourceProperty(
272 card, vCARD.logo, getHTMLDocument().resolveURI(link.value())
273 );
274 }
275 return found;
276 }
277
278 private boolean addPhoto(Resource card) throws ExtractionException {
279 TextField[] links = fragment.getPluralUrlField("photo");
280 boolean found = false;
281 for (TextField link : links) {
282 found |= conditionallyAddResourceProperty(
283 card, vCARD.photo, getHTMLDocument().resolveURI(link.value())
284 );
285 }
286 return found;
287 }
288
289 private boolean addEmail(Resource card) {
290 String email = dropSubject(fragment.getSingularUrlField("email").value());
291 return conditionallyAddResourceProperty(
292 card,
293 vCARD.email,
294 fixLink(email, "mailto")
295 );
296 }
297
298 private String dropSubject(String mail) {
299 if (mail == null) return null;
300 return mail.split("\\?")[0];
301 }
302
303 private void readNames() {
304 for (String field : HCardName.FIELDS) {
305 HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
306 for (HTMLDocument.TextField text : values) {
307 if ("".equals(text.value())) continue;
308 name.setField(field, text);
309 }
310 }
311 }
312
313 private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
314 conditionallyAddLiteralProperty(
315 n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
316 );
317 }
318
319 private boolean addNames(Resource card) {
320 BNode n = valueFactory.createBNode();
321 addBNodeProperty(
322 this.fragment.getDocument(),
323 card, vCARD.n, n
324 );
325 addURIProperty(n, RDF.TYPE, vCARD.Name);
326
327 for (String fieldName : HCardName.FIELDS) {
328 if (!name.containsField(fieldName)) {
329 continue;
330 }
331 if (name.isMultiField(fieldName)) {
332 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
333 for(TextField value : values) {
334 addFieldTriple(
335 value.source(),
336 n, fieldName, value.value()
337 );
338 }
339 } else {
340 TextField value = name.getField(fieldName);
341 if(value == null) { continue; }
342 addFieldTriple(
343 value.source(),
344 n, fieldName, value.value()
345 );
346 }
347 }
348 return true;
349 }
350
351 private void readFn() {
352 name.setFullName(fragment.getSingularTextField("fn"));
353 }
354
355 private boolean addFn(Resource card) {
356 final TextField fullNameTextField = name.getFullName();
357 if(fullNameTextField == null) {
358 return false;
359 }
360 return conditionallyAddStringProperty(
361 fullNameTextField.source(),
362 card, vCARD.fn, fullNameTextField.value()
363 );
364 }
365
366 private void readOrganization() {
367 Node node = fragment.findMicroformattedObjectNode("*", "org");
368 if (node == null) return;
369 HTMLDocument doc = new HTMLDocument(node);
370 String nodeText = doc.getText();
371 if(nodeText != null) {
372 name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
373 }
374 nodeText = doc.getSingularTextField("organization-name").value();
375 if(nodeText == null || "".equals(nodeText) ) {
376 nodeText = HTMLDocument.readTextField(node).value();
377 }
378 name.setOrganization( new TextField(nodeText, node) );
379
380 name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
381 }
382
383 private boolean addOrganizationName(Resource card) {
384 if (name.getOrganization() == null) return false;
385 BNode org = valueFactory.createBNode();
386 final String extractorName = getDescription().getExtractorName();
387 addBNodeProperty(
388 this.fragment.getDocument(),
389 card, vCARD.org, org
390 );
391 addURIProperty(org, RDF.TYPE, vCARD.Organization);
392 final TextField organizationTextField = name.getOrganization();
393 conditionallyAddLiteralProperty(
394 organizationTextField.source(),
395 org, vCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
396 );
397 final TextField organizationUnitTextField = name.getOrganizationUnit();
398 if(organizationUnitTextField != null) {
399 conditionallyAddStringProperty(
400 organizationUnitTextField.source(),
401 org, vCARD.organization_unit, organizationUnitTextField.value()
402 );
403 }
404 return true;
405 }
406
407 private boolean addUrl(Resource card) throws ExtractionException {
408 TextField[] links = fragment.getPluralUrlField("url");
409 boolean found = false;
410 for (TextField link : links) {
411 found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveURI(link.value()));
412 }
413 return found;
414 }
415
416 }