1 package org.deri.any23.extractor.html;
2
3 import org.deri.any23.extractor.ExtractionException;
4 import org.deri.any23.extractor.ExtractionResult;
5 import org.deri.any23.extractor.ExtractorDescription;
6 import org.deri.any23.extractor.ExtractorFactory;
7 import org.deri.any23.extractor.SimpleExtractorFactory;
8 import org.deri.any23.extractor.TagSoupExtractionResult;
9 import org.deri.any23.rdf.PopularPrefixes;
10 import org.deri.any23.vocab.WO;
11 import org.openrdf.model.BNode;
12 import org.openrdf.model.Resource;
13 import org.openrdf.model.URI;
14 import org.openrdf.model.vocabulary.RDF;
15 import org.w3c.dom.Node;
16
17 import java.util.Arrays;
18
19
20
21
22
23
24
25
26
27 public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
28
29 private static final WO vWO = WO.getInstance();
30
31 private static final String[] classes = {
32 "kingdom",
33 "division",
34 "phylum",
35 "order",
36 "family",
37 "genus",
38 "species",
39 "class",
40 };
41
42 public final static ExtractorFactory<SpeciesExtractor> factory =
43 SimpleExtractorFactory.create(
44 "html-mf-species",
45 PopularPrefixes.createSubset("rdf", "wo"),
46 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
47 null,
48 SpeciesExtractor.class
49 );
50
51
52
53
54
55
56 @Override
57 public ExtractorDescription getDescription() {
58 return factory;
59 }
60
61
62
63
64
65
66 @Override
67 protected String getBaseClassName() {
68 return "biota";
69 }
70
71
72
73
74 @Override
75 protected void resetExtractor() {
76
77 }
78
79
80
81
82
83
84
85
86
87
88 @Override
89 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
90 BNode biota = getBlankNodeFor(node);
91 conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
92
93 final HTMLDocument fragment = new HTMLDocument(node);
94 addNames(fragment, biota);
95 addClasses(fragment, biota);
96
97 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
98 tser.addResourceRoot(
99 DomUtils.getXPathListForNode(node),
100 biota,
101 this.getClass()
102 );
103
104 return true;
105 }
106
107 private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
108 HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
109 conditionallyAddStringProperty(
110 binomial.source(), biota, vWO.scientificName, binomial.value()
111 );
112 HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
113 conditionallyAddStringProperty(
114 vernacular.source(), biota, vWO.speciesName, vernacular.value()
115 );
116 }
117
118 private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
119 for (String clazz : classes) {
120 HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
121 conditionallyAddStringProperty(
122 classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
123 }
124 }
125
126 private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
127 for(String clazz : classes) {
128 HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
129 if(classTextField.source() != null) {
130 BNode classBNode = getBlankNodeFor(classTextField.source());
131 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
132 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
133 HTMLDocument fragment = new HTMLDocument(classTextField.source());
134 addClassesName(fragment, classBNode);
135 }
136 }
137 }
138
139 private URI resolvePropertyName(String clazz) {
140 return vWO.getProperty(
141 String.format(
142 "%sName",
143 clazz
144 )
145 );
146 }
147
148 private URI resolveClassName(String clazz) {
149 String upperCaseClass = clazz.substring(0, 1);
150 return vWO.getResource(
151 String.format("%s%s",
152 upperCaseClass.toUpperCase(),
153 clazz.substring(1)
154 )
155 );
156 }
157 }