View Javadoc

1   package org.deri.any23.extractor.html;
2   
3   import org.deri.any23.extractor.ExtractionException;
4   import org.deri.any23.extractor.ExtractionResult;
5   import org.deri.any23.extractor.ExtractorDescription;
6   import org.deri.any23.extractor.ExtractorFactory;
7   import org.deri.any23.extractor.SimpleExtractorFactory;
8   import org.deri.any23.extractor.TagSoupExtractionResult;
9   import org.deri.any23.rdf.PopularPrefixes;
10  import org.deri.any23.vocab.WO;
11  import org.openrdf.model.BNode;
12  import org.openrdf.model.Resource;
13  import org.openrdf.model.URI;
14  import org.openrdf.model.vocabulary.RDF;
15  import org.w3c.dom.Node;
16  
17  import java.util.Arrays;
18  
19  /**
20   * Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
21   * The data are represented using the
22   * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
23   *
24   * @see org.deri.any23.vocab.WO
25   * @author Davide Palmisano (dpalmisano@gmail.com)
26   */
27  public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
28  
29      private static final WO vWO = WO.getInstance();
30  
31      private static final String[] classes = {
32              "kingdom",
33              "division",
34              "phylum",
35              "order",
36              "family",
37              "genus",
38              "species",
39              "class",
40      };
41  
42      public final static ExtractorFactory<SpeciesExtractor> factory =
43              SimpleExtractorFactory.create(
44                      "html-mf-species",
45                      PopularPrefixes.createSubset("rdf", "wo"),
46                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
47                      null,
48                      SpeciesExtractor.class
49              );
50  
51      /**
52       * Returns the description of this extractor.
53       *
54       * @return a human readable description.
55       */
56      @Override
57      public ExtractorDescription getDescription() {
58          return factory;
59      }
60  
61      /**
62       * Returns the base class name for the extractor.
63       *
64       * @return a string containing the base of the extractor.
65       */
66      @Override
67      protected String getBaseClassName() {
68          return "biota";
69      }
70  
71      /**
72       * Resets the internal status of the extractor to prepare it to a new extraction section.
73       */
74      @Override
75      protected void resetExtractor() {
76          // empty
77      }
78  
79      /**
80       * Extracts an entity from a <i>DOM</i> node.
81       *
82       * @param node the DOM node.
83       * @param out  the extraction result collector.
84       * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
85       * @throws org.deri.any23.extractor.ExtractionException
86       *
87       */
88      @Override
89      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
90          BNode biota = getBlankNodeFor(node);
91          conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
92  
93          final HTMLDocument fragment = new HTMLDocument(node);
94          addNames(fragment, biota);
95          addClasses(fragment, biota);
96  
97          final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
98          tser.addResourceRoot(
99                  DomUtils.getXPathListForNode(node),
100                 biota,
101                 this.getClass()
102         );
103 
104         return true;
105     }
106 
107     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
108         HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
109         conditionallyAddStringProperty(
110                 binomial.source(), biota, vWO.scientificName, binomial.value()
111         );
112         HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
113         conditionallyAddStringProperty(
114                 vernacular.source(), biota, vWO.speciesName, vernacular.value()
115         );
116     }
117 
118     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
119         for (String clazz : classes) {
120             HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
121             conditionallyAddStringProperty(
122                     classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
123         }
124     }
125 
126     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
127         for(String clazz : classes) {
128             HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
129             if(classTextField.source() != null) {
130                 BNode classBNode = getBlankNodeFor(classTextField.source());
131                 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
132                 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
133                 HTMLDocument fragment = new HTMLDocument(classTextField.source());
134                 addClassesName(fragment, classBNode);
135             }
136         }
137     }
138 
139     private URI resolvePropertyName(String clazz) {
140         return vWO.getProperty(
141                 String.format(
142                         "%sName",
143                         clazz
144                 )
145         );
146     }
147 
148     private URI resolveClassName(String clazz) {
149         String upperCaseClass = clazz.substring(0, 1);
150         return vWO.getResource(
151                 String.format("%s%s",
152                         upperCaseClass.toUpperCase(),
153                         clazz.substring(1)
154                 )
155         );
156     }
157 }