View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.html;
18  
19  import org.deri.any23.extractor.ExtractionResult;
20  import org.deri.any23.extractor.ExtractorDescription;
21  import org.deri.any23.extractor.ExtractorFactory;
22  import org.deri.any23.extractor.SimpleExtractorFactory;
23  import org.deri.any23.extractor.TagSoupExtractionResult;
24  import org.deri.any23.rdf.PopularPrefixes;
25  import org.deri.any23.vocab.VCARD;
26  import org.openrdf.model.BNode;
27  import org.openrdf.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  
30  import java.util.Arrays;
31  
32  import static org.deri.any23.extractor.html.HTMLDocument.TextField;
33  
34  
35  /**
36   * Extractor for the <a href="http://microformats.org/wiki/geo">Geo</a>
37   * microformat.
38   *
39   * @author Gabriele Renzi
40   */
41  public class GeoExtractor extends EntityBasedMicroformatExtractor {
42  
43      private static final VCARD vVCARD = VCARD.getInstance();
44  
45      public static final ExtractorFactory<GeoExtractor> factory =
46              SimpleExtractorFactory.create(
47                  "html-mf-geo",
48                  PopularPrefixes.createSubset("rdf", "vcard"),
49                  Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
50                  null,
51                  GeoExtractor.class
52              );
53  
54      public ExtractorDescription getDescription() {
55          return factory;
56      }
57  
58      protected String getBaseClassName() {
59          return "geo";
60      }
61  
62      @Override
63      protected void resetExtractor() {
64          // Empty.
65      }
66  
67      protected boolean extractEntity(Node node, ExtractionResult out) {
68          if (null == node) return false;
69          //try lat & lon
70          final HTMLDocument document = new HTMLDocument(node);
71          HTMLDocument.TextField latNode = document.getSingularTextField("latitude" );
72          TextField lonNode = document.getSingularTextField("longitude");
73          String lat = latNode.value();
74          String lon = lonNode.value();
75          if ("".equals(lat) || "".equals(lon)) {
76              String[] both = document.getSingularUrlField("geo").value().split(";");
77              if (both.length != 2) return false;
78              lat = both[0];
79              lon = both[1];
80          }
81          BNode geo = getBlankNodeFor(node);
82          out.writeTriple(geo, RDF.TYPE, vVCARD.Location);
83          final String extractorName = getDescription().getExtractorName();
84          conditionallyAddStringProperty(
85                  latNode.source(),
86                  geo, vVCARD.latitude , lat
87          );
88          conditionallyAddStringProperty(
89                  lonNode.source(),
90                  geo, vVCARD.longitude, lon
91          );
92  
93          final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
94          tser.addResourceRoot( document.getPathToLocalRoot(), geo, this.getClass() );
95  
96          return true;
97      }
98      
99  }