View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.html;
18  
19  import org.deri.any23.extractor.ExtractionException;
20  import org.deri.any23.extractor.ExtractorDescription;
21  import org.deri.any23.extractor.ExtractorFactory;
22  import org.deri.any23.extractor.SimpleExtractorFactory;
23  import org.deri.any23.extractor.TagSoupExtractionResult;
24  import org.deri.any23.rdf.PopularPrefixes;
25  import org.deri.any23.rdf.RDFUtils;
26  import org.deri.any23.vocab.ICAL;
27  import org.openrdf.model.BNode;
28  import org.openrdf.model.Resource;
29  import org.openrdf.model.URI;
30  import org.openrdf.model.vocabulary.RDF;
31  import org.w3c.dom.Node;
32  
33  import javax.xml.datatype.DatatypeConfigurationException;
34  import java.text.ParseException;
35  import java.util.Arrays;
36  import java.util.List;
37  
38  import static org.deri.any23.extractor.html.HTMLDocument.TextField;
39  
40  
41  /**
42   * Extractor for the <a href="http://microformats.org/wiki/hcalendar">hCalendar</a>
43   * microformat.
44   *
45   * @author Gabriele Renzi
46   */
47  public class HCalendarExtractor extends MicroformatExtractor {
48  
49      private static final ICAL vICAL = ICAL.getInstance();
50  
51      public final static ExtractorFactory<HCalendarExtractor> factory =
52              SimpleExtractorFactory.create(
53                      "html-mf-hcalendar",
54                      PopularPrefixes.createSubset("rdf", "ical"),
55                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
56                      null,
57                      HCalendarExtractor.class);
58  
59      private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"};
60  
61      private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
62  
63      private String[] textSingularProps = {
64              "summary",
65              "class",
66              "transp",
67              "description",
68              "status",
69              "location"};
70  
71      private String[] textDateProps = {
72              "dtstart",
73              "dtstamp",
74              "dtend",
75      };
76  
77      public ExtractorDescription getDescription() {
78          return factory;
79      }
80  
81      @Override
82      protected boolean extract() throws ExtractionException {
83          final HTMLDocument document = getHTMLDocument();
84          List<Node> calendars = document.findAllByClassName("vcalendar");
85          if (calendars.size() == 0)
86              // vcal allows to avoid top name, in which case whole document is
87              // the calendar, let's try
88              if (document.findAllByClassName("vevent").size() > 0)
89                  calendars.add(document.getDocument());
90  
91          boolean foundAny = false;
92          for (Node node : calendars)
93              foundAny |= extractCalendar(node);
94  
95          return foundAny;
96      }
97  
98      private boolean extractCalendar(Node node) throws ExtractionException {
99          URI cal = getDocumentURI();
100         addURIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
101         return addComponents(node, cal);
102     }
103 
104     private boolean addComponents(Node node, Resource cal) throws ExtractionException {
105         boolean foundAny = false;
106         for (String component : Components) {
107             List<Node> events = DomUtils.findAllByClassName(node, component);
108             if (events.size() == 0)
109                 continue;
110             for (Node evtNode : events)
111                 foundAny |= extractComponent(evtNode, cal, component);
112         }
113         return foundAny;
114     }
115 
116     private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
117         HTMLDocument compoNode = new HTMLDocument(node);
118         BNode evt = valueFactory.createBNode();
119         addURIProperty(evt, RDF.TYPE, vICAL.getResource(component));
120         addTextProps(compoNode, evt);
121         addUrl(compoNode, evt);
122         addRRule(compoNode, evt);
123         addOrganizer(compoNode, evt);
124         addUid(compoNode, evt);
125         addBNodeProperty(cal, vICAL.component, evt);
126 
127         final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
128         tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() );
129 
130         return true;
131     }
132 
133     private void addUid(HTMLDocument compoNode, Resource evt) {
134         TextField url = compoNode.getSingularUrlField("uid");
135         conditionallyAddStringProperty(
136                 compoNode.getDocument(),
137                 evt, vICAL.uid, url.value()
138         );
139     }
140 
141     private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
142         TextField url = compoNode.getSingularUrlField("url");
143         if ("".equals(url.value())) return;
144         addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
145     }
146 
147     private void addRRule(HTMLDocument compoNode, Resource evt) {
148         for (Node rule : compoNode.findAllByClassName("rrule")) {
149             BNode rrule = valueFactory.createBNode();
150             addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
151             TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
152             conditionallyAddStringProperty(
153                     freq.source(),
154                     rrule, vICAL.freq, freq.value()
155             );
156             addBNodeProperty(
157                     rule,
158                     evt, vICAL.rrule, rrule
159             );
160         }
161     }
162 
163     private void addOrganizer(HTMLDocument compoNode, Resource evt) {
164         for (Node organizer : compoNode.findAllByClassName("organizer")) {
165             //untyped
166             BNode blank = valueFactory.createBNode();
167             TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
168             conditionallyAddStringProperty(
169                     compoNode.getDocument(),
170                     blank, vICAL.calAddress, mail.value()
171             );
172             addBNodeProperty(
173                     organizer,
174                     evt, vICAL.organizer, blank
175             );
176         }
177     }
178 
179     private void addTextProps(HTMLDocument node, Resource evt) {
180         for (String date : textSingularProps) {
181             HTMLDocument.TextField val = node.getSingularTextField(date);
182             conditionallyAddStringProperty(
183                     val.source(),
184                     evt, vICAL.getProperty(date), val.value()
185             );
186         }
187 
188         for (String date : textDateProps) {
189             HTMLDocument.TextField val = node.getSingularTextField(date);
190             try {
191                 conditionallyAddStringProperty(
192                         val.source(),
193                         evt,
194                         vICAL.getProperty(date),
195                         RDFUtils.getXSDDate(
196                                 val.value(),
197                                 DATE_FORMAT
198                         )
199                 );
200             } catch (ParseException e) {
201                 // Unparsable date format just leave it as it is.
202                 conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value());
203             } catch (DatatypeConfigurationException e) {
204                 // Unparsable date format just leave it as it is
205                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
206             }
207         }
208 
209         HTMLDocument.TextField[] values = node.getPluralTextField("category");
210         for (TextField val : values) {
211             conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
212         }
213     }
214 
215 }