1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.html;
18
19 import org.deri.any23.extractor.ExtractionException;
20 import org.deri.any23.extractor.ExtractorDescription;
21 import org.deri.any23.extractor.ExtractorFactory;
22 import org.deri.any23.extractor.SimpleExtractorFactory;
23 import org.deri.any23.extractor.TagSoupExtractionResult;
24 import org.deri.any23.rdf.PopularPrefixes;
25 import org.deri.any23.rdf.RDFUtils;
26 import org.deri.any23.vocab.ICAL;
27 import org.openrdf.model.BNode;
28 import org.openrdf.model.Resource;
29 import org.openrdf.model.URI;
30 import org.openrdf.model.vocabulary.RDF;
31 import org.w3c.dom.Node;
32
33 import javax.xml.datatype.DatatypeConfigurationException;
34 import java.text.ParseException;
35 import java.util.Arrays;
36 import java.util.List;
37
38 import static org.deri.any23.extractor.html.HTMLDocument.TextField;
39
40
41
42
43
44
45
46
47 public class HCalendarExtractor extends MicroformatExtractor {
48
49 private static final ICAL vICAL = ICAL.getInstance();
50
51 public final static ExtractorFactory<HCalendarExtractor> factory =
52 SimpleExtractorFactory.create(
53 "html-mf-hcalendar",
54 PopularPrefixes.createSubset("rdf", "ical"),
55 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
56 null,
57 HCalendarExtractor.class);
58
59 private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"};
60
61 private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
62
63 private String[] textSingularProps = {
64 "summary",
65 "class",
66 "transp",
67 "description",
68 "status",
69 "location"};
70
71 private String[] textDateProps = {
72 "dtstart",
73 "dtstamp",
74 "dtend",
75 };
76
77 public ExtractorDescription getDescription() {
78 return factory;
79 }
80
81 @Override
82 protected boolean extract() throws ExtractionException {
83 final HTMLDocument document = getHTMLDocument();
84 List<Node> calendars = document.findAllByClassName("vcalendar");
85 if (calendars.size() == 0)
86
87
88 if (document.findAllByClassName("vevent").size() > 0)
89 calendars.add(document.getDocument());
90
91 boolean foundAny = false;
92 for (Node node : calendars)
93 foundAny |= extractCalendar(node);
94
95 return foundAny;
96 }
97
98 private boolean extractCalendar(Node node) throws ExtractionException {
99 URI cal = getDocumentURI();
100 addURIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
101 return addComponents(node, cal);
102 }
103
104 private boolean addComponents(Node node, Resource cal) throws ExtractionException {
105 boolean foundAny = false;
106 for (String component : Components) {
107 List<Node> events = DomUtils.findAllByClassName(node, component);
108 if (events.size() == 0)
109 continue;
110 for (Node evtNode : events)
111 foundAny |= extractComponent(evtNode, cal, component);
112 }
113 return foundAny;
114 }
115
116 private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
117 HTMLDocument compoNode = new HTMLDocument(node);
118 BNode evt = valueFactory.createBNode();
119 addURIProperty(evt, RDF.TYPE, vICAL.getResource(component));
120 addTextProps(compoNode, evt);
121 addUrl(compoNode, evt);
122 addRRule(compoNode, evt);
123 addOrganizer(compoNode, evt);
124 addUid(compoNode, evt);
125 addBNodeProperty(cal, vICAL.component, evt);
126
127 final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
128 tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() );
129
130 return true;
131 }
132
133 private void addUid(HTMLDocument compoNode, Resource evt) {
134 TextField url = compoNode.getSingularUrlField("uid");
135 conditionallyAddStringProperty(
136 compoNode.getDocument(),
137 evt, vICAL.uid, url.value()
138 );
139 }
140
141 private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
142 TextField url = compoNode.getSingularUrlField("url");
143 if ("".equals(url.value())) return;
144 addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
145 }
146
147 private void addRRule(HTMLDocument compoNode, Resource evt) {
148 for (Node rule : compoNode.findAllByClassName("rrule")) {
149 BNode rrule = valueFactory.createBNode();
150 addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
151 TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
152 conditionallyAddStringProperty(
153 freq.source(),
154 rrule, vICAL.freq, freq.value()
155 );
156 addBNodeProperty(
157 rule,
158 evt, vICAL.rrule, rrule
159 );
160 }
161 }
162
163 private void addOrganizer(HTMLDocument compoNode, Resource evt) {
164 for (Node organizer : compoNode.findAllByClassName("organizer")) {
165
166 BNode blank = valueFactory.createBNode();
167 TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
168 conditionallyAddStringProperty(
169 compoNode.getDocument(),
170 blank, vICAL.calAddress, mail.value()
171 );
172 addBNodeProperty(
173 organizer,
174 evt, vICAL.organizer, blank
175 );
176 }
177 }
178
179 private void addTextProps(HTMLDocument node, Resource evt) {
180 for (String date : textSingularProps) {
181 HTMLDocument.TextField val = node.getSingularTextField(date);
182 conditionallyAddStringProperty(
183 val.source(),
184 evt, vICAL.getProperty(date), val.value()
185 );
186 }
187
188 for (String date : textDateProps) {
189 HTMLDocument.TextField val = node.getSingularTextField(date);
190 try {
191 conditionallyAddStringProperty(
192 val.source(),
193 evt,
194 vICAL.getProperty(date),
195 RDFUtils.getXSDDate(
196 val.value(),
197 DATE_FORMAT
198 )
199 );
200 } catch (ParseException e) {
201
202 conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value());
203 } catch (DatatypeConfigurationException e) {
204
205 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
206 }
207 }
208
209 HTMLDocument.TextField[] values = node.getPluralTextField("category");
210 for (TextField val : values) {
211 conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
212 }
213 }
214
215 }