View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.html;
18  
19  import org.deri.any23.extractor.ExtractionContext;
20  import org.deri.any23.extractor.ExtractionException;
21  import org.deri.any23.extractor.ExtractionParameters;
22  import org.deri.any23.extractor.ExtractionResult;
23  import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
24  import org.deri.any23.extractor.ExtractorDescription;
25  import org.deri.any23.extractor.ExtractorFactory;
26  import org.deri.any23.extractor.SimpleExtractorFactory;
27  import org.deri.any23.rdf.PopularPrefixes;
28  import org.deri.any23.vocab.XHTML;
29  import org.openrdf.model.URI;
30  import org.openrdf.model.ValueFactory;
31  import org.openrdf.model.impl.ValueFactoryImpl;
32  import org.w3c.dom.Document;
33  import org.w3c.dom.Node;
34  
35  import java.io.IOException;
36  import java.util.Arrays;
37  import java.util.List;
38  
39  /**
40   * This {@link org.deri.any23.extractor.Extractor.TagSoupDOMExtractor} implementation
41   * retrieves the <code>LINK</code>s declared within the <code>HTML/HEAD</code> page header.
42   */
43  public class HeadLinkExtractor implements TagSoupDOMExtractor {
44  
45      public void run(
46              ExtractionParameters extractionParameters,
47              ExtractionContext extractionContext,
48              Document in,
49              ExtractionResult out
50      ) throws IOException, ExtractionException {
51          HTMLDocument html = new HTMLDocument(in);
52          ValueFactory vf = ValueFactoryImpl.getInstance();
53  
54          final List<Node> headLinkNodes = DomUtils.findAll(
55                  in,
56                  "/HTML/HEAD/LINK[(" +
57                          "@type='application/rdf+xml' or " +
58                          "@type='text/rdf' or " +
59                          "@type='application/x-turtle' or " +
60                          "@type='application/turtle' or " +
61                          "@type='text/turtle' or " +
62                          "@type='text/rdf+n3'" +
63                          ") and @href and @rel]"
64          );
65          for (Node node : headLinkNodes) {
66              final URI href = html.resolveURI(DomUtils.find(node, "@href"));
67              final String rel = DomUtils.find(node, "@rel");
68              out.writeTriple(
69                      extractionContext.getDocumentURI(),
70                      vf.createURI(XHTML.NS + rel),
71                      href
72              );
73              final String title = DomUtils.find(node, "@title");
74              if (title != null && !"".equals(title)) {
75                  out.writeTriple(
76                          href,
77                          factory.getPrefixes().expand("dcterms:title"),
78                          vf.createLiteral(title)
79                  );
80              }
81              final String type = DomUtils.find(node, "@type");
82              if (type != null && !"".equals(type)) {
83                  out.writeTriple(
84                          href,
85                          factory.getPrefixes().expand("dcterms:format"),
86                          vf.createLiteral(type)
87                  );
88              }
89          }
90      }
91  
92      public ExtractorDescription getDescription() {
93          return factory;
94      }
95  
96      public final static ExtractorFactory<HeadLinkExtractor> factory =
97              SimpleExtractorFactory.create(
98                      "html-head-links",
99                      PopularPrefixes.createSubset("xhtml", "dcterms"),
100                     Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
101                     null,
102                     HeadLinkExtractor.class);
103 }