View Javadoc

1   /**
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  
18  package org.deri.any23.extractor.html;
19  
20  import org.deri.any23.extractor.ExtractionContext;
21  import org.deri.any23.extractor.ExtractionException;
22  import org.deri.any23.extractor.ExtractionParameters;
23  import org.deri.any23.extractor.ExtractionResult;
24  import org.deri.any23.extractor.Extractor.TagSoupDOMExtractor;
25  import org.deri.any23.extractor.ExtractorDescription;
26  import org.deri.any23.extractor.ExtractorFactory;
27  import org.deri.any23.extractor.SimpleExtractorFactory;
28  import org.deri.any23.rdf.Any23ValueFactoryWrapper;
29  import org.deri.any23.rdf.PopularPrefixes;
30  import org.deri.any23.vocab.DCTERMS;
31  import org.openrdf.model.impl.ValueFactoryImpl;
32  import org.w3c.dom.Document;
33  
34  import java.io.IOException;
35  import java.util.Arrays;
36  
37  /**
38   * Extracts the value of the <title> element of an
39   * HTML or XHTML page.
40   *
41   * @author Richard Cyganiak (richard@cyganiak.de)
42   */
43  public class TitleExtractor implements TagSoupDOMExtractor {
44  
45      public static final String NAME = "html-head-title";
46  
47      private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
48  
49      public final static ExtractorFactory<TitleExtractor> factory =
50              SimpleExtractorFactory.create(
51                      NAME,
52                      PopularPrefixes.createSubset("dcterms"),
53                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
54                      "example-title.html",
55                      TitleExtractor.class
56              );
57  
58      public void run(
59              ExtractionParameters extractionParameters,
60              ExtractionContext extractionContext,
61              Document in,
62              ExtractionResult out
63      ) throws IOException, ExtractionException {
64          final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
65              ValueFactoryImpl.getInstance(), out, extractionContext.getDefaultLanguage()
66          );
67          
68          try {
69              String title = DomUtils.find(in, "/HTML/HEAD/TITLE/text()").trim();
70              if (title != null && (title.length() != 0)) {
71                  out.writeTriple(extractionContext.getDocumentURI(), vDCTERMS.title, valueFactory.createLiteral(title));
72              }
73          } finally {
74              valueFactory.setErrorReporter(null);
75          }
76      }
77  
78      public ExtractorDescription getDescription() {
79          return factory;
80      }
81      
82  }