View Javadoc

1   /**
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *          http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  
18  package org.deri.any23.extractor.xpath;
19  
20  import org.deri.any23.extractor.ExtractionContext;
21  import org.deri.any23.extractor.ExtractionException;
22  import org.deri.any23.extractor.ExtractionParameters;
23  import org.deri.any23.extractor.ExtractionResult;
24  import org.deri.any23.extractor.Extractor;
25  import org.deri.any23.extractor.ExtractorDescription;
26  import org.deri.any23.extractor.ExtractorFactory;
27  import org.deri.any23.extractor.SimpleExtractorFactory;
28  import org.openrdf.model.URI;
29  import org.w3c.dom.Document;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Arrays;
34  import java.util.List;
35  
36  /**
37   * Implementation of an {@link org.deri.any23.extractor.Extractor.TagSoupDOMExtractor} able to
38   * apply {@link XPathExtractionRule}s and generate <i>quads</i>.
39   *
40   * @see XPathExtractionRule
41   * @author Michele Mostarda (mostarda@fbk.eu)
42   */
43  public class XPathExtractor implements Extractor.TagSoupDOMExtractor {
44  
45      public final static String NAME = "html-xpath";
46  
47      public final static ExtractorFactory<XPathExtractor> factory =
48              SimpleExtractorFactory.create(
49                      NAME,
50                      null,
51                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
52                      null,
53                      XPathExtractor.class
54              );
55  
56      private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<XPathExtractionRule>();
57  
58      public XPathExtractor(List<XPathExtractionRule> rules) {
59          xPathExtractionRules.addAll(rules);
60      }
61  
62      public void add(XPathExtractionRule rule) {
63          xPathExtractionRules.add(rule);
64      }
65  
66      public void remove(XPathExtractionRule rule) {
67          xPathExtractionRules.remove(rule);
68      }
69  
70      public boolean contains(XPathExtractionRule rule) {
71          return xPathExtractionRules.contains(rule);
72      }
73  
74      public void run(
75              ExtractionParameters extractionParameters,
76              ExtractionContext extractionContext,
77              Document in,
78              ExtractionResult out
79      )
80      throws IOException, ExtractionException {
81          final URI documentURI = extractionContext.getDocumentURI();
82          for(XPathExtractionRule rule : xPathExtractionRules) {
83              if(rule.acceptURI(documentURI)) {
84                  rule.process(in, out);
85              }
86          }
87      }
88  
89      public ExtractorDescription getDescription() {
90          return factory;
91      }
92  
93  }