View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.extractor.csv;
18  
19  import org.apache.commons.csv.CSVParser;
20  import org.deri.any23.extractor.ExtractionContext;
21  import org.deri.any23.extractor.ExtractionException;
22  import org.deri.any23.extractor.ExtractionParameters;
23  import org.deri.any23.extractor.ExtractionResult;
24  import org.deri.any23.extractor.Extractor;
25  import org.deri.any23.extractor.ExtractorDescription;
26  import org.deri.any23.extractor.ExtractorFactory;
27  import org.deri.any23.extractor.SimpleExtractorFactory;
28  import org.deri.any23.rdf.RDFUtils;
29  import org.deri.any23.vocab.CSV;
30  import org.openrdf.model.URI;
31  import org.openrdf.model.Value;
32  import org.openrdf.model.impl.LiteralImpl;
33  import org.openrdf.model.impl.URIImpl;
34  import org.openrdf.model.vocabulary.RDF;
35  import org.openrdf.model.vocabulary.RDFS;
36  import org.openrdf.model.vocabulary.XMLSchema;
37  
38  import java.io.IOException;
39  import java.io.InputStream;
40  import java.util.Arrays;
41  
42  /**
43   * This extractor produces <i>RDF</i> from a <i>CSV file</i> .
44   * It automatically detects fields <i>delimiter</i>. If not able uses
45   * the one provided in the <i>Any23</i> configuration.
46   *
47   * @see {@link CSVReaderBuilder}
48   * @author Davide Palmisano ( dpalmisano@gmail.com )
49   */
50  public class CSVExtractor implements Extractor.ContentExtractor {
51  
52      private CSVParser csvParser;
53  
54      private URI[] headerURIs;
55  
56      private CSV csv = CSV.getInstance();
57  
58      public final static ExtractorFactory<CSVExtractor> factory =
59              SimpleExtractorFactory.create(
60                      "csv",
61                      null,
62                      Arrays.asList(
63                              "text/csv;q=0.1"
64                      ),
65                      null,
66                      CSVExtractor.class
67              );
68  
69      /**
70       * {@inheritDoc}
71       */
72      public void setStopAtFirstError(boolean f) {
73      }
74  
75      /**
76       * {@inheritDoc}
77       */
78      public void run(
79              ExtractionParameters extractionParameters,
80              ExtractionContext extractionContext,
81              InputStream in
82              , ExtractionResult out
83      ) throws IOException, ExtractionException {
84          final URI documentURI = extractionContext.getDocumentURI();
85  
86          // build the parser
87          csvParser = CSVReaderBuilder.build(in);
88  
89          // get the header and generate the URIs for column names
90          String[] header = csvParser.getLine();
91          headerURIs = processHeader(header, documentURI);
92  
93          // write triples to describe properties
94          writeHeaderPropertiesMetadata(header, out);
95  
96          String[] nextLine;
97          int index = 0;
98          while ((nextLine = csvParser.getLine()) != null) {
99              URI rowSubject = RDFUtils.uri(
100                     documentURI.toString(),
101                     "row/" + index
102             );
103             // add a row type
104             out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
105             // for each row produce its statements
106             produceRowStatements(rowSubject, nextLine, out);
107             // link the row to the document
108             out.writeTriple(documentURI, csv.row, rowSubject);
109             // the progressive row number
110             out.writeTriple(
111                     rowSubject,
112                     csv.rowPosition,
113                     new LiteralImpl(String.valueOf(index))
114             );
115             index++;
116         }
117         // add some CSV metadata such as the number of rows and columns
118         addTableMetadataStatements(
119                 documentURI,
120                 out,
121                 index,
122                 headerURIs.length
123         );
124     }
125 
126     /**
127      * @param number
128      * @return
129      */
130     private boolean isNumber(String number) {
131         try {
132             Double.valueOf(number);
133             return true;
134         } catch (NumberFormatException e) {
135             return false;
136         }
137     }
138 
139     /**
140      * It writes <i>RDF</i> statements representing properties of the header.
141      *
142      * @param header
143      * @param out
144      */
145     private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
146         int index = 0;
147         for (URI singleHeader : headerURIs) {
148             if (index > headerURIs.length) {
149                 break;
150             }
151             if (!RDFUtils.isAbsoluteURI(header[index])) {
152                 out.writeTriple(
153                         singleHeader,
154                         RDFS.LABEL,
155                         new LiteralImpl(header[index])
156                 );
157             }
158             out.writeTriple(
159                     singleHeader,
160                     csv.columnPosition,
161                     new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
162             );
163             index++;
164         }
165     }
166 
167     /**
168      * It process the first row of the file, returning a list of {@link URI}s representing
169      * the properties for each column. If a value of the header is an absolute <i>URI</i>
170      * then it leave it as is. Otherwise the {@link CSV} vocabulary is used.
171      *
172      * @param header
173      * @return an array of {@link URI}s identifying the column names.
174      */
175     private URI[] processHeader(String[] header, URI documentURI) {
176         URI[] result = new URI[header.length];
177         int index = 0;
178         for (String h : header) {
179             String candidate = h.trim();
180             if (RDFUtils.isAbsoluteURI(candidate)) {
181                 result[index] = new URIImpl(candidate);
182             } else {
183                 result[index] = normalize(candidate, documentURI);
184             }
185             index++;
186         }
187         return result;
188     }
189 
190     private URI normalize(String toBeNormalized, URI documentURI) {
191         String candidate = toBeNormalized;
192         candidate = candidate.trim().toLowerCase().replace("?", "").replace("&", "");
193         String[] tokens = candidate.split(" ");
194         candidate = tokens[0];
195         for (int i = 1; i < tokens.length; i++) {
196             String firstChar = ("" + tokens[i].charAt(0)).toUpperCase();
197             candidate += firstChar + tokens[i].substring(1);
198         }
199         return new URIImpl(documentURI.toString() + candidate);
200     }
201 
202     /**
203      * It writes on the provided {@link ExtractionResult}, the </>RDF statements</>
204      * representing the row <i>cell</i>. If a  row <i>cell</i> is an absolute <i>URI</i>
205      * then an object property is written, literal otherwise.
206      *
207      * @param rowSubject
208      * @param values
209      * @param out
210      */
211     private void produceRowStatements(
212             URI rowSubject,
213             String[] values,
214             ExtractionResult out
215     ) {
216         int index = 0;
217         for (String cell : values) {
218             if (index >= headerURIs.length) {
219                 // there are some row cells that don't have an associated column name
220                 break;
221             }
222             if (cell.equals("")) {
223                 continue;
224             }
225             URI predicate = headerURIs[index];
226             Value object = getObjectFromCell(cell);
227             out.writeTriple(rowSubject, predicate, object);
228             index++;
229         }
230     }
231 
232     private Value getObjectFromCell(String cell) {
233         Value object;
234         cell = cell.trim();
235         if (RDFUtils.isAbsoluteURI(cell)) {
236             object = new URIImpl(cell);
237         } else {
238             URI datatype = XMLSchema.STRING;
239             if (isNumber(cell)) {
240                 datatype = XMLSchema.INTEGER;
241             }
242             object = new LiteralImpl(cell, datatype);
243         }
244         return object;
245     }
246 
247     /**
248      * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i>
249      * on generic properties of the <i>CSV</i> file, such as number of rows and columns.
250      *
251      * @param documentURI
252      * @param out
253      * @param numberOfRows
254      * @param numberOfColumns
255      */
256     private void addTableMetadataStatements(
257             URI documentURI,
258             ExtractionResult out,
259             int numberOfRows,
260             int numberOfColumns) {
261         out.writeTriple(
262                 documentURI,
263                 csv.numberOfRows,
264                 new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
265         );
266         out.writeTriple(
267                 documentURI,
268                 csv.numberOfColumns,
269                 new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
270         );
271     }
272 
273     /**
274      * {@inheritDoc}
275      */
276     public ExtractorDescription getDescription() {
277         return factory;
278     }
279 }