1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.extractor.csv;
18
19 import org.apache.commons.csv.CSVParser;
20 import org.deri.any23.extractor.ExtractionContext;
21 import org.deri.any23.extractor.ExtractionException;
22 import org.deri.any23.extractor.ExtractionParameters;
23 import org.deri.any23.extractor.ExtractionResult;
24 import org.deri.any23.extractor.Extractor;
25 import org.deri.any23.extractor.ExtractorDescription;
26 import org.deri.any23.extractor.ExtractorFactory;
27 import org.deri.any23.extractor.SimpleExtractorFactory;
28 import org.deri.any23.rdf.RDFUtils;
29 import org.deri.any23.vocab.CSV;
30 import org.openrdf.model.URI;
31 import org.openrdf.model.Value;
32 import org.openrdf.model.impl.LiteralImpl;
33 import org.openrdf.model.impl.URIImpl;
34 import org.openrdf.model.vocabulary.RDF;
35 import org.openrdf.model.vocabulary.RDFS;
36 import org.openrdf.model.vocabulary.XMLSchema;
37
38 import java.io.IOException;
39 import java.io.InputStream;
40 import java.util.Arrays;
41
42
43
44
45
46
47
48
49
50 public class CSVExtractor implements Extractor.ContentExtractor {
51
52 private CSVParser csvParser;
53
54 private URI[] headerURIs;
55
56 private CSV csv = CSV.getInstance();
57
58 public final static ExtractorFactory<CSVExtractor> factory =
59 SimpleExtractorFactory.create(
60 "csv",
61 null,
62 Arrays.asList(
63 "text/csv;q=0.1"
64 ),
65 null,
66 CSVExtractor.class
67 );
68
69
70
71
72 public void setStopAtFirstError(boolean f) {
73 }
74
75
76
77
78 public void run(
79 ExtractionParameters extractionParameters,
80 ExtractionContext extractionContext,
81 InputStream in
82 , ExtractionResult out
83 ) throws IOException, ExtractionException {
84 final URI documentURI = extractionContext.getDocumentURI();
85
86
87 csvParser = CSVReaderBuilder.build(in);
88
89
90 String[] header = csvParser.getLine();
91 headerURIs = processHeader(header, documentURI);
92
93
94 writeHeaderPropertiesMetadata(header, out);
95
96 String[] nextLine;
97 int index = 0;
98 while ((nextLine = csvParser.getLine()) != null) {
99 URI rowSubject = RDFUtils.uri(
100 documentURI.toString(),
101 "row/" + index
102 );
103
104 out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
105
106 produceRowStatements(rowSubject, nextLine, out);
107
108 out.writeTriple(documentURI, csv.row, rowSubject);
109
110 out.writeTriple(
111 rowSubject,
112 csv.rowPosition,
113 new LiteralImpl(String.valueOf(index))
114 );
115 index++;
116 }
117
118 addTableMetadataStatements(
119 documentURI,
120 out,
121 index,
122 headerURIs.length
123 );
124 }
125
126
127
128
129
130 private boolean isNumber(String number) {
131 try {
132 Double.valueOf(number);
133 return true;
134 } catch (NumberFormatException e) {
135 return false;
136 }
137 }
138
139
140
141
142
143
144
145 private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
146 int index = 0;
147 for (URI singleHeader : headerURIs) {
148 if (index > headerURIs.length) {
149 break;
150 }
151 if (!RDFUtils.isAbsoluteURI(header[index])) {
152 out.writeTriple(
153 singleHeader,
154 RDFS.LABEL,
155 new LiteralImpl(header[index])
156 );
157 }
158 out.writeTriple(
159 singleHeader,
160 csv.columnPosition,
161 new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
162 );
163 index++;
164 }
165 }
166
167
168
169
170
171
172
173
174
175 private URI[] processHeader(String[] header, URI documentURI) {
176 URI[] result = new URI[header.length];
177 int index = 0;
178 for (String h : header) {
179 String candidate = h.trim();
180 if (RDFUtils.isAbsoluteURI(candidate)) {
181 result[index] = new URIImpl(candidate);
182 } else {
183 result[index] = normalize(candidate, documentURI);
184 }
185 index++;
186 }
187 return result;
188 }
189
190 private URI normalize(String toBeNormalized, URI documentURI) {
191 String candidate = toBeNormalized;
192 candidate = candidate.trim().toLowerCase().replace("?", "").replace("&", "");
193 String[] tokens = candidate.split(" ");
194 candidate = tokens[0];
195 for (int i = 1; i < tokens.length; i++) {
196 String firstChar = ("" + tokens[i].charAt(0)).toUpperCase();
197 candidate += firstChar + tokens[i].substring(1);
198 }
199 return new URIImpl(documentURI.toString() + candidate);
200 }
201
202
203
204
205
206
207
208
209
210
211 private void produceRowStatements(
212 URI rowSubject,
213 String[] values,
214 ExtractionResult out
215 ) {
216 int index = 0;
217 for (String cell : values) {
218 if (index >= headerURIs.length) {
219
220 break;
221 }
222 if (cell.equals("")) {
223 continue;
224 }
225 URI predicate = headerURIs[index];
226 Value object = getObjectFromCell(cell);
227 out.writeTriple(rowSubject, predicate, object);
228 index++;
229 }
230 }
231
232 private Value getObjectFromCell(String cell) {
233 Value object;
234 cell = cell.trim();
235 if (RDFUtils.isAbsoluteURI(cell)) {
236 object = new URIImpl(cell);
237 } else {
238 URI datatype = XMLSchema.STRING;
239 if (isNumber(cell)) {
240 datatype = XMLSchema.INTEGER;
241 }
242 object = new LiteralImpl(cell, datatype);
243 }
244 return object;
245 }
246
247
248
249
250
251
252
253
254
255
256 private void addTableMetadataStatements(
257 URI documentURI,
258 ExtractionResult out,
259 int numberOfRows,
260 int numberOfColumns) {
261 out.writeTriple(
262 documentURI,
263 csv.numberOfRows,
264 new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
265 );
266 out.writeTriple(
267 documentURI,
268 csv.numberOfColumns,
269 new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
270 );
271 }
272
273
274
275
276 public ExtractorDescription getDescription() {
277 return factory;
278 }
279 }