1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.deri.any23.servlet;
18
19 import org.deri.any23.configuration.DefaultConfiguration;
20 import org.deri.any23.extractor.ExtractionParameters;
21 import org.deri.any23.http.HTTPClient;
22 import org.deri.any23.servlet.conneg.Any23Negotiator;
23 import org.deri.any23.servlet.conneg.MediaRangeSpec;
24 import org.deri.any23.source.ByteArrayDocumentSource;
25 import org.deri.any23.source.DocumentSource;
26 import org.deri.any23.source.HTTPDocumentSource;
27 import org.deri.any23.source.StringDocumentSource;
28
29 import javax.servlet.ServletException;
30 import javax.servlet.http.HttpServlet;
31 import javax.servlet.http.HttpServletRequest;
32 import javax.servlet.http.HttpServletResponse;
33 import java.io.IOException;
34 import java.net.URI;
35 import java.net.URISyntaxException;
36 import java.util.regex.Pattern;
37
38 import static org.deri.any23.extractor.ExtractionParameters.ValidationMode;
39
40
41
42
43
44
45
46
47 public class Servlet extends HttpServlet {
48
49 public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/";
50
51 private static final long serialVersionUID = 8207685628715421336L;
52
53
54 private final static Pattern schemeRegex =
55 Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
56
57 @Override
58 protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
59 final WebResponder responder = new WebResponder(this, resp);
60 final String format = getFormatFromRequestOrNegotiation(req);
61 final boolean report = isReport(req);
62 if (format == null) {
63 responder.sendError(406, "Client accept header does not include a supported output format", report);
64 return;
65 }
66 final String uri = getInputURIFromRequest(req);
67 if (uri == null) {
68 responder.sendError(404, "Missing URI in GET request. Try /format/http://example.com/myfile", report);
69 return;
70 }
71 final ExtractionParameters eps = getExtractionParameters(req);
72 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report);
73 }
74
75 @Override
76 protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
77 final WebResponder responder = new WebResponder(this, resp);
78 final boolean report = isReport(req);
79 if (req.getContentType() == null) {
80 responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
81 return;
82 }
83 final String uri = getInputURIFromRequest(req);
84 final String format = getFormatFromRequestOrNegotiation(req);
85 if (format == null) {
86 responder.sendError(406, "Client accept header does not include a supported output format", report);
87 return;
88 }
89 final ExtractionParameters eps = getExtractionParameters(req);
90 if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
91 if (uri != null) {
92 log("Attempting conversion to '" + format + "' from URI <" + uri + ">");
93 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report);
94 return;
95 }
96 if (req.getParameter("body") == null) {
97 responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
98 return;
99 }
100 String type = null;
101 if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
102 type = req.getParameter("type");
103 }
104 log("Attempting conversion to '" + format + "' from body parameter");
105 responder.runExtraction(
106 new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_URI, type),
107 eps,
108 format,
109 report
110 );
111 return;
112 }
113 log("Attempting conversion to '" + format + "' from POST body");
114 responder.runExtraction(
115 new ByteArrayDocumentSource(
116 req.getInputStream(),
117 Servlet.DEFAULT_BASE_URI,
118 getContentTypeHeader(req)
119 ),
120 eps,
121 format,
122 report
123 );
124 }
125
126 private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
127 String fromRequest = getFormatFromRequest(request);
128 if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
129 return fromRequest;
130 }
131 MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
132 if (result == null) {
133 return null;
134 }
135 if ("text/turtle".equals(result.getMediaType())) {
136 return "turtle";
137 }
138 if ("text/rdf+n3".equals(result.getMediaType())) {
139 return "n3";
140 }
141 if ("text/rdf+nq".equals(result.getMediaType())) {
142 return "nq";
143 }
144 if ("application/rdf+xml".equals(result.getMediaType())) {
145 return "rdf";
146 }
147 if ("text/plain".equals(result.getMediaType())) {
148 return "nt";
149 }
150 return "turtle";
151 }
152
153 private String getFormatFromRequest(HttpServletRequest request) {
154 if (request.getPathInfo() == null) return "best";
155 String[] args = request.getPathInfo().split("/", 3);
156 if (args.length < 2 || "".equals(args[1])) {
157 if (request.getParameter("format") == null) {
158 return "best";
159 } else {
160 return request.getParameter("format");
161 }
162 }
163 return args[1];
164 }
165
166 private String getInputURIFromRequest(HttpServletRequest request) {
167 if (request.getPathInfo() == null) return null;
168 String[] args = request.getPathInfo().split("/", 3);
169 if (args.length < 3) {
170 if (request.getParameter("uri") != null) {
171 return request.getParameter("uri").trim();
172 }
173 if (request.getParameter("url") != null) {
174 return request.getParameter("url").trim();
175 }
176 return null;
177 }
178 String uri = args[2];
179 if (request.getQueryString() != null) {
180 uri = uri + "?" + request.getQueryString();
181 }
182 if (!hasScheme(uri)) {
183 uri = "http://" + uri;
184 } else if (hasOnlySingleSlashAfterScheme(uri)) {
185
186
187
188
189 uri = uri.replaceFirst(":/", "://");
190 }
191 return uri.trim();
192 }
193
194
195 private boolean hasScheme(String uri) {
196 return schemeRegex.matcher(uri).find();
197 }
198
199 private final static Pattern schemeAndSingleSlashRegex =
200 Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
201
202 private boolean hasOnlySingleSlashAfterScheme(String uri) {
203 return schemeAndSingleSlashRegex.matcher(uri).find();
204 }
205
206 private String getContentTypeHeader(HttpServletRequest req) {
207 if (req.getHeader("Content-Type") == null) return null;
208 if ("".equals(req.getHeader("Content-Type"))) return null;
209 String contentType = req.getHeader("Content-Type");
210
211 int index = contentType.indexOf(";");
212 if (index == -1) return contentType;
213 return contentType.substring(0, index);
214 }
215
216 private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
217 throws IOException {
218 try {
219 if (!isValidURI(uri)) {
220 throw new URISyntaxException(uri, "@@@");
221 }
222 return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
223 } catch (URISyntaxException ex) {
224 responder.sendError(400, "Invalid input URI " + uri, report);
225 return null;
226 }
227 }
228
229 protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
230 throws IOException, URISyntaxException {
231 return new HTTPDocumentSource(httpClient, uri);
232 }
233
234 private boolean isValidURI(String s) {
235 try {
236 URI uri = new URI(s);
237 if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
238 return false;
239 }
240 } catch (URISyntaxException e) {
241 return false;
242 }
243 return true;
244 }
245
246
247 private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
248 final ValidationMode mode =
249 request.getParameter("fix") != null
250 ?
251 ValidationMode.ValidateAndFix
252 :
253 ValidationMode.None;
254 return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
255 }
256
257 private boolean isReport(HttpServletRequest request) {
258 return request.getParameter("report") != null;
259 }
260
261 }