View Javadoc

1   /*
2    * Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.deri.any23.servlet;
18  
19  import org.deri.any23.configuration.DefaultConfiguration;
20  import org.deri.any23.extractor.ExtractionParameters;
21  import org.deri.any23.http.HTTPClient;
22  import org.deri.any23.servlet.conneg.Any23Negotiator;
23  import org.deri.any23.servlet.conneg.MediaRangeSpec;
24  import org.deri.any23.source.ByteArrayDocumentSource;
25  import org.deri.any23.source.DocumentSource;
26  import org.deri.any23.source.HTTPDocumentSource;
27  import org.deri.any23.source.StringDocumentSource;
28  
29  import javax.servlet.ServletException;
30  import javax.servlet.http.HttpServlet;
31  import javax.servlet.http.HttpServletRequest;
32  import javax.servlet.http.HttpServletResponse;
33  import java.io.IOException;
34  import java.net.URI;
35  import java.net.URISyntaxException;
36  import java.util.regex.Pattern;
37  
38  import static org.deri.any23.extractor.ExtractionParameters.ValidationMode;
39  
40  /**
41   * A <i>Servlet</i> that fetches a client-specified <i>URI</i>,
42   * RDFizes the content, and returns it in a format chosen by the client.
43   *
44   * @author Gabriele Renzi
45   * @author Richard Cyganiak (richard@cyganiak.de)
46   */
47  public class Servlet extends HttpServlet {
48  
49      public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/";
50  
51      private static final long serialVersionUID = 8207685628715421336L;
52  
53      // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
54      private final static Pattern schemeRegex =
55              Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
56  
57      @Override
58      protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
59          final WebResponder responder = new WebResponder(this, resp);
60          final String format = getFormatFromRequestOrNegotiation(req);
61          final boolean report = isReport(req);
62          if (format == null) {
63              responder.sendError(406, "Client accept header does not include a supported output format", report);
64              return;
65          }
66          final String uri = getInputURIFromRequest(req);
67          if (uri == null) {
68              responder.sendError(404, "Missing URI in GET request. Try /format/http://example.com/myfile", report);
69              return;
70          }
71          final ExtractionParameters eps = getExtractionParameters(req);
72          responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report);
73      }
74  
75      @Override
76      protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
77          final WebResponder responder = new WebResponder(this, resp);
78          final boolean report = isReport(req);
79          if (req.getContentType() == null) {
80              responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
81              return;
82          }
83          final String uri = getInputURIFromRequest(req);
84          final String format = getFormatFromRequestOrNegotiation(req);
85          if (format == null) {
86              responder.sendError(406, "Client accept header does not include a supported output format", report);
87              return;
88          }
89          final ExtractionParameters eps = getExtractionParameters(req);
90          if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
91              if (uri != null) {
92                  log("Attempting conversion to '" + format + "' from URI <" + uri + ">");
93                  responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report);
94                  return;
95              }
96              if (req.getParameter("body") == null) {
97                  responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
98                  return;
99              }
100             String type = null;
101             if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
102                 type = req.getParameter("type");
103             }
104             log("Attempting conversion to '" + format + "' from body parameter");
105             responder.runExtraction(
106                     new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_URI, type),
107                     eps,
108                     format,
109                     report
110             );
111             return;
112         }
113         log("Attempting conversion to '" + format + "' from POST body");
114         responder.runExtraction(
115                 new ByteArrayDocumentSource(
116                         req.getInputStream(),
117                         Servlet.DEFAULT_BASE_URI,
118                         getContentTypeHeader(req)
119                 ),
120                 eps,
121                 format,
122                 report
123         );
124     }
125 
126     private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
127         String fromRequest = getFormatFromRequest(request);
128         if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
129             return fromRequest;
130         }
131         MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
132         if (result == null) {
133             return null;
134         }
135         if ("text/turtle".equals(result.getMediaType())) {
136             return "turtle";
137         }
138         if ("text/rdf+n3".equals(result.getMediaType())) {
139             return "n3";
140         }
141         if ("text/rdf+nq".equals(result.getMediaType())) {
142             return "nq";
143         }
144         if ("application/rdf+xml".equals(result.getMediaType())) {
145             return "rdf";
146         }
147         if ("text/plain".equals(result.getMediaType())) {
148             return "nt";
149         }
150         return "turtle";    // shouldn't happen
151     }
152 
153     private String getFormatFromRequest(HttpServletRequest request) {
154         if (request.getPathInfo() == null) return "best";
155         String[] args = request.getPathInfo().split("/", 3);
156         if (args.length < 2 || "".equals(args[1])) {
157             if (request.getParameter("format") == null) {
158                 return "best";
159             } else {
160                 return request.getParameter("format");
161             }
162         }
163         return args[1];
164     }
165 
166     private String getInputURIFromRequest(HttpServletRequest request) {
167         if (request.getPathInfo() == null) return null;
168         String[] args = request.getPathInfo().split("/", 3);
169         if (args.length < 3) {
170             if (request.getParameter("uri") != null) {
171                 return request.getParameter("uri").trim();
172             }
173             if (request.getParameter("url") != null) {
174                 return request.getParameter("url").trim();
175             }
176             return null;
177         }
178         String uri = args[2];
179         if (request.getQueryString() != null) {
180             uri = uri + "?" + request.getQueryString();
181         }
182         if (!hasScheme(uri)) {
183             uri = "http://" + uri;
184         } else if (hasOnlySingleSlashAfterScheme(uri)) {
185             // This is to work around an issue where Tomcat 6.0.18 is
186             // too smart for us. Tomcat normalizes double-slashes in
187             // the path, and thus turns "http://" into "http:/" if it
188             // occurs in the path. So we restore the double slash.
189             uri = uri.replaceFirst(":/", "://");
190         }
191         return uri.trim();
192     }
193 
194 
195     private boolean hasScheme(String uri) {
196         return schemeRegex.matcher(uri).find();
197     }
198 
199     private final static Pattern schemeAndSingleSlashRegex =
200             Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
201 
202     private boolean hasOnlySingleSlashAfterScheme(String uri) {
203         return schemeAndSingleSlashRegex.matcher(uri).find();
204     }
205 
206     private String getContentTypeHeader(HttpServletRequest req) {
207         if (req.getHeader("Content-Type") == null) return null;
208         if ("".equals(req.getHeader("Content-Type"))) return null;
209         String contentType = req.getHeader("Content-Type");
210         // strip off parameters such as ";charset=UTF-8"
211         int index = contentType.indexOf(";");
212         if (index == -1) return contentType;
213         return contentType.substring(0, index);
214     }
215 
216     private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
217     throws IOException {
218         try {
219             if (!isValidURI(uri)) {
220                 throw new URISyntaxException(uri, "@@@");
221             }
222             return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
223         } catch (URISyntaxException ex) {
224             responder.sendError(400, "Invalid input URI " + uri, report);
225             return null;
226         }
227     }
228 
229     protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
230             throws IOException, URISyntaxException {
231         return new HTTPDocumentSource(httpClient, uri);
232     }
233 
234     private boolean isValidURI(String s) {
235         try {
236             URI uri = new URI(s);
237             if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
238                 return false;
239             }
240         } catch (URISyntaxException e) {
241             return false;
242         }
243         return true;
244     }
245 
246     // TODO: add possibility to specify validation={none|validate|validate+fix}
247     private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
248         final ValidationMode mode =
249                 request.getParameter("fix") != null
250                         ?
251                 ValidationMode.ValidateAndFix
252                         :
253                 ValidationMode.None;
254         return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
255     }
256 
257     private boolean isReport(HttpServletRequest request) {
258         return request.getParameter("report") != null;
259     }
260     
261 }