11/*
2- * Copyright 2013, 2022 Deutsche Nationalbibliothek et al
2+ * Copyright 2013, 2023 Deutsche Nationalbibliothek et al
33 *
44 * Licensed under the Apache License, Version 2.0 the "License";
55 * you may not use this file except in compliance with the License.
3232import java .io .SequenceInputStream ;
3333import java .net .HttpURLConnection ;
3434import java .net .URL ;
35+ import java .net .URLDecoder ;
3536import java .util .Arrays ;
3637import java .util .HashMap ;
3738import java .util .Map ;
3839import java .util .regex .Pattern ;
40+ import java .util .zip .GZIPInputStream ;
3941
4042/**
4143 * Opens an {@link HttpURLConnection} and passes a reader to the receiver.
4244 *
4345 * @author Christoph Böhme
4446 * @author Jan Schnasse
4547 * @author Jens Wille
48+ * @author Pascal Christoph (dr0i)
4649 */
47- @ Description ("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\ n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding `) = `UTF-8`, `errorPrefix ` = `ERROR: `." )
50+ @ Description ("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding`, `Content-Encoding` and `Content-Type`, as well as generic headers (separated by `\\ n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header (`accept`) = `*/*`, `Accept-Charset` header (`acceptcharset `) = `UTF-8`, `errorprefix ` = `ERROR: `." )
4851@ In (String .class )
4952@ Out (Reader .class )
5053@ FluxCommand ("open-http" )
5154public final class HttpOpener extends DefaultObjectPipe <String , ObjectReceiver <Reader >> {
5255
53- public static final String ACCEPT_DEFAULT = "*/*" ;
5456 public static final String ACCEPT_HEADER = "accept" ;
57+ public static final String ACCEPT_CHARSET_HEADER = "accept-charset" ;
58+ public static final String ACCEPT_ENCODING_HEADER = "accept-encoding" ;
59+ public static final String CONTENT_ENCODING_HEADER = "content-encoding" ;
5560 public static final String CONTENT_TYPE_HEADER = "content-type" ;
61+
62+ public static final String ACCEPT_DEFAULT = "*/*" ;
63+ public static final String CHARSET_DEFAULT = "UTF-8" ;
5664 public static final String DEFAULT_PREFIX = "ERROR: " ;
57- public static final String ENCODING_DEFAULT = "UTF-8 " ;
58- public static final String ENCODING_HEADER = "accept-charset " ;
65+ public static final String HEADER_FIELD_SEPARATOR = "\n " ;
66+ public static final String HEADER_VALUE_SEPARATOR = ": " ;
5967 public static final String INPUT_DESIGNATOR = "@-" ;
68+ public static final String MIME_PARAMETER_CHARSET = "charset" ;
69+ public static final String MIME_PARAMETER_SEPARATOR = ";" ;
70+ public static final String MIME_PARAMETER_VALUE_SEPARATOR = "=" ;
6071
6172 public static final String DEFAULT_METHOD_NAME = "GET" ;
6273 public static final Method DEFAULT_METHOD = Method .valueOf (DEFAULT_METHOD_NAME );
6374
64- public static final String HEADER_FIELD_SEPARATOR = "\n " ;
65- public static final String HEADER_VALUE_SEPARATOR = ":" ;
66-
6775 private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern .compile (HEADER_FIELD_SEPARATOR );
6876 private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern .compile (HEADER_VALUE_SEPARATOR );
77+ private static final Pattern MIME_PARAMETER_SEPARATOR_PATTERN = Pattern .compile (MIME_PARAMETER_SEPARATOR );
78+
79+ private static final int ALLOWED_REDIRECTIONS = 3 ;
80+ private static final int CONNECTION_TIMEOUT = 11000 ;
6981
7082 private final Map <String , String > headers = new HashMap <>();
7183
@@ -118,7 +130,7 @@ public boolean getResponseHasBody() {
118130 */
119131 public HttpOpener () {
120132 setAccept (ACCEPT_DEFAULT );
121- setEncoding ( ENCODING_DEFAULT );
133+ setAcceptCharset ( CHARSET_DEFAULT );
122134 setErrorPrefix (DEFAULT_PREFIX );
123135 setMethod (DEFAULT_METHOD );
124136 setUrl (INPUT_DESIGNATOR );
@@ -137,43 +149,59 @@ public void setAccept(final String accept) {
137149 }
138150
139151 /**
140- * Sets the HTTP request body. The default value for the request body is
141- * {@value INPUT_DESIGNATOR} <i>if the {@link #setMethod(Method) request
142- * method} accepts a request body</i>, which means it will use the {@link
143- * #process(String) input data} data as request body <i>if the input has
144- * not already been used</i>; otherwise, no request body will be set by
145- * default.
152+ * Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a
153+ * MIME type such as {@code text/plain} or {@code application/json}.
146154 *
147- * <p>If a request body has been set, but the request method does not
148- * accept a body, the method <i>may</i> be changed to {@code POST}.
155+ * @param contentType MIME type to use for the HTTP content-type header
156+ */
157+ public void setContentType (final String contentType ) {
158+ setHeader (CONTENT_TYPE_HEADER , contentType );
159+ }
160+
161+ /**
162+ * Sets the HTTP {@value ACCEPT_CHARSET_HEADER} header value. This is the
163+ * preferred charset for the HTTP response.
164+ * The default charset is {@value CHARSET_DEFAULT}.
149165 *
150- * @param body the request body
166+ * @param charset name of the charset used for the accept-charset HTTP header
151167 */
152- public void setBody (final String body ) {
153- this . body = body ;
168+ public void setAcceptCharset (final String charset ) {
169+ setHeader ( ACCEPT_CHARSET_HEADER , charset ) ;
154170 }
155171
156172 /**
157- * Sets the HTTP {@value CONTENT_TYPE_HEADER} header value. This is a
158- * MIME type such as {@code text/plain} or {@code application/json}.
173+ * @deprecated Use {@link #setAcceptCharset} instead.
174+ * @param charset name of the charset used for the accept-charset HTTP header
175+ */
176+ @ Deprecated
177+ public void setEncoding (final String charset ) {
178+ setAcceptCharset (charset );
179+ }
180+
181+ /**
182+ * Sets the HTTP {@value ACCEPT_ENCODING_HEADER} header value. This is the
183+ * preferred content encoding for the HTTP response. It accepts HTTP compression.
184+ * Allowed values are i.a. "gzip" and "br" (Brotli); note that only "gzip" responses are decompressed when reading the response.
185+ * The default for the content encoding is null, which means "no compression".
159186 *
160- * @param contentType MIME type to use for the HTTP content-type header
187+ * @param acceptEncoding name of content encoding used for the accept-encoding HTTP
188+ * header
161189 */
162- public void setContentType (final String contentType ) {
163- setHeader (CONTENT_TYPE_HEADER , contentType );
190+ public void setAcceptEncoding (final String acceptEncoding ) {
191+ setHeader (ACCEPT_ENCODING_HEADER , acceptEncoding );
164192 }
165193
166194 /**
167- * Sets the HTTP {@value ENCODING_HEADER } header value. This is the
168- * preferred encoding for the HTTP response. Additionally, the encoding
169- * is used for reading the HTTP response if it does not specify a content
170- * encoding. The default for the encoding is {@value ENCODING_DEFAULT} .
195+ * Sets the HTTP {@value CONTENT_ENCODING_HEADER } header value. This is the
196+ * content encoding for the HTTP request. It enables HTTP compression.
197+ * Allowed values are "gzip".
198+ * The default for the content encoding is null, which means "no compression" .
171199 *
172- * @param encoding name of the encoding used for the accept-charset HTTP
200+ * @param contentEncoding name of content encoding used for the content-encoding HTTP
173201 * header
174202 */
175- public void setEncoding (final String encoding ) {
176- setHeader (ENCODING_HEADER , encoding );
203+ public void setContentEncoding (final String contentEncoding ) {
204+ setHeader (CONTENT_ENCODING_HEADER , contentEncoding );
177205 }
178206
179207 /**
@@ -239,28 +267,40 @@ public void setUrl(final String url) {
239267 this .url = url ;
240268 }
241269
270+ /**
271+ * Sets the HTTP request body. The default value for the request body is
272+ * {@value INPUT_DESIGNATOR} <i>if the {@link #setMethod(Method) request
273+ * method} accepts a request body</i>, which means it will use the {@link
274+ * #process(String) input data} data as request body <i>if the input has
275+ * not already been used</i>; otherwise, no request body will be set by
276+ * default.
277+ *
278+ * <p>If a request body has been set, but the request method does not
279+ * accept a body, the method <i>may</i> be changed to {@code POST}.
280+ *
281+ * @param body the request body
282+ */
283+ public void setBody (final String body ) {
284+ this .body = body ;
285+ }
286+
242287 @ Override
243288 public void process (final String input ) {
244289 try {
245290 final String requestUrl = getInput (input , url );
246291 final String requestBody = getInput (input ,
247- body == null && method .getRequestHasBody () ? INPUT_DESIGNATOR : body );
248-
249- final HttpURLConnection connection =
250- (HttpURLConnection ) new URL (requestUrl ).openConnection ();
292+ body == null && method .getRequestHasBody () ? INPUT_DESIGNATOR : body );
251293
252- connection .setRequestMethod (method .name ());
253- headers .forEach (connection ::addRequestProperty );
254-
255- if (requestBody != null ) {
256- connection .setDoOutput (true );
257- connection .getOutputStream ().write (requestBody .getBytes ());
258- }
294+ final URL urlToOpen = new URL (requestUrl );
295+ final HttpURLConnection connection = requestBody != null ?
296+ doOutput (urlToOpen , requestBody ) : doRedirects (urlToOpen );
259297
260298 final InputStream inputStream = getInputStream (connection );
261- final String contentEncoding = getEncoding (connection . getContentEncoding () );
299+ final String charset = getContentCharset (connection );
262300
263- getReceiver ().process (new InputStreamReader (inputStream , contentEncoding ));
301+ getReceiver ().process (new InputStreamReader (
302+ "gzip" .equalsIgnoreCase (connection .getContentEncoding ()) ?
303+ new GZIPInputStream (inputStream ) : inputStream , charset ));
264304 }
265305 catch (final IOException e ) {
266306 throw new MetafactureException (e );
@@ -287,6 +327,46 @@ else if (inputUsed) {
287327 return result ;
288328 }
289329
330+ private HttpURLConnection doOutput (final URL urlToOpen , final String requestBody ) throws IOException {
331+ final HttpURLConnection connection = openConnection (urlToOpen );
332+
333+ connection .setDoOutput (true );
334+ connection .getOutputStream ().write (requestBody .getBytes ());
335+
336+ return connection ;
337+ }
338+
339+ private HttpURLConnection doRedirects (final URL startingUrl ) throws IOException {
340+ URL urlToFollow = startingUrl ;
341+
342+ for (int i = 0 ; i < ALLOWED_REDIRECTIONS ; ++i ) {
343+ final HttpURLConnection connection = openConnection (urlToFollow );
344+ connection .setInstanceFollowRedirects (false ); // Make the logic below easier to detect redirections
345+
346+ switch (connection .getResponseCode ()) {
347+ case HttpURLConnection .HTTP_MOVED_PERM :
348+ case HttpURLConnection .HTTP_MOVED_TEMP :
349+ final String location = URLDecoder .decode (connection .getHeaderField ("Location" ), "UTF-8" );
350+ urlToFollow = new URL (urlToFollow , location ); // Deal with relative URLs
351+ break ;
352+ default :
353+ return connection ;
354+ }
355+ }
356+
357+ throw new IOException ("Too many redirects" );
358+ }
359+
360+ private HttpURLConnection openConnection (final URL urlToOpen ) throws IOException {
361+ final HttpURLConnection connection = (HttpURLConnection ) urlToOpen .openConnection ();
362+
363+ connection .setRequestMethod (method .name ());
364+ connection .setConnectTimeout (CONNECTION_TIMEOUT );
365+ headers .forEach (connection ::setRequestProperty );
366+
367+ return connection ;
368+ }
369+
290370 private InputStream getInputStream (final HttpURLConnection connection ) throws IOException {
291371 try {
292372 return connection .getInputStream ();
@@ -312,8 +392,23 @@ private InputStream getErrorStream(final InputStream errorStream) {
312392 }
313393 }
314394
315- private String getEncoding (final String contentEncoding ) {
316- return contentEncoding != null ? contentEncoding : headers .get (ENCODING_HEADER );
395+ private String getContentCharset (final HttpURLConnection connection ) {
396+ final String contentType = connection .getContentType ();
397+
398+ if (contentType != null ) {
399+ final String [] parts = MIME_PARAMETER_SEPARATOR_PATTERN .split (contentType );
400+
401+ for (int i = 1 ; i < parts .length ; ++i ) {
402+ final String parameter = parts [i ].trim ();
403+ final int index = parameter .indexOf (MIME_PARAMETER_VALUE_SEPARATOR );
404+
405+ if (index != -1 && MIME_PARAMETER_CHARSET .equalsIgnoreCase (parameter .substring (0 , index ))) {
406+ return parameter .substring (index + 1 );
407+ }
408+ }
409+ }
410+
411+ return CHARSET_DEFAULT ;
317412 }
318413
319414}
0 commit comments