Skip to content

Commit

Permalink
Opens gzip compressed content (#511)
Browse files Browse the repository at this point in the history
- follows redirects
- fixes misconception of "Content-Encoding"
  • Loading branch information
dr0i committed Dec 18, 2023
1 parent 52e4141 commit 6dd2889
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 62 deletions.
147 changes: 114 additions & 33 deletions metafacture-io/src/main/java/org/metafacture/io/HttpOpener.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2013, 2022 Deutsche Nationalbibliothek et al
* Copyright 2013, 2023 Deutsche Nationalbibliothek et al
*
* Licensed under the Apache License, Version 2.0 the "License";
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,19 +32,22 @@
import java.io.SequenceInputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
* Opens an {@link HttpURLConnection} and passes a reader to the receiver.
*
* @author Christoph Böhme
* @author Jan Schnasse
* @author Jens Wille
* @author Pascal Christoph (dr0i)
*/
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
@Description("Opens an HTTP resource. Supports setting HTTP header fields `Accept`, `Accept-Charset`, `Accept-Encoding` and `Content-Type`, as well as generic headers (separated by `\\n`). Defaults: request `method` = `GET`, request `url` = `@-` (input data), request `body` = `@-` (input data) if request method supports body and input data not already used, `Accept` header = `*/*`, `Accept-Charset` header (`encoding`) = `UTF-8`, `errorPrefix` = `ERROR: `.")
@In(String.class)
@Out(Reader.class)
@FluxCommand("open-http")
Expand All @@ -53,22 +56,21 @@ public final class HttpOpener extends DefaultObjectPipe<String, ObjectReceiver<R
public static final String ACCEPT_DEFAULT = "*/*";
public static final String ACCEPT_HEADER = "accept";
public static final String CONTENT_TYPE_HEADER = "content-type";
public static final String ACCEPT_ENCODING_HEADER = "accept-encoding";
public static final String ENCODING_HEADER = "content-encoding";
public static final String DEFAULT_PREFIX = "ERROR: ";
public static final String ENCODING_DEFAULT = "UTF-8";
public static final String ENCODING_HEADER = "accept-charset";
public static final String CHARSET_DEFAULT = "UTF-8";
public static final String ACCEPT_CHARSET_HEADER = "accept-charset";
public static final String INPUT_DESIGNATOR = "@-";

public static final String DEFAULT_METHOD_NAME = "GET";
public static final Method DEFAULT_METHOD = Method.valueOf(DEFAULT_METHOD_NAME);

public static final String HEADER_FIELD_SEPARATOR = "\n";
public static final String HEADER_VALUE_SEPARATOR = ":";

private static final Pattern HEADER_FIELD_SEPARATOR_PATTERN = Pattern.compile(HEADER_FIELD_SEPARATOR);
private static final Pattern HEADER_VALUE_SEPARATOR_PATTERN = Pattern.compile(HEADER_VALUE_SEPARATOR);

private static final int ALLOWED_REDIRECTIONS = 3;
private static final int CONNECTION_TIMEOUT = 11000;
private final Map<String, String> headers = new HashMap<>();

private Method method;
private String body;
private String errorPrefix;
Expand Down Expand Up @@ -118,7 +120,7 @@ public boolean getResponseHasBody() {
*/
public HttpOpener() {
setAccept(ACCEPT_DEFAULT);
setEncoding(ENCODING_DEFAULT);
setAcceptCharset(CHARSET_DEFAULT);
setErrorPrefix(DEFAULT_PREFIX);
setMethod(DEFAULT_METHOD);
setUrl(INPUT_DESIGNATOR);
Expand Down Expand Up @@ -163,17 +165,50 @@ public void setContentType(final String contentType) {
setHeader(CONTENT_TYPE_HEADER, contentType);
}

/**
 * Sets the value of the HTTP {@value #ACCEPT_CHARSET_HEADER} request header,
 * i.e. the charset the client prefers for the HTTP response. This class also
 * passes the stored value as the charset name when it wraps a response
 * stream in a reader.
 * The default charset is {@value #CHARSET_DEFAULT}.
 *
 * @param charset name of the charset used for the accept-charset HTTP header
 */
public void setAcceptCharset(final String charset) {
setHeader(ACCEPT_CHARSET_HEADER, charset);
}

/**
 * Sets the HTTP accept-charset header value. This is a legacy alias that
 * simply delegates to {@link #setAcceptCharset}.
 *
 * @param charset name of the charset used for the accept-charset HTTP header
 * @deprecated Use {@link #setAcceptCharset} instead.
 */
@Deprecated
public void setEncoding(final String charset) {
setAcceptCharset(charset);
}

/**
 * Sets the HTTP {@value #ACCEPT_ENCODING_HEADER} header value. This is the
 * content encoding (HTTP compression) the client is willing to accept for the
 * HTTP response, e.g. "gzip".
 * Note that this class only decompresses responses whose content encoding is
 * "gzip", and only for requests sent without a request body; a response in any
 * other encoding is read without being decoded.
 * The documented default for the content encoding is null, which means
 * "no compression".
 *
 * @param contentEncoding name of content encoding used for the accept-encoding HTTP
 * header
 */
public void setAcceptContentEncoding(final String contentEncoding) {
setHeader(ACCEPT_ENCODING_HEADER, contentEncoding);
}

/**
* Sets the HTTP {@value ENCODING_HEADER} header value. This is the
* preferred encoding for the HTTP response. Additionally, the encoding
* is used for reading the HTTP response if it does not specify a content
* encoding. The default for the encoding is {@value ENCODING_DEFAULT}.
* content encoding declared for the body of the HTTP request. Setting it does
* not make this class compress the body; the body is written exactly as given.
* A typical value is "gzip".
* The default for the content encoding is null, which means "no compression".
*
* @param encoding name of the encoding used for the accept-charset HTTP
* @param contentEncoding name of content encoding used for the content-encoding HTTP
* header
*/
public void setEncoding(final String encoding) {
setHeader(ENCODING_HEADER, encoding);
public void setContentEncoding(final String contentEncoding) {
setHeader(ENCODING_HEADER, contentEncoding);
}

/**
Expand Down Expand Up @@ -244,23 +279,15 @@ public void process(final String input) {
try {
final String requestUrl = getInput(input, url);
final String requestBody = getInput(input,
body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);

final HttpURLConnection connection =
(HttpURLConnection) new URL(requestUrl).openConnection();

connection.setRequestMethod(method.name());
headers.forEach(connection::addRequestProperty);

body == null && method.getRequestHasBody() ? INPUT_DESIGNATOR : body);
Reader reader = null;
if (requestBody != null) {
connection.setDoOutput(true);
connection.getOutputStream().write(requestBody.getBytes());
reader = doPostOrPut(requestBody, new URL(requestUrl));
}

final InputStream inputStream = getInputStream(connection);
final String contentEncoding = getEncoding(connection.getContentEncoding());

getReceiver().process(new InputStreamReader(inputStream, contentEncoding));
else {
reader = doGet(requestUrl);
}
getReceiver().process(reader);
}
catch (final IOException e) {
throw new MetafactureException(e);
Expand All @@ -270,6 +297,32 @@ public void process(final String input) {
}
}

/**
 * Sends {@code requestBody} to {@code urlToOpen} with the configured request
 * method and headers, and returns a reader on the response.
 *
 * @param requestBody the text written to the connection's output stream
 * @param urlToOpen   the URL to connect to
 * @return a reader on the response stream, decoded with the charset name
 *         stored under the accept-charset header
 * @throws IOException if the connection cannot be opened, written to or read
 */
private Reader doPostOrPut(final String requestBody, final URL urlToOpen) throws IOException {
    final HttpURLConnection conn = (HttpURLConnection) urlToOpen.openConnection();
    conn.setDoOutput(true);
    conn.setRequestMethod(method.name());
    for (final Map.Entry<String, String> header : headers.entrySet()) {
        conn.setRequestProperty(header.getKey(), header.getValue());
    }

    // The body is encoded with the platform default charset, as before.
    final byte[] payload = requestBody.getBytes();
    conn.getOutputStream().write(payload);

    final InputStream responseStream = getInputStream(conn);
    final String charsetName = headers.get(ACCEPT_CHARSET_HEADER);
    return new InputStreamReader(responseStream, charsetName);
}

/**
 * Requests {@code requestUrl} without a request body, following redirects,
 * and returns a reader on the response.
 *
 * If the response declares a "gzip" content encoding, the stream is
 * decompressed first. In both cases the bytes are decoded with the charset
 * name stored under the accept-charset header, so a compressed response is
 * no longer decoded with the platform default charset.
 *
 * @param requestUrl the URL to request
 * @return a reader on the (possibly decompressed) response stream
 * @throws IOException if the URL is malformed, the connection fails or the
 *                     gzip stream cannot be opened
 */
private Reader doGet(final String requestUrl) throws IOException {
    final HttpURLConnection connection = followRedirects(new URL(requestUrl));
    final InputStream responseStream = getInputStream(connection);

    final boolean gzipped = "gzip".equalsIgnoreCase(connection.getContentEncoding());
    final InputStream decodedStream = gzipped ? new GZIPInputStream(responseStream) : responseStream;

    // Content-Encoding describes compression only; the charset applies to
    // the decompressed bytes just as it does to an uncompressed response.
    return new InputStreamReader(decodedStream, headers.get(ACCEPT_CHARSET_HEADER));
}

private String getInput(final String input, final String value) {
final String result;

Expand Down Expand Up @@ -312,8 +365,36 @@ private InputStream getErrorStream(final InputStream errorStream) {
}
}

private String getEncoding(final String contentEncoding) {
return contentEncoding != null ? contentEncoding : headers.get(ENCODING_HEADER);
/**
 * Opens a connection to {@code startingUrl} and follows HTTP 301/302
 * redirects manually, applying the configured headers, request method and
 * connect timeout to every hop.
 *
 * At most {@code ALLOWED_REDIRECTIONS} connections are opened in total,
 * the initial request included.
 *
 * @param startingUrl the URL of the first request
 * @return the connection whose response is not a 301/302 redirect
 * @throws IOException if a connection fails, a redirect response carries no
 *                     Location header, or the attempt limit is exhausted
 */
private HttpURLConnection followRedirects(final URL startingUrl) throws IOException {
    URL urlToFollow = startingUrl;

    for (int attempt = 1; attempt <= ALLOWED_REDIRECTIONS; ++attempt) {
        final HttpURLConnection conn = (HttpURLConnection) urlToFollow.openConnection();
        headers.forEach(conn::setRequestProperty);
        conn.setRequestMethod(method.name());
        conn.setConnectTimeout(CONNECTION_TIMEOUT);
        conn.setInstanceFollowRedirects(false); // Redirects are handled by this loop instead

        final int status = conn.getResponseCode();
        final boolean redirected = status == HttpURLConnection.HTTP_MOVED_PERM ||
            status == HttpURLConnection.HTTP_MOVED_TEMP;
        if (!redirected) {
            return conn;
        }

        final String location = conn.getHeaderField("Location");
        if (location == null) {
            // Without this check the decode call below fails with a NullPointerException.
            throw new IOException("Redirect response (HTTP " + status + ") from " + urlToFollow +
                    " has no Location header");
        }

        // NOTE(review): decoding the Location value turns escapes such as %20 into
        // literal characters before the next request; confirm this is intended.
        final String decodedLocation = URLDecoder.decode(location, "UTF-8");
        urlToFollow = new URL(urlToFollow, decodedLocation); // Deal with relative URLs
    }

    throw new IOException("Stuck in redirect loop");
}

}
61 changes: 32 additions & 29 deletions metafacture-io/src/test/java/org/metafacture/io/HttpOpenerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,12 @@

package org.metafacture.io;

import org.metafacture.commons.ResourceUtil;
import org.metafacture.framework.ObjectReceiver;

import com.github.tomakehurst.wiremock.client.MappingBuilder;
import com.github.tomakehurst.wiremock.client.ResponseDefinitionBuilder;
import com.github.tomakehurst.wiremock.client.WireMock;
import com.github.tomakehurst.wiremock.core.WireMockConfiguration;
import com.github.tomakehurst.wiremock.http.HttpHeader;
import com.github.tomakehurst.wiremock.http.HttpHeaders;
import com.github.tomakehurst.wiremock.http.RequestMethod;
import com.github.tomakehurst.wiremock.junit.WireMockRule;
import com.github.tomakehurst.wiremock.matching.RequestPatternBuilder;
Expand All @@ -32,20 +31,22 @@
import org.junit.ComparisonFailure;
import org.junit.Rule;
import org.junit.Test;
import org.metafacture.commons.ResourceUtil;
import org.metafacture.framework.ObjectReceiver;
import org.mockito.ArgumentCaptor;
import org.mockito.Captor;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.MockitoJUnit;
import org.mockito.junit.MockitoRule;

import static org.mockito.Mockito.times;

import java.io.IOException;
import java.io.Reader;
import java.io.*;
import java.util.Arrays;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.zip.GZIPOutputStream;

import static org.mockito.Mockito.times;

/**
* Tests for class {@link HttpOpener}.
Expand All @@ -62,6 +63,18 @@ public final class HttpOpenerTest {

private static final String REQUEST_BODY = "request body";
private static final String RESPONSE_BODY = "response bödy"; // UTF-8
private static byte[] GZIPPED_RESPONSE_BODY;
static {
try {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(RESPONSE_BODY.getBytes("UTF-8"));
gzip.close();
GZIPPED_RESPONSE_BODY = out.toByteArray();
}catch (Exception e){
e.printStackTrace();
}
}

@Rule
public MockitoRule mockitoRule = MockitoJUnit.rule();
Expand Down Expand Up @@ -226,40 +239,23 @@ public void shouldPerformPostRequestWithContentTypeParameter() throws IOExceptio
}

@Test
public void shouldPerformPostRequestWithEncodingParameter() throws IOException {
final String encoding = "ISO-8859-1";
public void shouldPerformPostRequestWithCharsetParameter() throws IOException {
final String charset = "ISO-8859-1";
final String header = "Accept-Charset";
final StringValuePattern value = WireMock.equalTo(encoding);
final StringValuePattern value = WireMock.equalTo(charset);

try {
shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
o.setMethod(HttpOpener.Method.POST);
o.setUrl(u);
o.setEncoding(encoding);
o.setAcceptCharset(charset);
}, s -> s.withHeader(header, value), q -> q.withHeader(header, value), null);
}
catch (final ComparisonFailure e) {
Assert.assertEquals("expected:<response b[ö]dy> but was:<response b[ö]dy>", e.getMessage());
}
}

/**
 * Performs a POST request with the accept-charset set to ISO-8859-1 (via the
 * legacy {@code setEncoding} setter) while the stubbed response carries a
 * "Content-Encoding: UTF-8" header. Both the stub and the request
 * verification are constrained to the "Accept-Charset" header value.
 */
@Test
public void shouldPerformPostRequestWithEncodingParameterAndContentEncodingResponseHeader() throws IOException {
final String encoding = "ISO-8859-1";
final String header = "Accept-Charset";
final StringValuePattern value = WireMock.equalTo(encoding);

shouldPerformRequest(REQUEST_BODY, HttpOpener.Method.POST, (o, u) -> {
o.setMethod(HttpOpener.Method.POST);
o.setUrl(u);
o.setEncoding(encoding);
},
s -> s.withHeader(header, value),
q -> q.withHeader(header, value),
r -> r.withHeader("Content-Encoding", "UTF-8")
);
}

@Test
public void shouldPerformGetRequestWithErrorResponse() throws IOException {
shouldPerformRequest(TEST_URL, HttpOpener.Method.GET, (o, u) -> {},
Expand All @@ -278,6 +274,14 @@ public void shouldPerformGetRequestWithErrorResponseAndWithoutErrorPrefixParamet
null, null, WireMock.badRequest().withBody(RESPONSE_BODY), RESPONSE_BODY);
}

/**
 * Performs a GET request that asks for gzip via the accept-encoding header
 * against a stub answering with the gzip-compressed body and a matching
 * content-encoding header; the plain {@code RESPONSE_BODY} is passed as the
 * expected result.
 */
@Test
public void shouldPerformGetRequestWithGzipedContentEncoding() throws IOException {
    final HttpHeader gzipEncoding = new HttpHeader(HttpOpener.ENCODING_HEADER, "gzip");
    final ResponseDefinitionBuilder gzippedResponse = WireMock.ok()
        .withBody(GZIPPED_RESPONSE_BODY)
        .withHeaders(new HttpHeaders(gzipEncoding));

    shouldPerformRequest(TEST_URL, HttpOpener.Method.GET,
            (o, u) -> o.setAcceptContentEncoding("gzip"),
            null, null,
            gzippedResponse,
            RESPONSE_BODY);
}

private void shouldPerformRequest(final String input, final HttpOpener.Method method, final BiConsumer<HttpOpener, String> consumer, final String... headers) throws IOException {
shouldPerformRequest(input, method, consumer,
s -> Arrays.stream(headers).forEach(h -> s.withHeader(h, TEST_VALUE)),
Expand All @@ -289,7 +293,6 @@ private void shouldPerformRequest(final String input, final HttpOpener.Method me
if (responseConsumer != null) {
responseConsumer.accept(response);
}

shouldPerformRequest(input, method,
consumer, stubConsumer, requestConsumer,
response, method.getResponseHasBody() ? RESPONSE_BODY : "");
Expand Down

0 comments on commit 6dd2889

Please sign in to comment.