diff --git a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java index acb6e552..65d556f5 100644 --- a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java +++ b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java @@ -9,6 +9,7 @@ package io.gdcc.xoai.xml; import io.gdcc.xoai.xmlio.exceptions.XmlWriteException; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -82,11 +83,31 @@ protected void writeXml(XmlWriter writer) throws IOException { * and the application must take care of it. */ - try (xmlInputStream; ) { - // fill the buffer once with the first 1024 chars and read as string - byte[] buffer = xmlInputStream.readNBytes(1024); - String firstChars = new String(buffer, StandardCharsets.UTF_8); + try (xmlInputStream; + // create a buffered stream to allow for rewind + BufferedInputStream bufferedXmlInputStream = + new BufferedInputStream(xmlInputStream)) { + final int n = 1024; + // mark the start of the stream, leave 1 byte headroom for reading ahead + bufferedXmlInputStream.mark(n + 1); + // read the first bytes + byte[] bytes = bufferedXmlInputStream.readNBytes(n); + + // broken UTF-8 multibyte char only possible when >= 1024 chars available + if (bytes.length == n) { + // analyze how far we can read without risk of broken char + final int maxN = maxBytesWithCompleteUTF8Chars(bytes); + // if we detected a (potentially broken) mbchar at the end, re-read. + if (maxN < n) { + // reset the stream to begin + bufferedXmlInputStream.reset(); + // replace bytes by reading again, but only so far as maxN + bytes = bufferedXmlInputStream.readNBytes(maxN); + } + } + + String firstChars = new String(bytes, StandardCharsets.UTF_8); // match the start with the compiled regex and replace with nothing when matching. firstChars = xmlDeclaration.reset(firstChars).replaceFirst(""); @@ -94,7 +115,29 @@ protected void writeXml(XmlWriter writer) throws IOException { writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8)); // now send the rest of the stream - xmlInputStream.transferTo(writer.getOutputStream()); + bufferedXmlInputStream.transferTo(writer.getOutputStream()); } } + + public static int maxBytesWithCompleteUTF8Chars(final byte[] buffer) { + final int n = buffer.length; + + if (Byte.toUnsignedInt(buffer[n - 1]) + > 127) { // 0b0xxxxxxx are all 1-byte chars, so only seek for 0b1xxxxxxx + for (int i = n - 1; + i > n - 5; + i--) { // go max four bytes back (max UTF-8 mbchar length) + if (Byte.toUnsignedInt(buffer[i]) + > 191) { // 0b110xxxxx / 0b1110xxxx / 0b11110xxx are the UTF-8 multibyte + // char start bytes + // -> this is a multibyte start char => use "i" as perfect new length + // (array index starts at 0, so is already -1 for read length) + return i; + } + // -> this is a byte somewhere inside the mbchar, go -1 byte and try again + } + } + + return n; + } } diff --git a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java index 679384fb..96b85da0 100644 --- a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java +++ b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java @@ -1,5 +1,6 @@ package io.gdcc.xoai.xml; +import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.*; import static org.xmlunit.matchers.CompareMatcher.isSimilarTo; @@ -15,11 +16,19 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Stream; import javax.xml.stream.XMLStreamException; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.xmlunit.matchers.EvaluateXPathMatcher; public class CopyElementTest { @@ -199,4 +208,104 @@ void writeLargeStream() throws IOException, XMLStreamException, XmlWriteExceptio // then assertThat("Large stream XML wrapped in ", result, hasXPath("//metadata")); } + + /** + * This test checks that we do not bodge multibyte UTF-8 chars from the input stream. See issue 188 at gdcc/xoai. + */ + @Test + void ensureIssue188Fixed() throws IOException, XMLStreamException { + // given + final Path sourceFile = + Path.of("src", "test", "resources", "188-copy-element-multibyte.xml"); + final String sourceXml = Files.readString(sourceFile, StandardCharsets.UTF_8); + final ByteArrayOutputStream resultStream = new ByteArrayOutputStream(); + + final String description = + "This dataset contains information of experiments carried out upland rice in two" + + " regions of Nicaragua (Caribbean and Pacific Region), as well as a" + + " compilation of soils data from different regions in Nicaragua collected" + + " from 2015 to 2019. The experiments were designed to explore the effects of" + + " micronutrients in the yield of upland rice. The experiments were carried" + + " out on farmer’s field during the 2018 production cycle, the dataset" + + " contains yield and aerial biomass of the experiments."; + + assertThat( + "Description is intact in source", + sourceXml, + EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description)) + .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); + + // when + try (resultStream; + InputStream stream = + new ByteArrayInputStream(sourceXml.getBytes(StandardCharsets.UTF_8)); + XmlWriter writer = new XmlWriter(resultStream)) { + writer.writeStartDocument(); + writer.writeStartElement("metadata"); + writer.write(new CopyElement(stream)); + writer.writeEndElement(); + writer.writeEndDocument(); + } + String result = resultStream.toString(); + + // then + assertThat("Large stream XML wrapped in ", result, hasXPath("//metadata")); + assertThat( + "Has a element", + result, + hasXPath("//dc:description") + .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); + assertThat( + "Description is still intact", + result, + EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description)) + .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); + } + + static Stream multicharString() { + return Stream.of( + Arguments.of(4, "test", 0), // 1-byte ending + // 2-byte ending (complete, but we can't know that!) + Arguments.of(4, "testß", 0), + // 2-byte ending on 1st byte + Arguments.of(4, "testß", 1), + // 3-byte ending on 2nd byte + Arguments.of(4, "test’", 1), + // 3-byte ending on 3rd byte (complete, but we can't know that!) + Arguments.of(4, "test♥", 0), + // 3-byte ending on 2nd byte + Arguments.of(4, "test♥", 1), + // 3-byte ending on 1st byte + Arguments.of(4, "test♥", 2), + // 4-byte ending on 3rd byte + Arguments.of(4, "test\uD83D\uDD2B", 1)); + } + + @MethodSource("multicharString") + @ParameterizedTest + void ensureBufferAnalysisCorrectness(int expectedLength, String text, int bytesToCutOf) + throws UnsupportedEncodingException { + // given + // System.out.println(convertToBinary(text)); + byte[] bytes = text.getBytes(StandardCharsets.UTF_8); + // System.out.println(Arrays.toString(bytes)); + + // when + byte[] sut = Arrays.copyOf(bytes, bytes.length - bytesToCutOf); + int length = CopyElement.maxBytesWithCompleteUTF8Chars(sut); + + // then + assertEquals(expectedLength, length); + } + + static String convertToBinary(String input) { + byte[] bytes = input.getBytes(StandardCharsets.UTF_8); + StringBuilder sb = new StringBuilder(); + for (byte b : bytes) { + sb.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(" ", "0")); + sb.append(" "); + } + return sb.toString(); + } } diff --git a/xoai-common/src/test/resources/188-copy-element-multibyte.xml b/xoai-common/src/test/resources/188-copy-element-multibyte.xml new file mode 100644 index 00000000..a1395101 --- /dev/null +++ b/xoai-common/src/test/resources/188-copy-element-multibyte.xml @@ -0,0 +1,7 @@ +Impact of micronutrients on upland rice yield, Nicaraguahttps://doi.org/10.7910/DVN/IBDWAVSiles, PabloTellez, OrlandoPeng, Yuan-ChingZeledón, YasserHarvard DataverseThis dataset contains information of experiments carried out upland rice in two regions of Nicaragua (Caribbean and Pacific Region), as well as a compilation of soils data from different regions in Nicaragua collected from 2015 to 2019. The experiments were designed to explore the effects of micronutrients in the yield of upland rice. The experiments were carried out on farmer’s field during the 2018 production cycle, the dataset contains yield and aerial biomass of the experiments.Agricultural SciencesEarth and Environmental SciencesUpland riceMicronutrientsCrop yieldZincBoronLatin America and the CaribbeanMultifunctional LandscapesSpanish, Castilian2019-02Garcia, CarolinaExperimental DataEnvironmental Data + +