From b078b4680830a02a7bc71d1313e744001657ee02 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Fri, 15 Sep 2023 17:42:15 -0400 Subject: [PATCH 1/4] A quick and dirty fix for the "split utf8 character" issue in CopyElement. #188 --- .../java/io/gdcc/xoai/xml/CopyElement.java | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java index acb6e552..f4d8b1b0 100644 --- a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java +++ b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java @@ -85,7 +85,29 @@ protected void writeXml(XmlWriter writer) throws IOException { try (xmlInputStream; ) { // fill the buffer once with the first 1024 chars and read as string byte[] buffer = xmlInputStream.readNBytes(1024); - String firstChars = new String(buffer, StandardCharsets.UTF_8); + + // We will take some extra precautions to make sure we are not + // splitting any multi-byte UTF-8 characters in the process! + String firstChars; + int length = -1; + + if (buffer.length < 1024) { + // the entire metadata fragment is shorter than 1024 bytes; + // should be safe to just convert to String: + firstChars = new String(buffer, StandardCharsets.UTF_8); + } else { + length = 1021; + // try converting the entire buffer into a String, *except* for + // the last 3 bytes: + firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8); + while (firstChars.charAt(firstChars.length()-1) == '\uFFFD' && length <= 1024 ) { + // keep reading additional bytes, one at a time, for as long as + // the last character in the resulting String is a '�', aka + // Unicode "no such character" character, '\uFFFD', or in UTF-8 + length++; + firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8); + } + } // match the start with the compiled regex and replace with nothing when matching. firstChars = xmlDeclaration.reset(firstChars).replaceFirst(""); @@ -93,6 +115,11 @@ protected void writeXml(XmlWriter writer) throws IOException { // write the chars to the output stream writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8)); + // if we have any leftover unused bytes in the buffer, write those too: + if (length > -1 && length < 1024) { + writer.getOutputStream().write(buffer, length, 1024 - length); + } + // now send the rest of the stream xmlInputStream.transferTo(writer.getOutputStream()); } From f9132518e050620cf09f9d4a44ae1fe8db0ab389 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Fri, 29 Sep 2023 19:29:47 +0200 Subject: [PATCH 2/4] style(common): apply spotless changes to quickfix commit --- .../java/io/gdcc/xoai/xml/CopyElement.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java index f4d8b1b0..23414ea3 100644 --- a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java +++ b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java @@ -85,24 +85,24 @@ protected void writeXml(XmlWriter writer) throws IOException { try (xmlInputStream; ) { // fill the buffer once with the first 1024 chars and read as string byte[] buffer = xmlInputStream.readNBytes(1024); - + // We will take some extra precautions to make sure we are not // splitting any multi-byte UTF-8 characters in the process! - String firstChars; - int length = -1; - + String firstChars; + int length = -1; + if (buffer.length < 1024) { - // the entire metadata fragment is shorter than 1024 bytes; - // should be safe to just convert to String: + // the entire metadata fragment is shorter than 1024 bytes; + // should be safe to just convert to String: firstChars = new String(buffer, StandardCharsets.UTF_8); } else { - length = 1021; - // try converting the entire buffer into a String, *except* for + length = 1021; + // try converting the entire buffer into a String, *except* for // the last 3 bytes: firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8); - while (firstChars.charAt(firstChars.length()-1) == '\uFFFD' && length <= 1024 ) { - // keep reading additional bytes, one at a time, for as long as - // the last character in the resulting String is a '�', aka + while (firstChars.charAt(firstChars.length() - 1) == '\uFFFD' && length <= 1024) { + // keep reading additional bytes, one at a time, for as long as + // the last character in the resulting String is a '�', aka // Unicode "no such character" character, '\uFFFD', or in UTF-8 length++; firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8); @@ -115,11 +115,11 @@ protected void writeXml(XmlWriter writer) throws IOException { // write the chars to the output stream writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8)); - // if we have any leftover unused bytes in the buffer, write those too: + // if we have any leftover unused bytes in the buffer, write those too: if (length > -1 && length < 1024) { writer.getOutputStream().write(buffer, length, 1024 - length); } - + // now send the rest of the stream xmlInputStream.transferTo(writer.getOutputStream()); } From 5057b23f382b74881ba695dc889bf21d7dfcfbd5 Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Fri, 29 Sep 2023 19:30:08 +0200 Subject: [PATCH 3/4] test(common): add reproducer test for issue #188 This reproducer test will ensure the output is not bodged by broken UTF-8 multibyte chars again. --- .../io/gdcc/xoai/xml/CopyElementTest.java | 57 +++++++++++++++++++ .../resources/188-copy-element-multibyte.xml | 7 +++ 2 files changed, 64 insertions(+) create mode 100644 xoai-common/src/test/resources/188-copy-element-multibyte.xml diff --git a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java index 679384fb..57912af5 100644 --- a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java +++ b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java @@ -1,5 +1,6 @@ package io.gdcc.xoai.xml; +import static org.hamcrest.CoreMatchers.equalTo; import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.*; import static org.xmlunit.matchers.CompareMatcher.isSimilarTo; @@ -18,8 +19,10 @@ import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Map; import javax.xml.stream.XMLStreamException; import org.junit.jupiter.api.Test; +import org.xmlunit.matchers.EvaluateXPathMatcher; public class CopyElementTest { @@ -199,4 +202,58 @@ void writeLargeStream() throws IOException, XMLStreamException, XmlWriteExceptio // then assertThat("Large stream XML wrapped in ", result, hasXPath("//metadata")); } + + /** + * This test checks that we do not bodge multibyte UTF-8 chars from the input stream. See issue 188 at gdcc/xoai. + */ + @Test + void ensureIssue188Fixed() throws IOException, XMLStreamException { + // given + final Path sourceFile = + Path.of("src", "test", "resources", "188-copy-element-multibyte.xml"); + final String sourceXml = Files.readString(sourceFile, StandardCharsets.UTF_8); + final ByteArrayOutputStream resultStream = new ByteArrayOutputStream(); + + final String description = + "This dataset contains information of experiments carried out upland rice in two" + + " regions of Nicaragua (Caribbean and Pacific Region), as well as a" + + " compilation of soils data from different regions in Nicaragua collected" + + " from 2015 to 2019. The experiments were designed to explore the effects of" + + " micronutrients in the yield of upland rice. The experiments were carried" + + " out on farmer’s field during the 2018 production cycle, the dataset" + + " contains yield and aerial biomass of the experiments."; + + assertThat( + "Description is intact in source", + sourceXml, + EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description)) + .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); + + // when + try (resultStream; + InputStream stream = + new ByteArrayInputStream(sourceXml.getBytes(StandardCharsets.UTF_8)); + XmlWriter writer = new XmlWriter(resultStream)) { + writer.writeStartDocument(); + writer.writeStartElement("metadata"); + writer.write(new CopyElement(stream)); + writer.writeEndElement(); + writer.writeEndDocument(); + } + String result = resultStream.toString(); + + // then + assertThat("Large stream XML wrapped in ", result, hasXPath("//metadata")); + assertThat( + "Has a element", + result, + hasXPath("//dc:description") + .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); + assertThat( + "Description is still intact", + result, + EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description)) + .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); + } } diff --git a/xoai-common/src/test/resources/188-copy-element-multibyte.xml b/xoai-common/src/test/resources/188-copy-element-multibyte.xml new file mode 100644 index 00000000..a1395101 --- /dev/null +++ b/xoai-common/src/test/resources/188-copy-element-multibyte.xml @@ -0,0 +1,7 @@ +Impact of micronutrients on upland rice yield, Nicaraguahttps://doi.org/10.7910/DVN/IBDWAVSiles, PabloTellez, OrlandoPeng, Yuan-ChingZeledón, YasserHarvard DataverseThis dataset contains information of experiments carried out upland rice in two regions of Nicaragua (Caribbean and Pacific Region), as well as a compilation of soils data from different regions in Nicaragua collected from 2015 to 2019. The experiments were designed to explore the effects of micronutrients in the yield of upland rice. The experiments were carried out on farmer’s field during the 2018 production cycle, the dataset contains yield and aerial biomass of the experiments.Agricultural SciencesEarth and Environmental SciencesUpland riceMicronutrientsCrop yieldZincBoronLatin America and the CaribbeanMultifunctional LandscapesSpanish, Castilian2019-02Garcia, CarolinaExperimental DataEnvironmental Data + + From 53f8222645671dcfd41717624b2bc9dcdbdd0fef Mon Sep 17 00:00:00 2001 From: Oliver Bertuch Date: Sat, 30 Sep 2023 09:43:39 +0200 Subject: [PATCH 4/4] fix(common): apply more lowlevel fix for broken UTF-8 chars #188 With this commit, we introduce an analysis routine that will go over the last few bytes when reading with CopyElement. This is faster than converting to String and checking for the UTF-8 unknown char sign over and over again. By using a buffered input stream, we can rewind the stream if necessary and read again up to the point that we don't have a broken char. Extensive testing was added to make sure the analysis function works with any length of multibyte UTF-8 chars --- .../java/io/gdcc/xoai/xml/CopyElement.java | 76 +++++++++++-------- .../io/gdcc/xoai/xml/CopyElementTest.java | 52 +++++++++++++ 2 files changed, 98 insertions(+), 30 deletions(-) diff --git a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java index 23414ea3..65d556f5 100644 --- a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java +++ b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java @@ -9,6 +9,7 @@ package io.gdcc.xoai.xml; import io.gdcc.xoai.xmlio.exceptions.XmlWriteException; +import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; @@ -82,46 +83,61 @@ protected void writeXml(XmlWriter writer) throws IOException { * and the application must take care of it. */ - try (xmlInputStream; ) { - // fill the buffer once with the first 1024 chars and read as string - byte[] buffer = xmlInputStream.readNBytes(1024); - - // We will take some extra precautions to make sure we are not - // splitting any multi-byte UTF-8 characters in the process! - String firstChars; - int length = -1; - - if (buffer.length < 1024) { - // the entire metadata fragment is shorter than 1024 bytes; - // should be safe to just convert to String: - firstChars = new String(buffer, StandardCharsets.UTF_8); - } else { - length = 1021; - // try converting the entire buffer into a String, *except* for - // the last 3 bytes: - firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8); - while (firstChars.charAt(firstChars.length() - 1) == '\uFFFD' && length <= 1024) { - // keep reading additional bytes, one at a time, for as long as - // the last character in the resulting String is a '�', aka - // Unicode "no such character" character, '\uFFFD', or in UTF-8 - length++; - firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8); + try (xmlInputStream; + // create a buffered stream to allow for rewind + BufferedInputStream bufferedXmlInputStream = + new BufferedInputStream(xmlInputStream)) { + + final int n = 1024; + // mark the start of the stream, leave 1 byte headroom for reading ahead + bufferedXmlInputStream.mark(n + 1); + // read the first bytes + byte[] bytes = bufferedXmlInputStream.readNBytes(n); + + // broken UTF-8 multibyte char only possible when >= 1024 chars available + if (bytes.length == n) { + // analyze how far we can read without risk of broken char + final int maxN = maxBytesWithCompleteUTF8Chars(bytes); + // if we detected a (potentially broken) mbchar at the end, re-read. + if (maxN < n) { + // reset the stream to begin + bufferedXmlInputStream.reset(); + // replace bytes by reading again, but only so far as maxN + bytes = bufferedXmlInputStream.readNBytes(maxN); } } + String firstChars = new String(bytes, StandardCharsets.UTF_8); // match the start with the compiled regex and replace with nothing when matching. firstChars = xmlDeclaration.reset(firstChars).replaceFirst(""); // write the chars to the output stream writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8)); - // if we have any leftover unused bytes in the buffer, write those too: - if (length > -1 && length < 1024) { - writer.getOutputStream().write(buffer, length, 1024 - length); - } - // now send the rest of the stream - xmlInputStream.transferTo(writer.getOutputStream()); + bufferedXmlInputStream.transferTo(writer.getOutputStream()); + } + } + + public static int maxBytesWithCompleteUTF8Chars(final byte[] buffer) { + final int n = buffer.length; + + if (Byte.toUnsignedInt(buffer[n - 1]) + > 127) { // 0b0xxxxxxx are all 1-byte chars, so only seek for 0b1xxxxxxx + for (int i = n - 1; + i > n - 5; + i--) { // go max four bytes back (max UTF-8 mbchar length) + if (Byte.toUnsignedInt(buffer[i]) + > 191) { // 0b110xxxxx / 0b1110xxxx / 0b11110xxx are the UTF-8 multibyte + // char start bytes + // -> this is a multibyte start char => use "i" as perfect new length + // (array index starts at 0, so is already -1 for read length) + return i; + } + // -> this is a byte somewhere inside the mbchar, go -1 byte and try again + } } + + return n; } } diff --git a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java index 57912af5..96b85da0 100644 --- a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java +++ b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java @@ -16,12 +16,18 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Arrays; import java.util.Map; +import java.util.stream.Stream; import javax.xml.stream.XMLStreamException; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import org.xmlunit.matchers.EvaluateXPathMatcher; public class CopyElementTest { @@ -256,4 +262,50 @@ void ensureIssue188Fixed() throws IOException, XMLStreamException { EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description)) .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/"))); } + + static Stream multicharString() { + return Stream.of( + Arguments.of(4, "test", 0), // 1-byte ending + // 2-byte ending (complete, but we can't know that!) + Arguments.of(4, "testß", 0), + // 2-byte ending on 1st byte + Arguments.of(4, "testß", 1), + // 3-byte ending on 2nd byte + Arguments.of(4, "test’", 1), + // 3-byte ending on 3rd byte (complete, but we can't know that!) + Arguments.of(4, "test♥", 0), + // 3-byte ending on 2nd byte + Arguments.of(4, "test♥", 1), + // 3-byte ending on 1st byte + Arguments.of(4, "test♥", 2), + // 4-byte ending on 3rd byte + Arguments.of(4, "test\uD83D\uDD2B", 1)); + } + + @MethodSource("multicharString") + @ParameterizedTest + void ensureBufferAnalysisCorrectness(int expectedLength, String text, int bytesToCutOf) + throws UnsupportedEncodingException { + // given + // System.out.println(convertToBinary(text)); + byte[] bytes = text.getBytes(StandardCharsets.UTF_8); + // System.out.println(Arrays.toString(bytes)); + + // when + byte[] sut = Arrays.copyOf(bytes, bytes.length - bytesToCutOf); + int length = CopyElement.maxBytesWithCompleteUTF8Chars(sut); + + // then + assertEquals(expectedLength, length); + } + + static String convertToBinary(String input) { + byte[] bytes = input.getBytes(StandardCharsets.UTF_8); + StringBuilder sb = new StringBuilder(); + for (byte b : bytes) { + sb.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(" ", "0")); + sb.append(" "); + } + return sb.toString(); + } }