fix(common): apply more lowlevel fix for broken UTF-8 chars #188

With this commit, we introduce an analysis routine that will go over the last few bytes when reading with CopyElement. This is faster than converting to String and checking for the UTF-8 unknown char sign over and over again. By using a buffered input stream, we can rewind the stream if necessary and read again up to the point that we don't have a broken char. Extensive testing was added to make sure the analysis function works with any length of multibyte UTF-8 chars
gdcc · Sep 30, 2023 · 53f8222 · 53f8222
1 parent 5057b23
commit 53f8222
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 30 deletions.
diff --git a/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java b/xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java
@@ -9,6 +9,7 @@
 package io.gdcc.xoai.xml;
 
 import io.gdcc.xoai.xmlio.exceptions.XmlWriteException;
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
@@ -82,46 +83,61 @@ protected void writeXml(XmlWriter writer) throws IOException {
          * and the application must take care of it.
          */
 
-        try (xmlInputStream; ) {
-            // fill the buffer once with the first 1024 chars and read as string
-            byte[] buffer = xmlInputStream.readNBytes(1024);
-
-            // We will take some extra precautions to make sure we are not
-            // splitting any multi-byte UTF-8 characters in the process!
-            String firstChars;
-            int length = -1;
-
-            if (buffer.length < 1024) {
-                // the entire metadata fragment is shorter than 1024 bytes;
-                // should be safe to just convert to String:
-                firstChars = new String(buffer, StandardCharsets.UTF_8);
-            } else {
-                length = 1021;
-                // try converting the entire buffer into a String, *except* for
-                // the last 3 bytes:
-                firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8);
-                while (firstChars.charAt(firstChars.length() - 1) == '\uFFFD' && length <= 1024) {
-                    // keep reading additional bytes, one at a time, for as long as
-                    // the last character in the resulting String is a '�', aka
-                    // Unicode "no such character" character, '\uFFFD', or <EF><BF><BD> in UTF-8
-                    length++;
-                    firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8);
+        try (xmlInputStream;
+                // create a buffered stream to allow for rewind
+                BufferedInputStream bufferedXmlInputStream =
+                        new BufferedInputStream(xmlInputStream)) {
+
+            final int n = 1024;
+            // mark the start of the stream, leave 1 byte headroom for reading ahead
+            bufferedXmlInputStream.mark(n + 1);
+            // read the first bytes
+            byte[] bytes = bufferedXmlInputStream.readNBytes(n);
+
+            // broken UTF-8 multibyte char only possible when >= 1024 chars available
+            if (bytes.length == n) {
+                // analyze how far we can read without risk of broken char
+                final int maxN = maxBytesWithCompleteUTF8Chars(bytes);
+                // if we detected a (potentially broken) mbchar at the end, re-read.
+                if (maxN < n) {
+                    // reset the stream to begin
+                    bufferedXmlInputStream.reset();
+                    // replace bytes by reading again, but only so far as maxN
+                    bytes = bufferedXmlInputStream.readNBytes(maxN);
                 }
             }
 
+            String firstChars = new String(bytes, StandardCharsets.UTF_8);
             // match the start with the compiled regex and replace with nothing when matching.
             firstChars = xmlDeclaration.reset(firstChars).replaceFirst("");
 
             // write the chars to the output stream
             writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8));
 
-            // if we have any leftover unused bytes in the buffer, write those too:
-            if (length > -1 && length < 1024) {
-                writer.getOutputStream().write(buffer, length, 1024 - length);
-            }
-
             // now send the rest of the stream
-            xmlInputStream.transferTo(writer.getOutputStream());
+            bufferedXmlInputStream.transferTo(writer.getOutputStream());
+        }
+    }
+
+    public static int maxBytesWithCompleteUTF8Chars(final byte[] buffer) {
+        final int n = buffer.length;
+
+        if (Byte.toUnsignedInt(buffer[n - 1])
+                > 127) { // 0b0xxxxxxx are all 1-byte chars, so only seek for 0b1xxxxxxx
+            for (int i = n - 1;
+                    i > n - 5;
+                    i--) { // go max four bytes back (max UTF-8 mbchar length)
+                if (Byte.toUnsignedInt(buffer[i])
+                        > 191) { // 0b110xxxxx / 0b1110xxxx / 0b11110xxx are the UTF-8 multibyte
+                    // char start bytes
+                    // -> this is a multibyte start char => use "i" as perfect new length
+                    //    (array index starts at 0, so is already -1 for read length)
+                    return i;
+                }
+                // -> this is a byte somewhere inside the mbchar, go -1 byte and try again
+            }
         }
+
+        return n;
     }
 }
diff --git a/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java b/xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java
@@ -16,12 +16,18 @@
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.Arrays;
 import java.util.Map;
+import java.util.stream.Stream;
 import javax.xml.stream.XMLStreamException;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 import org.xmlunit.matchers.EvaluateXPathMatcher;
 
 public class CopyElementTest {
@@ -256,4 +262,50 @@ void ensureIssue188Fixed() throws IOException, XMLStreamException {
                 EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description))
                         .withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/")));
     }
+
+    static Stream<Arguments> multicharString() {
+        return Stream.of(
+                Arguments.of(4, "test", 0), // 1-byte ending
+                // 2-byte ending (complete, but we can't know that!)
+                Arguments.of(4, "testß", 0),
+                // 2-byte ending on 1st byte
+                Arguments.of(4, "testß", 1),
+                // 3-byte ending on 2nd byte
+                Arguments.of(4, "test’", 1),
+                // 3-byte ending on 3rd byte (complete, but we can't know that!)
+                Arguments.of(4, "test♥", 0),
+                // 3-byte ending on 2nd byte
+                Arguments.of(4, "test♥", 1),
+                // 3-byte ending on 1st byte
+                Arguments.of(4, "test♥", 2),
+                // 4-byte ending on 3rd byte
+                Arguments.of(4, "test\uD83D\uDD2B", 1));
+    }
+
+    @MethodSource("multicharString")
+    @ParameterizedTest
+    void ensureBufferAnalysisCorrectness(int expectedLength, String text, int bytesToCutOf)
+            throws UnsupportedEncodingException {
+        // given
+        // System.out.println(convertToBinary(text));
+        byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
+        // System.out.println(Arrays.toString(bytes));
+
+        // when
+        byte[] sut = Arrays.copyOf(bytes, bytes.length - bytesToCutOf);
+        int length = CopyElement.maxBytesWithCompleteUTF8Chars(sut);
+
+        // then
+        assertEquals(expectedLength, length);
+    }
+
+    static String convertToBinary(String input) {
+        byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
+        StringBuilder sb = new StringBuilder();
+        for (byte b : bytes) {
+            sb.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(" ", "0"));
+            sb.append(" ");
+        }
+        return sb.toString();
+    }
 }