Skip to content

Commit

Permalink
fix(common): apply more lowlevel fix for broken UTF-8 chars #188
Browse files Browse the repository at this point in the history
With this commit, we introduce an analysis routine that will go over
the last few bytes when reading with CopyElement. This is faster
than converting to String and checking for the UTF-8 unknown char sign
over and over again.

By using a buffered input stream, we can rewind the stream if necessary
and read again up to the point that we don't have a broken char.

Extensive testing was added to make sure the analysis function works
with any length of multibyte UTF-8 chars
  • Loading branch information
poikilotherm committed Sep 30, 2023
1 parent 5057b23 commit 53f8222
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 30 deletions.
76 changes: 46 additions & 30 deletions xoai-common/src/main/java/io/gdcc/xoai/xml/CopyElement.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package io.gdcc.xoai.xml;

import io.gdcc.xoai.xmlio.exceptions.XmlWriteException;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -82,46 +83,61 @@ protected void writeXml(XmlWriter writer) throws IOException {
* and the application must take care of it.
*/

try (xmlInputStream; ) {
// fill the buffer once with the first 1024 chars and read as string
byte[] buffer = xmlInputStream.readNBytes(1024);

// We will take some extra precautions to make sure we are not
// splitting any multi-byte UTF-8 characters in the process!
String firstChars;
int length = -1;

if (buffer.length < 1024) {
// the entire metadata fragment is shorter than 1024 bytes;
// should be safe to just convert to String:
firstChars = new String(buffer, StandardCharsets.UTF_8);
} else {
length = 1021;
// try converting the entire buffer into a String, *except* for
// the last 3 bytes:
firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8);
while (firstChars.charAt(firstChars.length() - 1) == '\uFFFD' && length <= 1024) {
// keep reading additional bytes, one at a time, for as long as
// the last character in the resulting String is a '�', aka
// Unicode "no such character" character, '\uFFFD', or <EF><BF><BD> in UTF-8
length++;
firstChars = new String(buffer, 0, length, StandardCharsets.UTF_8);
try (xmlInputStream;
// create a buffered stream to allow for rewind
BufferedInputStream bufferedXmlInputStream =
new BufferedInputStream(xmlInputStream)) {

final int n = 1024;
// mark the start of the stream, leave 1 byte headroom for reading ahead
bufferedXmlInputStream.mark(n + 1);
// read the first bytes
byte[] bytes = bufferedXmlInputStream.readNBytes(n);

// broken UTF-8 multibyte char only possible when >= 1024 chars available
if (bytes.length == n) {
// analyze how far we can read without risk of broken char
final int maxN = maxBytesWithCompleteUTF8Chars(bytes);
// if we detected a (potentially broken) mbchar at the end, re-read.
if (maxN < n) {
// reset the stream to begin
bufferedXmlInputStream.reset();
// replace bytes by reading again, but only so far as maxN
bytes = bufferedXmlInputStream.readNBytes(maxN);
}
}

String firstChars = new String(bytes, StandardCharsets.UTF_8);
// match the start with the compiled regex and replace with nothing when matching.
firstChars = xmlDeclaration.reset(firstChars).replaceFirst("");

// write the chars to the output stream
writer.getOutputStream().write(firstChars.getBytes(StandardCharsets.UTF_8));

// if we have any leftover unused bytes in the buffer, write those too:
if (length > -1 && length < 1024) {
writer.getOutputStream().write(buffer, length, 1024 - length);
}

// now send the rest of the stream
xmlInputStream.transferTo(writer.getOutputStream());
bufferedXmlInputStream.transferTo(writer.getOutputStream());
}
}

public static int maxBytesWithCompleteUTF8Chars(final byte[] buffer) {
final int n = buffer.length;

if (Byte.toUnsignedInt(buffer[n - 1])
> 127) { // 0b0xxxxxxx are all 1-byte chars, so only seek for 0b1xxxxxxx
for (int i = n - 1;
i > n - 5;
i--) { // go max four bytes back (max UTF-8 mbchar length)
if (Byte.toUnsignedInt(buffer[i])
> 191) { // 0b110xxxxx / 0b1110xxxx / 0b11110xxx are the UTF-8 multibyte
// char start bytes
// -> this is a multibyte start char => use "i" as perfect new length
// (array index starts at 0, so is already -1 for read length)
return i;
}
// -> this is a byte somewhere inside the mbchar, go -1 byte and try again
}
}

return n;
}
}
52 changes: 52 additions & 0 deletions xoai-common/src/test/java/io/gdcc/xoai/xml/CopyElementTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,18 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Map;
import java.util.stream.Stream;
import javax.xml.stream.XMLStreamException;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.xmlunit.matchers.EvaluateXPathMatcher;

public class CopyElementTest {
Expand Down Expand Up @@ -256,4 +262,50 @@ void ensureIssue188Fixed() throws IOException, XMLStreamException {
EvaluateXPathMatcher.hasXPath("//dc:description/text()", equalTo(description))
.withNamespaceContext(Map.of("dc", "http://purl.org/dc/elements/1.1/")));
}

static Stream<Arguments> multicharString() {
return Stream.of(
Arguments.of(4, "test", 0), // 1-byte ending
// 2-byte ending (complete, but we can't know that!)
Arguments.of(4, "testß", 0),
// 2-byte ending on 1st byte
Arguments.of(4, "testß", 1),
// 3-byte ending on 2nd byte
Arguments.of(4, "test’", 1),
// 3-byte ending on 3rd byte (complete, but we can't know that!)
Arguments.of(4, "test♥", 0),
// 3-byte ending on 2nd byte
Arguments.of(4, "test♥", 1),
// 3-byte ending on 1st byte
Arguments.of(4, "test♥", 2),
// 4-byte ending on 3rd byte
Arguments.of(4, "test\uD83D\uDD2B", 1));
}

@MethodSource("multicharString")
@ParameterizedTest
void ensureBufferAnalysisCorrectness(int expectedLength, String text, int bytesToCutOf)
throws UnsupportedEncodingException {
// given
// System.out.println(convertToBinary(text));
byte[] bytes = text.getBytes(StandardCharsets.UTF_8);
// System.out.println(Arrays.toString(bytes));

// when
byte[] sut = Arrays.copyOf(bytes, bytes.length - bytesToCutOf);
int length = CopyElement.maxBytesWithCompleteUTF8Chars(sut);

// then
assertEquals(expectedLength, length);
}

static String convertToBinary(String input) {
byte[] bytes = input.getBytes(StandardCharsets.UTF_8);
StringBuilder sb = new StringBuilder();
for (byte b : bytes) {
sb.append(String.format("%8s", Integer.toBinaryString(b & 0xFF)).replace(" ", "0"));
sb.append(" ");
}
return sb.toString();
}
}

0 comments on commit 53f8222

Please sign in to comment.