From 9c23b78f956c23650f17cabb50fef795868316cb Mon Sep 17 00:00:00 2001 From: Jarno Elovirta Date: Thu, 9 May 2024 09:41:14 +0300 Subject: [PATCH 1/2] Add BOM support to reader and improve tests Signed-off-by: Jarno Elovirta --- .../dita/markdown/MarkdownReader.java | 32 +++++++++++---- .../dita/markdown/MarkdownReaderTest.java | 40 ++++++++++++++++--- 2 files changed, 60 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java b/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java index d207a32..867e344 100644 --- a/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java +++ b/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java @@ -27,6 +27,7 @@ import java.net.URISyntaxException; import java.net.URL; import java.nio.CharBuffer; +import java.nio.charset.StandardCharsets; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -351,10 +352,11 @@ Map.Entry getSchema(char[] data, InputSource input) throws SAXPars @VisibleForTesting char[] getMarkdownContent(final InputSource input) throws IOException { final CharArrayWriter out = new CharArrayWriter(); + final String encoding = input.getEncoding() != null ? input.getEncoding() : StandardCharsets.UTF_8.name(); + final boolean isUtf8 = "UTF-8".equalsIgnoreCase(encoding); if (input.getByteStream() != null) { - final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8"; try ( - BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding) + BufferedInputStream is = isUtf8 ? consumeBOM(input.getByteStream()) : new BufferedInputStream(input.getByteStream()); Reader in = new InputStreamReader(is, encoding) @@ -362,7 +364,7 @@ char[] getMarkdownContent(final InputSource input) throws IOException { copy(in, out); } } else if (input.getCharacterStream() != null) { - try (Reader in = input.getCharacterStream()) { + try (Reader in = isUtf8 ? consumeBOM(input.getCharacterStream()) : input.getCharacterStream()) { copy(in, out); } } else if (input.getSystemId() != null) { @@ -372,11 +374,8 @@ char[] getMarkdownContent(final InputSource input) throws IOException { } catch (final URISyntaxException e) { throw new IllegalArgumentException(e); } - final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8"; try ( - BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding) - ? consumeBOM(inUrl.openStream()) - : new BufferedInputStream(inUrl.openStream()); + BufferedInputStream is = isUtf8 ? consumeBOM(inUrl.openStream()) : new BufferedInputStream(inUrl.openStream()); Reader in = new InputStreamReader(is, encoding) ) { copy(in, out); @@ -403,4 +402,23 @@ private BufferedInputStream consumeBOM(final InputStream in) throws IOException } return bin; } + + /** + * Returns a reader that skips the BOM if present. + * + * @param in the original reader + * @return a reader without a possible BOM + */ + private BufferedReader consumeBOM(final Reader in) throws IOException { + final BufferedReader bin = new BufferedReader(in); + bin.mark(1); + try { + if (bin.read() != '\uFEFF') { + bin.reset(); + } + } catch (final IOException e) { + bin.reset(); + } + return bin; + } } diff --git a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java index f454b1d..3b8e3c3 100644 --- a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java +++ b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java @@ -10,12 +10,16 @@ import com.vladsch.flexmark.util.data.MutableDataSet; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.net.URI; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Map; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; import org.junit.jupiter.params.provider.ValueSource; import org.xml.sax.*; import org.xml.sax.helpers.XMLFilterImpl; @@ -191,13 +195,39 @@ public void test_fail(String file) { } } - @Test - public void getMarkdownContent_url() throws Exception { - final String input = getSrc() + "testBOM.md"; + @ParameterizedTest + @CsvSource({ "markdown/testBOM.md, UTF-8", "markdown/testNoBOM.md, UTF-8", "markdown/testNoBOM.md, ISO-8859-1" }) + public void getMarkdownContent_url(String input, String encoding) throws Exception { final URL in = getClass().getResource("/" + input); final InputSource i = new InputSource(in.toString()); - final char[] content = new MarkdownReader().getMarkdownContent(i); - assertEquals('W', content[0]); + i.setEncoding(encoding); + final char[] act = new MarkdownReader().getMarkdownContent(i); + assertEquals('W', act[0]); + } + + @ParameterizedTest + @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" }) + public void getMarkdownContent_byteStream(String input, String encoding) throws Exception { + try (InputStream in = getClass().getResourceAsStream(input)) { + final InputSource i = new InputSource(in); + i.setEncoding(encoding); + final char[] act = new MarkdownReader().getMarkdownContent(i); + assertEquals('W', act[0]); + } + } + + @ParameterizedTest + @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" }) + public void getMarkdownContent_characterStream(String input, String encoding) throws Exception { + try ( + InputStream in = getClass().getResourceAsStream(input); + Reader r = new InputStreamReader(in, StandardCharsets.UTF_8) + ) { + final InputSource i = new InputSource(r); + i.setEncoding(encoding); + final char[] act = new MarkdownReader().getMarkdownContent(i); + assertEquals('W', act[0]); + } } @Test From 86ac5492068ec286d884d0a1b880e0b3880860a6 Mon Sep 17 00:00:00 2001 From: Jarno Elovirta Date: Thu, 9 May 2024 09:58:05 +0300 Subject: [PATCH 2/2] Refactor failure test Signed-off-by: Jarno Elovirta --- .../dita/markdown/MDitaReaderCoreTest.java | 18 ++++++++++++------ .../dita/markdown/MDitaReaderExtendedTest.java | 18 ++++++++++++------ .../dita/markdown/MarkdownReaderTest.java | 16 ++++++++++------ 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/src/test/java/com/elovirta/dita/markdown/MDitaReaderCoreTest.java b/src/test/java/com/elovirta/dita/markdown/MDitaReaderCoreTest.java index f713af5..10cabd1 100644 --- a/src/test/java/com/elovirta/dita/markdown/MDitaReaderCoreTest.java +++ b/src/test/java/com/elovirta/dita/markdown/MDitaReaderCoreTest.java @@ -3,9 +3,11 @@ import static org.junit.jupiter.api.Assertions.*; import com.elovirta.dita.utils.AbstractReaderTest; +import java.io.InputStream; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; import org.opentest4j.AssertionFailedError; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; public class MDitaReaderCoreTest extends AbstractReaderTest { @@ -81,11 +83,15 @@ public void test_unsupported(String file) { @ParameterizedTest @ValueSource(strings = { "header.md", "invalid_header.md", "invalid_header_third.md" }) public void test_fail(String file) { - try { - run(file); - fail(); - } catch (Exception e) { - assertEquals(SAXException.class, e.getCause().getClass()); - } + assertThrows( + SAXException.class, + () -> { + final String input = "/" + getSrc() + file; + try (final InputStream in = getClass().getResourceAsStream(input)) { + final InputSource i = new InputSource(in); + reader.parse(i); + } + } + ); } } diff --git a/src/test/java/com/elovirta/dita/markdown/MDitaReaderExtendedTest.java b/src/test/java/com/elovirta/dita/markdown/MDitaReaderExtendedTest.java index 7f8124e..be6ff7a 100644 --- a/src/test/java/com/elovirta/dita/markdown/MDitaReaderExtendedTest.java +++ b/src/test/java/com/elovirta/dita/markdown/MDitaReaderExtendedTest.java @@ -4,10 +4,12 @@ import com.elovirta.dita.utils.AbstractReaderTest; import java.io.IOException; +import java.io.InputStream; import java.util.Arrays; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.ValueSource; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; public class MDitaReaderExtendedTest extends AbstractReaderTest { @@ -80,12 +82,16 @@ public void test(String file) throws Exception { @ParameterizedTest @ValueSource(strings = { "header.md", "invalid_header.md", "invalid_header_third.md" }) public void test_fail(String file) { - try { - run(file); - fail(); - } catch (Exception e) { - assertEquals(SAXException.class, e.getCause().getClass()); - } + assertThrows( + SAXException.class, + () -> { + final String input = "/" + getSrc() + file; + try (final InputStream in = getClass().getResourceAsStream(input)) { + final InputSource i = new InputSource(in); + reader.parse(i); + } + } + ); } @Test diff --git a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java index 3b8e3c3..9a5d6e3 100644 --- a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java +++ b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java @@ -187,12 +187,16 @@ public void test_schemaParseFailure_withoutErrorHandler() throws Exception { @ParameterizedTest @ValueSource(strings = { "invalid_header.md" }) public void test_fail(String file) { - try { - run(file); - fail(); - } catch (Exception e) { - assertEquals(SAXException.class, e.getCause().getClass()); - } + assertThrows( + SAXException.class, + () -> { + final String input = "/" + getSrc() + file; + try (final InputStream in = getClass().getResourceAsStream(input)) { + final InputSource i = new InputSource(in); + reader.parse(i); + } + } + ); } @ParameterizedTest