Merge pull request #221 from jelovirt/feature/add-tests

Add BOM support for Reader input
jelovirt · May 9, 2024 · f3b40d1 · f3b40d1
2 parents 831acd4 + 86ac549
commit f3b40d1
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 30 deletions.
diff --git a/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java b/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java
@@ -27,6 +27,7 @@
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.CharBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -351,18 +352,19 @@ Map.Entry<URI, Locator> getSchema(char[] data, InputSource input) throws SAXPars
   @VisibleForTesting
   char[] getMarkdownContent(final InputSource input) throws IOException {
     final CharArrayWriter out = new CharArrayWriter();
+    final String encoding = input.getEncoding() != null ? input.getEncoding() : StandardCharsets.UTF_8.name();
+    final boolean isUtf8 = "UTF-8".equalsIgnoreCase(encoding);
     if (input.getByteStream() != null) {
-      final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
       try (
-        BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
+        BufferedInputStream is = isUtf8
           ? consumeBOM(input.getByteStream())
           : new BufferedInputStream(input.getByteStream());
         Reader in = new InputStreamReader(is, encoding)
       ) {
         copy(in, out);
       }
     } else if (input.getCharacterStream() != null) {
-      try (Reader in = input.getCharacterStream()) {
+      try (Reader in = isUtf8 ? consumeBOM(input.getCharacterStream()) : input.getCharacterStream()) {
         copy(in, out);
       }
     } else if (input.getSystemId() != null) {
@@ -372,11 +374,8 @@ char[] getMarkdownContent(final InputSource input) throws IOException {
       } catch (final URISyntaxException e) {
         throw new IllegalArgumentException(e);
       }
-      final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
       try (
-        BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
-          ? consumeBOM(inUrl.openStream())
-          : new BufferedInputStream(inUrl.openStream());
+        BufferedInputStream is = isUtf8 ? consumeBOM(inUrl.openStream()) : new BufferedInputStream(inUrl.openStream());
         Reader in = new InputStreamReader(is, encoding)
       ) {
         copy(in, out);
@@ -403,4 +402,23 @@ private BufferedInputStream consumeBOM(final InputStream in) throws IOException
     }
     return bin;
   }
+
+  /**
+   * Wraps {@code in} in a {@link BufferedReader} and skips a leading BOM ({@code U+FEFF}) if present; otherwise rewinds to the mark.
+   *
+   * @param in the original reader
+   * @return a buffered reader over {@code in}, positioned after the BOM when one was read
+   */
+  private BufferedReader consumeBOM(final Reader in) throws IOException {
+    final BufferedReader bin = new BufferedReader(in);
+    bin.mark(1); // a decoded BOM is a single char, so one char of read-ahead is enough
+    try {
+      if (bin.read() != '\uFEFF') {
+        bin.reset(); // first char is not a BOM (or end of stream, -1): rewind so the caller still sees it
+      }
+    } catch (final IOException e) {
+      bin.reset(); // NOTE(review): the read failure is dropped here, and reset() may itself throw and mask it
+    }
+    return bin;
+  }
 }
diff --git a/src/test/java/com/elovirta/dita/markdown/MDitaReaderCoreTest.java b/src/test/java/com/elovirta/dita/markdown/MDitaReaderCoreTest.java
@@ -3,9 +3,11 @@
 import static org.junit.jupiter.api.Assertions.*;
 
 import com.elovirta.dita.utils.AbstractReaderTest;
+import java.io.InputStream;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.opentest4j.AssertionFailedError;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 public class MDitaReaderCoreTest extends AbstractReaderTest {
@@ -81,11 +83,15 @@ public void test_unsupported(String file) {
   @ParameterizedTest
   @ValueSource(strings = { "header.md", "invalid_header.md", "invalid_header_third.md" })
   public void test_fail(String file) {
-    try {
-      run(file);
-      fail();
-    } catch (Exception e) {
-      assertEquals(SAXException.class, e.getCause().getClass());
-    }
+    assertThrows(
+      SAXException.class,
+      () -> {
+        final String input = "/" + getSrc() + file;
+        try (final InputStream in = getClass().getResourceAsStream(input)) {
+          final InputSource i = new InputSource(in);
+          reader.parse(i);
+        }
+      }
+    );
   }
 }
diff --git a/src/test/java/com/elovirta/dita/markdown/MDitaReaderExtendedTest.java b/src/test/java/com/elovirta/dita/markdown/MDitaReaderExtendedTest.java
@@ -4,10 +4,12 @@
 
 import com.elovirta.dita.utils.AbstractReaderTest;
 import java.io.IOException;
+import java.io.InputStream;
 import java.util.Arrays;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 public class MDitaReaderExtendedTest extends AbstractReaderTest {
@@ -80,12 +82,16 @@ public void test(String file) throws Exception {
   @ParameterizedTest
   @ValueSource(strings = { "header.md", "invalid_header.md", "invalid_header_third.md" })
   public void test_fail(String file) {
-    try {
-      run(file);
-      fail();
-    } catch (Exception e) {
-      assertEquals(SAXException.class, e.getCause().getClass());
-    }
+    assertThrows(
+      SAXException.class,
+      () -> {
+        final String input = "/" + getSrc() + file;
+        try (final InputStream in = getClass().getResourceAsStream(input)) {
+          final InputSource i = new InputSource(in);
+          reader.parse(i);
+        }
+      }
+    );
   }
 
   @Test

diff --git a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java
@@ -10,12 +10,16 @@
 import com.vladsch.flexmark.util.data.MutableDataSet;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.URI;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.xml.sax.*;
 import org.xml.sax.helpers.XMLFilterImpl;
@@ -183,21 +187,51 @@ public void test_schemaParseFailure_withoutErrorHandler() throws Exception {
   @ParameterizedTest
   @ValueSource(strings = { "invalid_header.md" })
   public void test_fail(String file) {
-    try {
-      run(file);
-      fail();
-    } catch (Exception e) {
-      assertEquals(SAXException.class, e.getCause().getClass());
-    }
+    assertThrows(
+      SAXException.class,
+      () -> {
+        final String input = "/" + getSrc() + file;
+        try (final InputStream in = getClass().getResourceAsStream(input)) {
+          final InputSource i = new InputSource(in);
+          reader.parse(i);
+        }
+      }
+    );
   }
 
-  @Test
-  public void getMarkdownContent_url() throws Exception {
-    final String input = getSrc() + "testBOM.md";
+  @ParameterizedTest
+  @CsvSource({ "markdown/testBOM.md, UTF-8", "markdown/testNoBOM.md, UTF-8", "markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_url(String input, String encoding) throws Exception {
     final URL in = getClass().getResource("/" + input);
     final InputSource i = new InputSource(in.toString());
-    final char[] content = new MarkdownReader().getMarkdownContent(i);
-    assertEquals('W', content[0]);
+    i.setEncoding(encoding);
+    final char[] act = new MarkdownReader().getMarkdownContent(i);
+    assertEquals('W', act[0]);
+  }
+
+  @ParameterizedTest
+  @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_byteStream(String input, String encoding) throws Exception { // input: classpath resource; encoding: value declared on the InputSource
+    try (InputStream in = getClass().getResourceAsStream(input)) {
+      final InputSource i = new InputSource(in); // byte-stream source: the reader under test has to decode it
+      i.setEncoding(encoding);
+      final char[] act = new MarkdownReader().getMarkdownContent(i);
+      assertEquals('W', act[0]); // first returned char must be the expected 'W', not a BOM
+    }
+  }
+
+  @ParameterizedTest
+  @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_characterStream(String input, String encoding) throws Exception { // input: classpath resource; encoding: value declared on the InputSource
+    try (
+      InputStream in = getClass().getResourceAsStream(input);
+      Reader r = new InputStreamReader(in, StandardCharsets.UTF_8) // always decoded as UTF-8 here, whatever `encoding` says
+    ) {
+      final InputSource i = new InputSource(r); // character-stream source: content is already decoded
+      i.setEncoding(encoding); // only changes the encoding declared on the InputSource, not how `r` was decoded
+      final char[] act = new MarkdownReader().getMarkdownContent(i);
+      assertEquals('W', act[0]); // first returned char must be the expected 'W', not a BOM
+    }
   }
 
   @Test