Add function for detecting BOM

Even though BOM is rarely used these days, I find it useful to be able to detect it sometimes. I added a new function that detects all the BOMs listed on Wikipedia. Currently UTF-32LE doesn't work for some reason. I'll investigate why.
peelonet · Oct 16, 2024 · e40a2db · e40a2db
1 parent 4b6841b
commit e40a2db
Show file tree

Hide file tree

Showing 4 changed files with 345 additions and 1 deletion.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.6)
 
 PROJECT(
   PeeloUnicode
-  VERSION 1.0.0
+  VERSION 1.1.0
   DESCRIPTION "Header only C++ Unicode utilities."
   HOMEPAGE_URL "https://github.com/peelonet/peelo-unicode"
   LANGUAGES CXX

diff --git a/README.md b/README.md
@@ -112,3 +112,63 @@ main()
   }
 }
 ```
+
+## BOM detection
+
+The library provides a function for detecting whether a byte string starts with
+a [byte order mark] and, if so, which character encoding it indicates. Even
+though use of BOM is rare these days, it can sometimes be useful to be able to
+detect it.
+
+The detected character encodings are:
+
+- [UTF-8]
+- [UTF-16BE][UTF-16]
+- [UTF-16LE][UTF-16]
+- [UTF-32BE][UTF-32]
+- [UTF-32LE][UTF-32]
+- [UTF-7]
+- [UTF-1]
+- [UTF-EBCDIC]
+- [SCSU]
+- [BOCU-1]
+- [GB18030]
+
+[Byte order mark]: https://en.wikipedia.org/wiki/Byte_order_mark
+[UTF-7]: https://en.wikipedia.org/wiki/UTF-7
+[UTF-1]: https://en.wikipedia.org/wiki/UTF-1
+[UTF-EBCDIC]: https://en.wikipedia.org/wiki/UTF-EBCDIC
+[SCSU]: https://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode
+[BOCU-1]: https://en.wikipedia.org/wiki/Binary_Ordered_Compression_for_Unicode
+[GB18030]: https://en.wikipedia.org/wiki/GB_18030
+
+### Example
+
+```cpp
+#include <fstream>
+#include <iostream>
+#include <peelo/unicode/bom.hpp>
+
+int
+main()
+{
+  char buffer[1024];
+  // Open read-only and in binary mode. A std::fstream opens for both reading
+  // and writing by default, which fails for files that cannot be written to,
+  // and the byte order mark has to be read as raw, untranslated bytes.
+  std::ifstream f("file.txt", std::ios::binary);
+
+  f.read(buffer, sizeof(buffer));
+  // gcount() is the number of bytes actually read, which may be less than
+  // the buffer size (and is 0 when the file could not be opened).
+  const auto length = static_cast<std::size_t>(f.gcount());
+  f.close();
+
+  if (const auto bom = peelo::unicode::detect_bom(buffer, length))
+  {
+    if (*bom == peelo::unicode::bom::utf16_be)
+    {
+      std::cout << "File has UTF-16BE BOM." << std::endl;
+    } else {
+      std::cout << "File has some other BOM." << std::endl;
+    }
+  } else {
+    std::cout << "File does not contain BOM." << std::endl;
+  }
+}
+```
diff --git a/include/peelo/unicode/bom.hpp b/include/peelo/unicode/bom.hpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018-2024, peelo.net
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <optional>
+#include <string>
+
+namespace peelo::unicode
+{
+  /**
+   * Encodings whose byte order mark can be recognized by detect_bom().
+   */
+  enum class bom
+  {
+    utf8,
+    utf16_be,
+    utf16_le,
+    utf32_be,
+    utf32_le,
+    utf7,
+    utf1,
+    utf_ebcdic,
+    scsu,
+    bocu_1,
+    gb18030,
+  };
+
+  /**
+   * Tests whether the given byte string begins with a byte order mark.
+   *
+   * Only the first bytes of the input are inspected; a mark located anywhere
+   * else in the input is not reported.
+   *
+   * @param input Byte string to test. Must be readable for `length` bytes.
+   * @param length Number of bytes available in the given byte string.
+   * @return Type of the byte order mark found at the start of the input, or
+   *         std::nullopt if the input does not start with a known mark.
+   */
+  inline std::optional<bom>
+  detect_bom(const char* input, std::size_t length)
+  {
+    struct bom_info
+    {
+      const char* bytes;
+      std::size_t length;
+      bom type;
+    };
+    // The element count is deduced from the initializers, so adding or
+    // removing a mark cannot leave the table out of sync with a separately
+    // maintained size constant.
+    //
+    // Order matters because the first match wins: a mark that starts with
+    // another, shorter mark has to be listed before it. UTF-32LE
+    // (FF FE 00 00) therefore precedes UTF-16LE (FF FE).
+    static constexpr std::array bom_table{
+      bom_info{"\xef\xbb\xbf", 3, bom::utf8},
+      bom_info{"\0\0\xfe\xff", 4, bom::utf32_be},
+      bom_info{"\xff\xfe\0\0", 4, bom::utf32_le},
+      bom_info{"\xfe\xff", 2, bom::utf16_be},
+      bom_info{"\xff\xfe", 2, bom::utf16_le},
+      bom_info{"\x2b\x2f\x76", 3, bom::utf7},
+      bom_info{"\xf7\x64\x4c", 3, bom::utf1},
+      bom_info{"\xdd\x73\x66\x73", 4, bom::utf_ebcdic},
+      bom_info{"\x0e\xfe\xff", 3, bom::scsu},
+      bom_info{"\xfb\xee\x28", 3, bom::bocu_1},
+      bom_info{"\x84\x31\x95\x33", 4, bom::gb18030}
+    };
+
+    for (const auto& candidate : bom_table)
+    {
+      // Inputs shorter than the mark cannot contain it; checking the length
+      // first also keeps memcmp from reading past the end of the input.
+      if (
+        length >= candidate.length
+        && std::memcmp(input, candidate.bytes, candidate.length) == 0
+      )
+      {
+        return candidate.type;
+      }
+    }
+
+    return std::nullopt;
+  }
+
+  /**
+   * Tests whether the given string begins with a byte order mark.
+   *
+   * All input.size() bytes are considered, embedded NUL bytes included. Note
+   * that a std::string constructed from a plain string literal ends at the
+   * first NUL byte, so the UTF-32 marks must be constructed with an explicit
+   * length; a UTF-32LE mark truncated that way is seen as FF FE and reported
+   * as UTF-16LE.
+   *
+   * @param input String to test.
+   * @return Type of the byte order mark found at the start of the string, or
+   *         std::nullopt if the string does not start with a known mark.
+   */
+  inline std::optional<bom>
+  detect_bom(const std::string& input)
+  {
+    return detect_bom(input.data(), input.size());
+  }
+}
diff --git a/test/test_bom.cpp b/test/test_bom.cpp
@@ -0,0 +1,124 @@
+#include <cassert>
+#include <iostream> // TODO: remove me
+
+#include <peelo/unicode/bom.hpp>
+
+using peelo::unicode::detect_bom;
+using peelo::unicode::bom;
+
+/**
+ * Asserts that detect_bom() recognizes the given bytes as the expected mark.
+ */
+static void
+test_recognized_bom(
+  bom expected_type,
+  const char* input,
+  std::size_t length
+)
+{
+  // The result is tested with a plain `if` instead of being used only inside
+  // assert(): the GitHub CI build reported unused variable warnings for other
+  // ways of writing this.
+  const auto detected = detect_bom(input, length);
+
+  if (!detected)
+  {
+    assert(false);
+    return;
+  }
+  assert(*detected == expected_type);
+}
+
+/** The UTF-8 mark (EF BB BF) is reported as bom::utf8. */
+static void
+test_utf8()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\xef\xbb\xbf";
+
+  test_recognized_bom(bom::utf8, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-16BE mark (FE FF) is reported as bom::utf16_be. */
+static void
+test_utf16_be()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\xfe\xff";
+
+  test_recognized_bom(bom::utf16_be, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-16LE mark (FF FE) is reported as bom::utf16_le. */
+static void
+test_utf16_le()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\xff\xfe";
+
+  test_recognized_bom(bom::utf16_le, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-32BE mark (00 00 FE FF) is reported as bom::utf32_be. */
+static void
+test_utf32_be()
+{
+  // The mark starts with NUL bytes, so its length is taken from the array
+  // size (minus the literal's terminating NUL) rather than from strlen().
+  static const char mark[] = "\0\0\xfe\xff";
+
+  test_recognized_bom(bom::utf32_be, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-32LE mark (FF FE 00 00) is reported as bom::utf32_le. */
+static void
+test_utf32_le()
+{
+  // The mark ends with NUL bytes, so its length is taken from the array
+  // size (minus the literal's terminating NUL) rather than from strlen().
+  static const char mark[] = "\xff\xfe\0\0";
+
+  test_recognized_bom(bom::utf32_le, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-7 mark (2B 2F 76) is reported as bom::utf7. */
+static void
+test_utf7()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\x2b\x2f\x76";
+
+  test_recognized_bom(bom::utf7, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-1 mark (F7 64 4C) is reported as bom::utf1. */
+static void
+test_utf1()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\xf7\x64\x4c";
+
+  test_recognized_bom(bom::utf1, mark, sizeof(mark) - 1);
+}
+
+/** The UTF-EBCDIC mark (DD 73 66 73) is reported as bom::utf_ebcdic. */
+static void
+test_utf_ebcdic()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\xdd\x73\x66\x73";
+
+  test_recognized_bom(bom::utf_ebcdic, mark, sizeof(mark) - 1);
+}
+
+/** The SCSU mark (0E FE FF) is reported as bom::scsu. */
+static void
+test_scsu()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\x0e\xfe\xff";
+
+  test_recognized_bom(bom::scsu, mark, sizeof(mark) - 1);
+}
+
+/** The BOCU-1 mark (FB EE 28) is reported as bom::bocu_1. */
+static void
+test_bocu_1()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\xfb\xee\x28";
+
+  test_recognized_bom(bom::bocu_1, mark, sizeof(mark) - 1);
+}
+
+/** The GB18030 mark (84 31 95 33) is reported as bom::gb18030. */
+static void
+test_gb18030()
+{
+  // sizeof() counts the literal's terminating NUL, hence the - 1.
+  static const char mark[] = "\x84\x31\x95\x33";
+
+  test_recognized_bom(bom::gb18030, mark, sizeof(mark) - 1);
+}
+
+/**
+ * Inputs that do not start with a byte order mark must yield an empty result.
+ */
+static void
+test_unrecognized_bom()
+{
+  // Empty input.
+  assert(!detect_bom("", 0).has_value());
+  // Ordinary text.
+  assert(!detect_bom("a", 1).has_value());
+  // UTF-8 mark preceded by another byte: only the start of input counts.
+  assert(!detect_bom("a\xef\xbb\xbf", 4).has_value());
+  // Damaged UTF-8 mark (first byte replaced with NUL) followed by a real one
+  // at offset 3.
+  assert(!detect_bom("\x00\xbb\xbf\xef\xbb\xbf", 6).has_value());
+}
+
+/**
+ * Exercises the std::string overload of detect_bom().
+ */
+static void
+test_with_string()
+{
+  // A string starting with the UTF-8 mark is recognized...
+  assert(detect_bom(std::string("\xef\xbb\xbf")).has_value());
+  // ...while ordinary text is not.
+  assert(!detect_bom(std::string("a")).has_value());
+}
+
+/**
+ * Runs every test case in a fixed order. A failing case stops the program
+ * through assert().
+ */
+int
+main()
+{
+  using test_case = void (*)();
+  static const test_case test_cases[] =
+  {
+    test_utf8,
+    test_utf16_be,
+    test_utf16_le,
+    test_utf32_be,
+    test_utf32_le,
+    test_utf7,
+    test_utf1,
+    test_utf_ebcdic,
+    test_scsu,
+    test_bocu_1,
+    test_gb18030,
+    test_unrecognized_bom,
+    test_with_string,
+  };
+
+  for (const auto run : test_cases)
+  {
+    run();
+  }
+}