diff --git a/CMakeLists.txt b/CMakeLists.txt
index c09862b..4b2149a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.6)
 
 PROJECT(
   PeeloUnicode
-  VERSION 1.0.0
+  VERSION 1.1.0
   DESCRIPTION "Header only C++ Unicode utilities."
   HOMEPAGE_URL "https://github.com/peelonet/peelo-unicode"
   LANGUAGES CXX
diff --git a/README.md b/README.md
index 82b0a3f..2ae5d60 100644
--- a/README.md
+++ b/README.md
@@ -112,3 +112,63 @@ main()
   }
 }
 ```
+
+## BOM detection
+
+The library provides a function for detecting whether a byte string starts
+with a [byte order mark], and which character encoding that mark indicates.
+Although BOMs are rare these days, it can still be useful to detect them.
+
+The detected character encodings are:
+
+- [UTF-8]
+- [UTF-16BE][UTF-16]
+- [UTF-16LE][UTF-16]
+- [UTF-32BE][UTF-32]
+- [UTF-32LE][UTF-32]
+- [UTF-7]
+- [UTF-1]
+- [UTF-EBCDIC]
+- [SCSU]
+- [BOCU-1]
+- [GB18030]
+
+[Byte order mark]: https://en.wikipedia.org/wiki/Byte_order_mark
+[UTF-7]: https://en.wikipedia.org/wiki/UTF-7
+[UTF-1]: https://en.wikipedia.org/wiki/UTF-1
+[UTF-EBCDIC]: https://en.wikipedia.org/wiki/UTF-EBCDIC
+[SCSU]: https://en.wikipedia.org/wiki/Standard_Compression_Scheme_for_Unicode
+[BOCU-1]: https://en.wikipedia.org/wiki/Binary_Ordered_Compression_for_Unicode
+[GB18030]: https://en.wikipedia.org/wiki/GB_18030
+
+### Example
+
+```cpp
+#include <fstream>
+#include <iostream>
+#include <peelo/unicode/bom.hpp>
+
+int
+main()
+{
+  char buffer[1024];
+  std::ifstream f("file.txt", std::ios::binary);
+  std::size_t length;
+
+  f.read(buffer, sizeof(buffer));
+  length = static_cast<std::size_t>(f.gcount());
+  f.close();
+
+  if (const auto bom = peelo::unicode::detect_bom(buffer, length))
+  {
+    if (*bom == peelo::unicode::bom::utf16_be)
+    {
+      std::cout << "File has UTF-16BE BOM." << std::endl;
+    } else {
+      std::cout << "File has some other BOM." << std::endl;
+    }
+  } else {
+    std::cout << "File does not contain BOM." << std::endl;
+  }
+}
+```
diff --git a/include/peelo/unicode/bom.hpp b/include/peelo/unicode/bom.hpp
new file mode 100644
index 0000000..7eb4404
--- /dev/null
+++ b/include/peelo/unicode/bom.hpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2018-2024, peelo.net
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <optional>
+#include <string>
+
+namespace peelo::unicode
+{
+  /**
+   * Enumeration of different recognized BOM types.
+   */
+  enum class bom
+  {
+    utf8,
+    utf16_be,
+    utf16_le,
+    utf32_be,
+    utf32_le,
+    utf7,
+    utf1,
+    utf_ebcdic,
+    scsu,
+    bocu_1,
+    gb18030,
+  };
+
+  /**
+   * Tests whether given byte string begins with a byte order mark.
+   *
+   * Only the first bytes of the input are inspected. When one mark is a prefix
+   * of another (UTF-16LE and UTF-32LE), the longer mark takes precedence.
+   *
+   * @param input Byte string to test.
+   * @param length Length of the given byte string.
+   * @return Byte order mark detected in the given byte string, or null option
+   *         if the given byte string does not begin with a byte order mark.
+   */
+  inline std::optional<bom>
+  detect_bom(const char* input, std::size_t length)
+  {
+    struct bom_info
+    {
+      const char* bytes;
+      std::size_t length;
+      bom type;
+    };
+    static constexpr std::size_t bom_array_size = 11;
+    // Entries are tested in order and the first match wins, so UTF-32LE
+    // (FF FE 00 00) must be listed before UTF-16LE (FF FE), which is its
+    // prefix; otherwise UTF-32LE could never be detected.
+    static const std::array<bom_info, bom_array_size> bom_array =
+    {
+      {
+        {
+          "\xef\xbb\xbf",
+          3,
+          bom::utf8,
+        },
+        {
+          "\x00\x00\xfe\xff",
+          4,
+          bom::utf32_be,
+        },
+        {
+          "\xff\xfe\x00\x00",
+          4,
+          bom::utf32_le,
+        },
+        {
+          "\xfe\xff",
+          2,
+          bom::utf16_be,
+        },
+        {
+          "\xff\xfe",
+          2,
+          bom::utf16_le,
+        },
+        {
+          "\x2b\x2f\x76",
+          3,
+          bom::utf7,
+        },
+        {
+          "\xf7\x64\x4c",
+          3,
+          bom::utf1,
+        },
+        {
+          "\xdd\x73\x66\x73",
+          4,
+          bom::utf_ebcdic,
+        },
+        {
+          "\x0e\xfe\xff",
+          3,
+          bom::scsu,
+        },
+        {
+          "\xfb\xee\x28",
+          3,
+          bom::bocu_1,
+        },
+        {
+          "\x84\x31\x95\x33",
+          4,
+          bom::gb18030,
+        },
+      }
+    };
+
+    for (const auto& info : bom_array)
+    {
+      if (length >= info.length
+          && !std::memcmp(input, info.bytes, info.length))
+      {
+        return info.type;
+      }
+    }
+
+    return std::nullopt;
+  }
+
+  /**
+   * Tests whether given string begins with a byte order mark.
+   *
+   * @param input String to test.
+   * @return Byte order mark detected in the given string, or null option if
+   *         the given string does not begin with a byte order mark.
+   */
+  inline std::optional<bom>
+  detect_bom(const std::string& input)
+  {
+    return detect_bom(input.c_str(), input.length());
+  }
+}
diff --git a/test/test_bom.cpp b/test/test_bom.cpp
new file mode 100644
index 0000000..529899d
--- /dev/null
+++ b/test/test_bom.cpp
@@ -0,0 +1,123 @@
+#include <cassert>
+
+#include <peelo/unicode/bom.hpp>
+
+using peelo::unicode::detect_bom;
+using peelo::unicode::bom;
+
+static void
+test_recognized_bom(
+  bom expected_type,
+  const char* input,
+  std::size_t length
+)
+{
+  const auto result = detect_bom(input, length);
+
+  assert(!!result);
+  assert(*result == expected_type);
+}
+
+static void
+test_utf8()
+{
+  test_recognized_bom(bom::utf8, "\xef\xbb\xbf", 3);
+}
+
+static void
+test_utf16_be()
+{
+  test_recognized_bom(bom::utf16_be, "\xfe\xff", 2);
+}
+
+static void
+test_utf16_le()
+{
+  test_recognized_bom(bom::utf16_le, "\xff\xfe", 2);
+  // FF FE followed by anything other than 00 00 is still UTF-16LE.
+  test_recognized_bom(bom::utf16_le, "\xff\xfe\x61\x00", 4);
+}
+
+static void
+test_utf32_be()
+{
+  test_recognized_bom(bom::utf32_be, "\x00\x00\xfe\xff", 4);
+}
+
+static void
+test_utf32_le()
+{
+  const char input[] = { '\xff', '\xfe', '\x00', '\x00' };
+
+  test_recognized_bom(bom::utf32_le, input, 4);
+}
+
+static void
+test_utf7()
+{
+  test_recognized_bom(bom::utf7, "\x2b\x2f\x76", 3);
+}
+
+static void
+test_utf1()
+{
+  test_recognized_bom(bom::utf1, "\xf7\x64\x4c", 3);
+}
+
+static void
+test_utf_ebcdic()
+{
+  test_recognized_bom(bom::utf_ebcdic, "\xdd\x73\x66\x73", 4);
+}
+
+static void
+test_scsu()
+{
+  test_recognized_bom(bom::scsu, "\x0e\xfe\xff", 3);
+}
+
+static void
+test_bocu_1()
+{
+  test_recognized_bom(bom::bocu_1, "\xfb\xee\x28", 3);
+}
+
+static void
+test_gb18030()
+{
+  test_recognized_bom(bom::gb18030, "\x84\x31\x95\x33", 4);
+}
+
+static void
+test_unrecognized_bom()
+{
+  assert(!detect_bom("", 0));
+  assert(!detect_bom("a", 1));
+  assert(!detect_bom("a\xef\xbb\xbf", 4));
+  assert(!detect_bom("\x00\xbb\xbf\xef\xbb\xbf", 6));
+}
+
+static void
+test_with_string()
+{
+  assert(!!detect_bom(std::string("\xef\xbb\xbf")));
+  assert(!detect_bom(std::string("a")));
+}
+
+int
+main()
+{
+  test_utf8();
+  test_utf16_be();
+  test_utf16_le();
+  test_utf32_be();
+  test_utf32_le();
+  test_utf7();
+  test_utf1();
+  test_utf_ebcdic();
+  test_scsu();
+  test_bocu_1();
+  test_gb18030();
+  test_unrecognized_bom();
+  test_with_string();
+}