From 925e7147ece348a170f4fea3a7f94ee72b433030 Mon Sep 17 00:00:00 2001 From: Nemanja Trifunovic Date: Sat, 21 Oct 2023 18:16:40 -0400 Subject: [PATCH] Merge V4 branch into main (#110) * Redefined and renamed types for code units. * Remove -Wsign-conversion from test builds. * find_invalid and is_valid that work with C-style strings. * Lifted the C++11 requirement for some functions that take std::string as an argument. * Support for C++20 u8string Issue #89 * Update test docker image to 4.0.0 * Update Dockerfile to run tests with a recent gcc compiler. * Make some internal helper functions non-template * Add append16 function Support for appending codepoints to existing utf16 encoded strings. See #91 * next16 * Tests and documentation for next16 * Rewrite CMakeLists Drop the existing CMake structure and write the new one from scratch. The root CMakeLists.txt is used for installing the package without building and running tests. Testing is done via a separate CMakeLists.txt in the tests directory. * Remove "samples" directory. The content of that file is already in the documentation. 
* Update README.md Restructure the reference, add installation instructions, toc, other minor changes --- .circleci/config.yml | 10 +- CMakeLists.txt | 89 ++-- README.md | 1009 +++++++++++++++++++++++++++++------- samples/docsample.cpp | 64 --- source/utf8.h | 12 + source/utf8/checked.h | 104 ++-- source/utf8/core.h | 205 ++++++-- source/utf8/cpp11.h | 37 +- source/utf8/cpp17.h | 9 +- source/utf8/cpp20.h | 124 +++++ source/utf8/unchecked.h | 84 ++- tests/CMakeLists.txt | 35 +- tests/docker/Dockerfile | 2 +- tests/test_checked_api.h | 37 +- tests/test_cpp11.cpp | 8 + tests/test_cpp17.cpp | 4 +- tests/test_cpp20.cpp | 77 +++ tests/test_unchecked_api.h | 29 ++ utf8cppConfig.cmake.in | 8 +- 19 files changed, 1454 insertions(+), 493 deletions(-) delete mode 100644 samples/docsample.cpp create mode 100644 source/utf8/cpp20.h create mode 100644 tests/test_cpp20.cpp diff --git a/.circleci/config.yml b/.circleci/config.yml index 2588646..e5e6f17 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,11 +3,11 @@ version: 2 jobs: build: docker: - - image: nemtrif/utf8cpp:3.1.3 + - image: nemtrif/utf8cpp:4.0.0 steps: - checkout - run: git submodule update --init --recursive --remote - - run: mkdir build - - run: cd build && cmake .. - - run: cd build && cmake --build . - - run: cd build && ctest -VV + - run: mkdir -p tests/build + - run: cd tests/build && cmake .. + - run: cd tests/build && cmake --build . 
+ - run: cd tests/build && ctest -VV diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c13109..1bde05a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,67 +1,48 @@ -cmake_minimum_required (VERSION 3.0.2...3.27) -project (utf8cpp VERSION 3.2.5 LANGUAGES CXX) +cmake_minimum_required (VERSION 3.5...3.27) +project (utf8cpp + VERSION 4.0.0 + LANGUAGES CXX + DESCRIPTION "C++ portable library for working with utf-8 encoding") -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - set(IS_ROOT_PROJECT ON) -else() - set(IS_ROOT_PROJECT OFF) -endif() +add_library(${PROJECT_NAME} INTERFACE) -option(UTF8_TESTS "Enable tests for UTF8-CPP" ${IS_ROOT_PROJECT}) -option(UTF8_INSTALL "Enable installation for UTF8-CPP" ${IS_ROOT_PROJECT}) -option(UTF8_SAMPLES "Enable building samples for UTF8-CPP" ${IS_ROOT_PROJECT}) +include(GNUInstallDirs) -add_library(utf8cpp INTERFACE) target_include_directories(utf8cpp INTERFACE "$" $ ) -add_library(utf8::cpp ALIAS utf8cpp) -if(UTF8_INSTALL) - include(CMakePackageConfigHelpers) - if(MSVC) - set(DEF_INSTALL_CMAKE_DIR CMake) - else() - include(GNUInstallDirs) # define CMAKE_INSTALL_* - set(DEF_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/utf8cpp) - endif() +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + VERSION ${PROJECT_VERSION} + COMPATIBILITY SameMajorVersion +) - if(${CMAKE_VERSION} VERSION_GREATER "3.14") - set(OPTIONAL_ARCH_INDEPENDENT "ARCH_INDEPENDENT") - endif() - - write_basic_package_version_file( - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfigVersion.cmake - VERSION ${PROJECT_VERSION} - COMPATIBILITY SameMajorVersion - ${OPTIONAL_ARCH_INDEPENDENT} - ) +install(TARGETS ${PROJECT_NAME} + EXPORT ${PROJECT_NAME}Targets + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + PUBLIC_HEADER DESTINATION include COMPONENT Development + BUNDLE DESTINATION bin COMPONENT 
Runtime +) - configure_package_config_file( - ${PROJECT_SOURCE_DIR}/utf8cppConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfig.cmake - INSTALL_DESTINATION ${DEF_INSTALL_CMAKE_DIR} - ) +configure_package_config_file( + "${PROJECT_SOURCE_DIR}/${PROJECT_NAME}Config.cmake.in" + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake +) - install(DIRECTORY source/ DESTINATION include/utf8cpp) - install(TARGETS utf8cpp EXPORT utf8cppTargets) - install(EXPORT utf8cppTargets DESTINATION ${DEF_INSTALL_CMAKE_DIR}) - install( - FILES - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/utf8cppConfigVersion.cmake - DESTINATION - ${DEF_INSTALL_CMAKE_DIR} - ) -endif() +install(EXPORT ${PROJECT_NAME}Targets + FILE ${PROJECT_NAME}Targets.cmake + NAMESPACE ${PROJECT_NAME}:: + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake) -if(UTF8_SAMPLES) - add_executable(docsample ${PROJECT_SOURCE_DIR}/samples/docsample.cpp) - target_link_libraries(docsample PRIVATE utf8::cpp) -endif() +install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/${PROJECT_NAME}/cmake) -if(UTF8_TESTS) - enable_testing() - add_subdirectory(tests) -endif() +install(FILES ${PROJECT_SOURCE_DIR}/source/utf8.h DESTINATION include) +install(DIRECTORY ${PROJECT_SOURCE_DIR}/source/utf8 DESTINATION include) diff --git a/README.md b/README.md index 4f96f65..bf0c3bc 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,124 @@ + + # UTF8-CPP: UTF-8 with C++ in a Portable Way + ## Introduction -C++ developers miss an easy and portable way of handling Unicode encoded strings. The original C++ Standard (known as C++98 or C++03) is Unicode agnostic. 
C++11 provides some support for Unicode on core language and library level: u8, u, and U character and string literals, char16_t and char32_t character types, u16string and u32string library classes, and codecvt support for conversions between Unicode encoding forms. In the meantime, developers use third party libraries like ICU, OS specific capabilities, or simply roll out their own solutions. - -In order to easily handle UTF-8 encoded Unicode strings, I came up with a small, C++98 compatible generic library. For anybody used to work with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the [license](./LICENSE). The library has been used a lot in the past ten years both in commercial and open-source projects and is considered feature-complete now. If you run into bugs or performance issues, please let me know and I'll do my best to address them. - -The purpose of this article is not to offer an introduction to Unicode in general, and UTF-8 in particular. If you are not familiar with Unicode, be sure to check out [Unicode Home Page](http://www.unicode.org/) or some other source of information for Unicode. Also, it is not my aim to advocate the use of UTF-8 encoded strings in C++ programs; if you want to handle UTF-8 encoded strings from C++, I am sure you have good reasons for it. - +C++ developers still miss an easy and portable way of handling Unicode encoded strings. The original C++ standard (known as C++98 or C++03) is Unicode agnostic. Some progress has been made in the later editions of the standard, but it is still hard to work with Unicode using only the standard facilities. + +I came up with a small, C++98 compatible generic library in order to handle UTF-8 encoded strings. For anybody used to work with STL algorithms and iterators, it should be easy and natural to use. The code is freely available for any purpose - check out the [license](./LICENSE). 
The library has been used a lot since the first release in 2006 both in commercial and open-source projects and proved to be stable and useful. + +## Table of Contents + +- [UTF8-CPP: UTF-8 with C++ in a Portable Way](#utf8-cpp-utf-8-with-c-in-a-portable-way) + * [Introduction](#introduction) + * [Installation](#installation) + * [Examples of use](#examples-of-use) + + [Introductory Sample](#introductory-sample) + + [Checking if a file contains valid UTF-8 text](#checking-if-a-file-contains-valid-utf-8-text) + + [Ensure that a string contains valid UTF-8 text](#ensure-that-a-string-contains-valid-utf-8-text) + * [Points of interest](#points-of-interest) + - [Design goals and decisions](#design-goals-and-decisions) + - [Alternatives](#alternatives) + * [Reference](#reference) + + [Functions From utf8 Namespace](#functions-from-utf8-namespace) + - [utf8::append](#utf8append) + * [octet_iterator append(utfchar32_t cp, octet_iterator result)](#octet_iterator-appendutfchar32_t-cp-octet_iterator-result) + * [void append(utfchar32_t cp, std::string& s);](#void-appendutfchar32_t-cp-stdstring-s) + - [utf8::append16](#utf8append16) + * [word_iterator append16(utfchar32_t cp, word_iterator result)](#word_iterator-append16utfchar32_t-cp-word_iterator-result) + * [void append(utfchar32_t cp, std::u16string& s)](#void-appendutfchar32_t-cp-stdu16string-s) + - [utf8::next](#utf8next) + - [utf8::next16](#utf8next16) + - [utf8::peek_next](#utf8peek_next) + - [utf8::prior](#utf8prior) + - [utf8::advance](#utf8advance) + - [utf8::distance](#utf8distance) + - [utf8::utf16to8](#utf8utf16to8) + * [octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)](#octet_iterator-utf16to8-u16bit_iterator-start-u16bit_iterator-end-octet_iterator-result) + * [std::string utf16to8(const std::u16string& s)](#stdstring-utf16to8const-stdu16string-s) + * [std::string utf16to8(std::u16string_view s)](#stdstring-utf16to8stdu16string_view-s) + - 
[utf8::utf16tou8](#utf8utf16tou8) + * [std::u8string utf16tou8(const std::u16string& s)](#stdu8string-utf16tou8const-stdu16string-s) + * [std::u8string utf16tou8(const std::u16string_view& s)](#stdu8string-utf16tou8const-stdu16string_view-s) + - [utf8::utf8to16](#utf8utf8to16) + * [u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)](#u16bit_iterator-utf8to16-octet_iterator-start-octet_iterator-end-u16bit_iterator-result) + * [std::u16string utf8to16(const std::string& s)](#stdu16string-utf8to16const-stdstring-s) + * [std::u16string utf8to16(std::string_view s)](#stdu16string-utf8to16stdstring_view-s) + * [std::u16string utf8to16(std::u8string& s)](#stdu16string-utf8to16stdu8string-s) + * [std::u16string utf8to16(std::u8string_view& s)](#stdu16string-utf8to16stdu8string_view-s) + - [utf8::utf32to8](#utf8utf32to8) + * [octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)](#octet_iterator-utf32to8-u32bit_iterator-start-u32bit_iterator-end-octet_iterator-result) + * [std::string utf32to8(const std::u32string& s)](#stdstring-utf32to8const-stdu32string-s) + * [std::u8string utf32to8(const std::u32string& s)](#stdu8string-utf32to8const-stdu32string-s) + * [std::u8string utf32to8(const std::u32string_view& s)](#stdu8string-utf32to8const-stdu32string_view-s) + * [std::string utf32to8(const std::u32string& s)](#stdstring-utf32to8const-stdu32string-s-1) + * [std::string utf32to8(std::u32string_view s)](#stdstring-utf32to8stdu32string_view-s) + - [utf8::utf8to32](#utf8utf8to32) + * [u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)](#u32bit_iterator-utf8to32-octet_iterator-start-octet_iterator-end-u32bit_iterator-result) + * [std::u32string utf8to32(const std::u8string& s)](#stdu32string-utf8to32const-stdu8string-s) + * [std::u32string utf8to32(const std::u8string_view& s)](#stdu32string-utf8to32const-stdu8string_view-s) + * [std::u32string utf8to32(const 
std::string& s)](#stdu32string-utf8to32const-stdstring-s) + * [std::u32string utf8to32(std::string_view s)](#stdu32string-utf8to32stdstring_view-s) + - [utf8::find_invalid](#utf8find_invalid) + * [octet_iterator find_invalid(octet_iterator start, octet_iterator end)](#octet_iterator-find_invalidoctet_iterator-start-octet_iterator-end) + * [const char* find_invalid(const char* str)](#const-char-find_invalidconst-char-str) + * [std::size_t find_invalid(const std::string& s)](#stdsize_t-find_invalidconst-stdstring-s) + * [std::size_t find_invalid(std::string_view s)](#stdsize_t-find_invalidstdstring_view-s) + - [utf8::is_valid](#utf8is_valid) + * [bool is_valid(octet_iterator start, octet_iterator end)](#bool-is_validoctet_iterator-start-octet_iterator-end) + * [bool is_valid(const char* str)](#bool-is_validconst-char-str) + * [bool is_valid(const std::string& s)](#bool-is_validconst-stdstring-s) + * [bool is_valid(std::string_view s)](#bool-is_validstdstring_view-s) + - [utf8::replace_invalid](#utf8replace_invalid) + * [output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement)](#output_iterator-replace_invalidoctet_iterator-start-octet_iterator-end-output_iterator-out-utfchar32_t-replacement) + * [std::string replace_invalid(const std::string& s, utfchar32_t replacement)](#stdstring-replace_invalidconst-stdstring-s-utfchar32_t-replacement) + * [std::string replace_invalid(std::string_view s, char32_t replacement)](#stdstring-replace_invalidstdstring_view-s-char32_t-replacement) + - [utf8::starts_with_bom](#utf8starts_with_bom) + * [bool starts_with_bom (octet_iterator it, octet_iterator end)](#bool-starts_with_bom-octet_iterator-it-octet_iterator-end) + * [bool starts_with_bom(const std::string& s)](#bool-starts_with_bomconst-stdstring-s) + * [bool starts_with_bom(std::string_view s)](#bool-starts_with_bomstdstring_view-s) + + [Types From utf8 Namespace](#types-from-utf8-namespace) + - 
[utf8::exception](#utf8exception) + - [utf8::invalid_code_point](#utf8invalid_code_point) + - [utf8::invalid_utf8](#utf8invalid_utf8) + - [utf8::invalid_utf16](#utf8invalid_utf16) + - [utf8::not_enough_room](#utf8not_enough_room) + - [utf8::iterator](#utf8iterator) + * [Member functions](#member-functions) + + [Functions From utf8::unchecked Namespace](#functions-from-utf8unchecked-namespace) + - [utf8::unchecked::append](#utf8uncheckedappend) + - [utf8::unchecked::append16](#utf8uncheckedappend16) + - [utf8::unchecked::next](#utf8uncheckednext) + - [utf8::next16](#utf8next16-1) + - [utf8::unchecked::peek_next](#utf8uncheckedpeek_next) + - [utf8::unchecked::prior](#utf8uncheckedprior) + - [utf8::unchecked::advance](#utf8uncheckedadvance) + - [utf8::unchecked::distance](#utf8uncheckeddistance) + - [utf8::unchecked::utf16to8](#utf8uncheckedutf16to8) + - [utf8::unchecked::utf8to16](#utf8uncheckedutf8to16) + - [utf8::unchecked::utf32to8](#utf8uncheckedutf32to8) + - [utf8::unchecked::utf8to32](#utf8uncheckedutf8to32) + - [utf8::unchecked::replace_invalid](#utf8uncheckedreplace_invalid) + + [Types From utf8::unchecked Namespace](#types-from-utf8unchecked-namespace) + - [utf8::iterator](#utf8iterator-1) + * [Member functions](#member-functions-1) + + + + + +## Installation + +The recommended way to use the library is to download an official release and copy the content of the source directory into the location of your project's header files. +If you use CMake for your builds, I still recommend just copying the files into your project, but if you want you can use the CMakeLists.txt file included in the project.
+ + ## Examples of use + ### Introductory Sample To illustrate the use of the library, let's start with a small but complete program that opens a file containing UTF-8 encoded text, reads it line by line, checks each line for invalid UTF-8 byte sequences, and converts it to UTF-16 encoding and back to UTF-8: @@ -100,6 +208,7 @@ In case you do not trust the `__cplusplus` macro or, for instance, do not want t the C++ 11 helper functions even with a modern compiler, define `UTF_CPP_CPLUSPLUS` macro before including `utf8.h` and assign it a value for the standard you want to use - the values are the same as for the `__cplusplus` macro. This can be also useful with compilers that are conservative in setting the `__cplusplus` macro even if they have a good support for a recent standard edition - Microsoft's Visual C++ is one example. + ### Checking if a file contains valid UTF-8 text Here is a function that checks whether the content of a file is valid UTF-8 encoded text without reading the content into the memory: @@ -126,6 +235,7 @@ Note that other functions that take input iterator arguments can be used in a si utf8::utf8to16(it, eos, back_inserter(u16string)); ``` + ### Ensure that a string contains valid UTF-8 text If we have some text that "probably" contains UTF-8 encoded text and we want to replace any invalid UTF-8 sequence with a replacement character, something like the following function may be used: @@ -142,8 +252,10 @@ void fix_utf8_string(std::string& str) The function will replace any invalid UTF-8 sequence with a Unicode replacement character. There is an overloaded function that enables the caller to supply their own replacement character. + ## Points of interest + #### Design goals and decisions The library was designed to be: @@ -153,9 +265,10 @@ The library was designed to be: 3. Lightweight: follow the "pay only for what you use" guideline. 4. Unintrusive: avoid forcing any particular design or even programming style on the user. 
This is a library, not a framework. + #### Alternatives -Here is an article I was made aware of only recently: [The Wonderfully Terrible World of C and C++ Encoding APIs (with Some Rust)](https://thephd.dev/the-c-c++-rust-string-text-encoding-api-landscape), by JeanHeyd Meneide. In the article, this library is compared with: +For alternatives and comparisons, I recommend the following article: [The Wonderfully Terrible World of C and C++ Encoding APIs (with Some Rust)](https://thephd.dev/the-c-c++-rust-string-text-encoding-api-landscape), by JeanHeyd Meneide. In the article, this library is compared with: - [simdutf](https://github.com/simdutf/simdutf) - [iconv](https://www.gnu.org/software/libiconv/) @@ -167,18 +280,54 @@ Here is an article I was made aware of only recently: [The Wonderfully Terrible The article presents author's view of the quality of the API design, but also some speed benchmarks. + ## Reference + ### Functions From utf8 Namespace + #### utf8::append -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +##### octet_iterator append(utfchar32_t cp, octet_iterator result) + +Available in version 1.0 and later. + +Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. + +```cpp +template +octet_iterator append(utfchar32_t cp, octet_iterator result); +``` + +`octet_iterator`: an output iterator. +`cp`: a 32 bit integer representing a code point to append to the sequence. +`result`: an output iterator to the place in the sequence where to append the code point. +Return value: an iterator pointing to the place after the newly appended sequence. + +Example of use: + +```cpp +unsigned char u[5] = {0,0,0,0,0}; +unsigned char* end = append(0x0448, u); +assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +``` + +Note that `append` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. 
To make things more interesting, `append` can add anywhere between 1 and 4 octets to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + + +##### void append(utfchar32_t cp, std::string& s); + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0. Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. ```cpp -void append(char32_t cp, std::string& s); +void append(utfchar32_t cp, std::string& s); ``` `cp`: a code point to append to the string. @@ -194,19 +343,21 @@ assert (u[0] == char(0xd1) && u[1] == char(0x88) && u.length() == 2); In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + +#### utf8::append16 + +##### word_iterator append16(utfchar32_t cp, word_iterator result) -#### utf8::append +Available in version 4.0 and later. -Available in version 1.0 and later. - -Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequence to a UTF-8 string. +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. ```cpp -template -octet_iterator append(uint32_t cp, octet_iterator result); +template +word_iterator append16(utfchar32_t cp, word_iterator result); ``` -`octet_iterator`: an output iterator. +`word_iterator`: an output iterator. `cp`: a 32 bit integer representing a code point to append to the sequence. `result`: an output iterator to the place in the sequence where to append the code point. Return value: an iterator pointing to the place after the newly appended sequence. 
@@ -214,15 +365,42 @@ Return value: an iterator pointing to the place after the newly appended sequenc Example of use: ```cpp -unsigned char u[5] = {0,0,0,0,0}; -unsigned char* end = append(0x0448, u); -assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); +unsigned short u[2] = {0,0}; +unsigned short* end = append16(0x0448, u); +assert (u[0] == 0x0448 && u[1] == 0); ``` -Note that `append` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append` can add anywhere between 1 and 4 octets to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. +Note that `append16` does not allocate any memory - it is the burden of the caller to make sure there is enough memory allocated for the operation. To make things more interesting, `append16` can add either one or two words to the sequence. In practice, you would most often want to use `std::back_inserter` to ensure that the necessary memory is allocated. In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + +##### void append(utfchar32_t cp, std::u16string& s) + +Available in version 4.0 and later. Requires a C++11 compliant compiler. + +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. + +```cpp +void append(utfchar32_t cp, std::u16string& s); +``` + +`cp`: a code point to append to the string. +`s`: a utf-16 encoded string to append the code point to. + +Example of use: + +```cpp +std::u16string u; +append(0x0448, u); +assert (u[0] == 0x0448 && u.length() == 1); +``` + +In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + + + #### utf8::next Available in version 1.0 and later. 
@@ -231,7 +409,7 @@ Given the iterator to the beginning of the UTF-8 sequence, it returns the code p ```cpp template -uint32_t next(octet_iterator& it, octet_iterator end); +utfchar32_t next(octet_iterator& it, octet_iterator end); ``` `octet_iterator`: an input iterator. @@ -253,6 +431,39 @@ This function is typically used to iterate through a UTF-8 encoded string. In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + +#### utf8::next16 + +Available in version 4.0 and later. + +Given the iterator to the beginning of the UTF-16 sequence, it returns the code point and moves the iterator to the next position. + +```cpp +template +utfchar32_t next16(word_iterator& it, word_iterator end); +``` + +`word_iterator`: an input iterator. +`it`: a reference to an iterator pointing to the beginning of an UTF-16 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. +`end`: end of the UTF-16 sequence to be processed. If `it` gets equal to `end` during the extraction of a code point, an `utf8::not_enough_room` exception is thrown. +Return value: the 32 bit representation of the processed UTF-16 code point. + +Example of use: + +```cpp +const unsigned short u[3] = {0x65e5, 0xd800, 0xdf46}; +const unsigned short* w = u; +int cp = next16(w, w + 3); +assert (cp == 0x65e5); +assert (w == u + 1); +``` + +This function is typically used to iterate through a UTF-16 encoded string. + +In case of an invalid UTF-16 sequence, a `utf8::invalid_utf8` exception is thrown. + + + #### utf8::peek_next Available in version 2.1 and later. @@ -261,7 +472,7 @@ Given the iterator to the beginning of the UTF-8 sequence, it returns the code p ```cpp template -uint32_t peek_next(octet_iterator it, octet_iterator end); +utfchar32_t peek_next(octet_iterator it, octet_iterator end); ``` @@ -282,6 +493,7 @@ assert (w == twochars); In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown.
+ #### utf8::prior Available in version 1.02 and later. @@ -290,7 +502,7 @@ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it de ```cpp template -uint32_t prior(octet_iterator& it, octet_iterator start); +utfchar32_t prior(octet_iterator& it, octet_iterator start); ``` `octet_iterator`: a bidirectional iterator. @@ -316,6 +528,7 @@ In case `start` is reached before a UTF-8 lead octet is hit, or if an invalid UT In case `start` equals `it`, a `not_enough_room` exception is thrown. + #### utf8::advance Available in version 1.0 and later. @@ -345,6 +558,7 @@ assert (w == twochars); In case of an invalid code point, a `utf8::invalid_code_point` exception is thrown. + #### utf8::distance Available in version 1.0 and later. @@ -373,7 +587,41 @@ This function is used to find the length (in code points) of a UTF-8 encoded str In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `last` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. + #### utf8::utf16to8 + +##### octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + +Available in version 1.0 and later. + +Converts a UTF-16 encoded string to UTF-8. + +```cpp +template +octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +``` + +`u16bit_iterator`: an input iterator. +`octet_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-16 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-16 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. 
+ +Example of use: + +```cpp +unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; +vector utf8result; +utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); +assert (utf8result.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + + + +##### std::string utf16to8(const std::u16string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -396,7 +644,8 @@ Example of use: In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. -#### utf8::utf16to8 + +##### std::string utf16to8(std::u16string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -420,37 +669,94 @@ Example of use: In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + +#### utf8::utf16tou8 + +##### std::u8string utf16tou8(const std::u16string& s) -#### utf8::utf16to8 +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. -Available in version 1.0 and later. +Converts a UTF-16 encoded string to UTF-8. + +```cpp +std::u8string utf16tou8(const std::u16string& s); +``` + +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. + +Example of use: + +```cpp + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u8string u = utf16tou8(utf16string); + assert (u.size() == 10); +``` + +In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + + +##### std::u8string utf16tou8(const std::u16string_view& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. Converts a UTF-16 encoded string to UTF-8. ```cpp -template -octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result); +std::u8string utf16tou8(const std::u16string_view& s); ``` -`u16bit_iterator`: an input iterator. -`octet_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-16 encoded string to convert. 
-`end`: an iterator pointing to pass-the-end of the UTF-16 encoded string to convert. -`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-8 string. +`s`: a UTF-16 encoded string. +Return value: A UTF-8 encoded string. Example of use: ```cpp -unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; -vector utf8result; -utf16to8(utf16string, utf16string + 5, back_inserter(utf8result)); -assert (utf8result.size() == 10); + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview(utf16string); + u8string u = utf16tou8(utf16stringview); + assert (u.size() == 10); ``` In case of invalid UTF-16 sequence, a `utf8::invalid_utf16` exception is thrown. + #### utf8::utf8to16 + +##### u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + +Available in version 1.0 and later. + +Converts an UTF-8 encoded string to UTF-16 + +```cpp +template +u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +``` + +`octet_iterator`: an input iterator. +`u16bit_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. <br /> `end`: an iterator pointing to past-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-16 string.
+ +Example of use: + +```cpp +char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +vector utf16result; +utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); +assert (utf16result.size() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. + + + + +##### std::u16string utf8to16(const std::string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -475,7 +781,9 @@ assert (utf16result[3] == 0xdd1e); In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to16 + + +##### std::u16string utf8to16(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -501,37 +809,95 @@ assert (utf16result[3] == 0xdd1e); In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to16 + +##### std::u16string utf8to16(std::u8string& s) -Available in version 1.0 and later. +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. -Converts an UTF-8 encoded string to UTF-16 +Converts an UTF-8 encoded string to UTF-16. ```cpp -template -u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result); +std::u16string utf8to16(std::u8string& s); ``` -`octet_iterator`: an input iterator. -`u16bit_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. < br /> `end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. -`result`: an output iterator to the place in the UTF-16 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-16 string. +`s`: an UTF-8 encoded string to convert. 
+Return value: A UTF-16 encoded string Example of use: ```cpp -char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; -vector utf16result; -utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result)); -assert (utf16result.size() == 4); +std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +std::u16string utf16result = utf8to16(utf8_with_surrogates); +assert (utf16result.length() == 4); assert (utf16result[2] == 0xd834); assert (utf16result[3] == 0xdd1e); ``` -In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + + + +##### std::u16string utf8to16(std::u8string_view& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-8 encoded string to UTF-16. + +```cpp +std::u16string utf8to16(std::u8string_view& s); +``` + +`s`: a UTF-8 encoded string to convert. +Return value: A UTF-16 encoded string + +Example of use: + +```cpp +std::u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; +std::u8string_view utf8stringview {utf8_with_surrogates}; +std::u16string utf16result = utf8to16(utf8stringview); +assert (utf16result.length() == 4); +assert (utf16result[2] == 0xd834); +assert (utf16result[3] == 0xdd1e); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. + #### utf8::utf32to8 + +##### octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + +Available in version 1.0 and later. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +template +octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); +``` + +`octet_iterator`: an output iterator. +`u32bit_iterator`: an input iterator. 
+`start`: an iterator pointing to the beginning of the UTF-32 encoded string to convert. +`end`: an iterator pointing to pass-the-end of the UTF-32 encoded string to convert. +`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-8 string. + +Example of use: + +```cpp +int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; +vector utf8result; +utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + + +##### std::string utf32to8(const std::u32string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -554,61 +920,194 @@ assert (utf8result.size() == 9); In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. -#### utf8::utf32to8 + +##### std::u8string utf32to8(const std::u32string& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::u8string utf32to8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u8string utf8result = utf32to8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + + +##### std::u8string utf32to8(const std::u32string_view& s) + +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::u8string utf32to8(const std::u32string_view& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. 
+ +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u32string_view utf32stringview(utf32string); +u8string utf8result = utf32to8(utf32stringview); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + + +##### std::string utf32to8(const std::u32string& s) + +Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::string utf32to8(const std::u32string& s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +string utf8result = utf32to8(utf32string); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + +##### std::string utf32to8(std::u32string_view s) + +Available in version 3.2 and later. Requires a C++ 17 compliant compiler. + +Converts a UTF-32 encoded string to UTF-8. + +```cpp +std::string utf32to8(std::u32string_view s); +``` + +`s`: a UTF-32 encoded string. +Return value: a UTF-8 encoded string. + +Example of use: + +```cpp +u32string utf32string = {0x448, 0x65E5, 0x10346}; +u32string_view utf32stringview(utf32string); +string utf8result = utf32to8(utf32stringview); +assert (utf8result.size() == 9); +``` + +In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. + + + +#### utf8::utf8to32 + +##### u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + +Available in version 1.0 and later. + +Converts a UTF-8 encoded string to UTF-32. + +```cpp +template +u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); +``` + +`octet_iterator`: an input iterator. +`u32bit_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. 
+`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. +`result`: an output iterator to the place in the UTF-32 string where to append the result of conversion. +Return value: An iterator pointing to the place after the appended UTF-32 string. + +Example of use: + +```cpp +char* twochars = "\xe6\x97\xa5\xd1\x88"; +vector utf32result; +utf8to32(twochars, twochars + 5, back_inserter(utf32result)); +assert (utf32result.size() == 2); +``` + +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. + + + + +##### std::u32string utf8to32(const std::u8string& s) -Available in version 3.2 and later. Requires a C++ 17 compliant compiler. +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. -Converts a UTF-32 encoded string to UTF-8. +Converts a UTF-8 encoded string to UTF-32. ```cpp -std::string utf32to8(std::u32string_view s); +std::u32string utf8to32(const std::u8string& s); ``` -`s`: a UTF-32 encoded string. -Return value: a UTF-8 encoded string. +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. Example of use: ```cpp -u32string utf32string = {0x448, 0x65E5, 0x10346}; -u32string_view utf32stringview(utf32string); -string utf8result = utf32to8(utf32stringview); -assert (utf8result.size() == 9); +const std::u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; +u32string utf32result = utf8to32(twochars); +assert (utf32result.size() == 2); ``` -In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf32to8 + +##### std::u32string utf8to32(const std::u8string_view& s) -Available in version 1.0 and later. +Available in version 4.0 and later. Requires a C++ 20 compliant compiler. -Converts a UTF-32 encoded string to UTF-8. 
+Converts a UTF-8 encoded string to UTF-32. ```cpp -template -octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result); +std::u32string utf8to32(const std::u8string_view& s); ``` -`octet_iterator`: an output iterator. -`u32bit_iterator`: an input iterator. -`start`: an iterator pointing to the beginning of the UTF-32 encoded string to convert. -`end`: an iterator pointing to pass-the-end of the UTF-32 encoded string to convert. -`result`: an output iterator to the place in the UTF-8 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-8 string. +`s`: a UTF-8 encoded string. +Return value: a UTF-32 encoded string. Example of use: ```cpp -int utf32string[] = {0x448, 0x65E5, 0x10346, 0}; -vector utf8result; -utf32to8(utf32string, utf32string + 3, back_inserter(utf8result)); -assert (utf8result.size() == 9); +const u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; +const u8string_view stringview{twochars}; +u32string utf32result = utf8to32(stringview); +assert (utf32result.size() == 2); ``` -In case of invalid UTF-32 string, a `utf8::invalid_code_point` exception is thrown. +In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to32 + + +##### std::u32string utf8to32(const std::string& s) Available in version 3.0 and later. Requires a C++ 11 compliant compiler. @@ -631,7 +1130,8 @@ assert (utf32result.size() == 2); In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. -#### utf8::utf8to32 + +##### std::u32string utf8to32(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -654,39 +1154,64 @@ assert (utf32result.size() == 2); In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. 
- -#### utf8::utf8to32 + +#### utf8::find_invalid + +##### octet_iterator find_invalid(octet_iterator start, octet_iterator end) Available in version 1.0 and later. -Converts a UTF-8 encoded string to UTF-32. +Detects an invalid sequence within a UTF-8 string. ```cpp -template -u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result); +template +octet_iterator find_invalid(octet_iterator start, octet_iterator end); ``` `octet_iterator`: an input iterator. -`u32bit_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-8 encoded string to convert. -`end`: an iterator pointing to pass-the-end of the UTF-8 encoded string to convert. -`result`: an output iterator to the place in the UTF-32 string where to append the result of conversion. -Return value: An iterator pointing to the place after the appended UTF-32 string. +`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. +`end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. +Return value: an iterator pointing to the first invalid octet in the UTF-8 string. In case none were found, equals `end`. Example of use: ```cpp -char* twochars = "\xe6\x97\xa5\xd1\x88"; -vector utf32result; -utf8to32(twochars, twochars + 5, back_inserter(utf32result)); -assert (utf32result.size() == 2); +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +char* invalid = find_invalid(utf_invalid, utf_invalid + 6); +assert (invalid == utf_invalid + 5); ``` -In case of an invalid UTF-8 sequence, a `utf8::invalid_utf8` exception is thrown. If `end` does not point to the past-of-end of a UTF-8 sequence, a `utf8::not_enough_room` exception is thrown. +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. -#### utf8::find_invalid -Available in version 3.0 and later. 
Requires a C++ 11 compliant compiler. + +##### const char* find_invalid(const char* str) + +Available in version 4.0 and later. + +Detects an invalid sequence within a C-style UTF-8 string. + +```cpp +const char* find_invalid(const char* str); +``` + +`str`: a UTF-8 encoded string. +Return value: a pointer to the first invalid octet in the UTF-8 string. In case none were found, points to the trailing zero byte. + +Example of use: + +```cpp +const char* utf_invalid = "\xe6\x97\xa5\xd1\x88\xfa"; +const char* invalid = find_invalid(utf_invalid); +assert ((invalid - utf_invalid) == 5); +``` + +This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. + + +##### std::size_t find_invalid(const std::string& s) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Detects an invalid sequence within a UTF-8 string. @@ -707,7 +1232,8 @@ assert (invalid == 5); This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. -#### utf8::find_invalid + +##### std::size_t find_invalid(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -730,36 +1256,65 @@ assert (invalid == 5); This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. - -#### utf8::find_invalid + +#### utf8::is_valid + +##### bool is_valid(octet_iterator start, octet_iterator end) Available in version 1.0 and later. -Detects an invalid sequence within a UTF-8 string. +Checks whether a sequence of octets is a valid UTF-8 string. 
```cpp template -octet_iterator find_invalid(octet_iterator start, octet_iterator end); +bool is_valid(octet_iterator start, octet_iterator end); ``` `octet_iterator`: an input iterator. `start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. `end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. -Return value: an iterator pointing to the first invalid octet in the UTF-8 string. In case none were found, equals `end`. +Return value: `true` if the sequence is a valid UTF-8 string; `false` if not. Example of use: ```cpp char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; -char* invalid = find_invalid(utf_invalid, utf_invalid + 6); -assert (invalid == utf_invalid + 5); +bool bvalid = is_valid(utf_invalid, utf_invalid + 6); +assert (bvalid == false); ``` -This function is typically used to make sure a UTF-8 string is valid before processing it with other functions. It is especially important to call it if before doing any of the _unchecked_ operations on it. +`is_valid` is a shorthand for `find_invalid(start, end) == end;`. You may want to use it to make sure that a byte sequence is a valid UTF-8 string without the need to know where it fails if it is not valid. -#### utf8::is_valid -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +##### bool is_valid(const char* str) + +Available in version 4.0 and later. + +Checks whether a C-style string contains valid UTF-8 encoded text. + +```cpp +bool is_valid(const char* str); +``` + +`str`: a UTF-8 encoded string. +Return value: `true` if the string contains valid UTF-8 encoded text; `false` if not. + +Example of use: + +```cpp +char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; +bool bvalid = is_valid(utf_invalid); +assert (bvalid == false); +``` + +You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. 
+ + + +##### bool is_valid(const std::string& s) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Checks whether a string object contains valid UTF-8 encoded text. @@ -780,7 +1335,8 @@ assert (bvalid == false); You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. -#### utf8::is_valid + +##### bool is_valid(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -803,41 +1359,54 @@ assert (bvalid == false); You may want to use `is_valid` to make sure that a string contains valid UTF-8 text without the need to know where it fails if it is not valid. + +#### utf8::replace_invalid + +##### output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) -#### utf8::is_valid - -Available in version 1.0 and later. +Available in version 2.0 and later. -Checks whether a sequence of octets is a valid UTF-8 string. +Replaces all invalid UTF-8 sequences within a string with a replacement marker. ```cpp -template -bool is_valid(octet_iterator start, octet_iterator end); +template +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement); +template +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); ``` `octet_iterator`: an input iterator. -`start`: an iterator pointing to the beginning of the UTF-8 string to test for validity. -`end`: an iterator pointing to pass-the-end of the UTF-8 string to test for validity. -Return value: `true` if the sequence is a valid UTF-8 string; `false` if not. +`output_iterator`: an output iterator. +`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. 
+`end`: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences. +`out`: An output iterator to the range where the result of replacement is stored. +`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` +Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. Example of use: ```cpp -char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; -bool bvalid = is_valid(utf_invalid, utf_invalid + 6); -assert (bvalid == false); +char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; +vector replace_invalid_result; +replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); +bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); +assert (bvalid); +char* fixed_invalid_sequence = "a????z"; +assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); ``` -`is_valid` is a shorthand for `find_invalid(start, end) == end;`. You may want to use it to make sure that a byte sequence is a valid UTF-8 string without the need to know where it fails if it is not valid. +`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. -#### utf8::replace_invalid -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +##### std::string replace_invalid(const std::string& s, utfchar32_t replacement) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Replaces all invalid UTF-8 sequences within a string with a replacement marker. 
```cpp -std::string replace_invalid(const std::string& s, char32_t replacement); +std::string replace_invalid(const std::string& s, utfchar32_t replacement); std::string replace_invalid(const std::string& s); ``` @@ -856,7 +1425,8 @@ const string fixed_invalid_sequence = "a????z"; assert (fixed_invalid_sequence == replace_invalid_result); ``` -#### utf8::replace_invalid + +##### std::string replace_invalid(std::string_view s, char32_t replacement) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. @@ -882,45 +1452,40 @@ const string fixed_invalid_sequence = "a????z"; assert(fixed_invalid_sequence, replace_invalid_result); ``` + +#### utf8::starts_with_bom + +##### bool starts_with_bom (octet_iterator it, octet_iterator end) -#### utf8::replace_invalid - -Available in version 2.0 and later. +Available in version 2.3 and later. -Replaces all invalid UTF-8 sequences within a string with a replacement marker. +Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM) ```cpp -template -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); -template -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); +template +bool starts_with_bom (octet_iterator it, octet_iterator end); ``` `octet_iterator`: an input iterator. -`output_iterator`: an output iterator. -`start`: an iterator pointing to the beginning of the UTF-8 string to look for invalid UTF-8 sequences. -`end`: an iterator pointing to pass-the-end of the UTF-8 string to look for invalid UTF-8 sequences. -`out`: An output iterator to the range where the result of replacement is stored. -`replacement`: A Unicode code point for the replacement marker. The version without this parameter assumes the value `0xfffd` -Return value: An iterator pointing to the place after the UTF-8 string with replaced invalid sequences. 
+`it`: beginning of the octet sequence to check +`end`: pass-end of the sequence to check +Return value: `true` if the sequence starts with a UTF-8 byte order mark; `false` if not. Example of use: ```cpp -char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; -vector replace_invalid_result; -replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), back_inserter(replace_invalid_result), '?'); -bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end()); -assert (bvalid); -char* fixed_invalid_sequence = "a????z"; -assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), fixed_invalid_sequence)); +unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; +bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); +assert (bbom == true); ``` -`replace_invalid` does not perform in-place replacement of invalid sequences. Rather, it produces a copy of the original string with the invalid sequences replaced with a replacement marker. Therefore, `out` must not be in the `[start, end]` range. +The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. -#### utf8::starts_with_bom -Available in version 3.0 and later. Requires a C++ 11 compliant compiler. + +##### bool starts_with_bom(const std::string& s) + +Available in version 3.0 and later. Prior to 4.0 it required a C++ 11 compiler; the requirement is lifted with 4.0 Checks whether a string starts with a UTF-8 byte order mark (BOM) @@ -945,7 +1510,8 @@ assert (no_bbom == false); The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. -#### utf8::starts_with_bom + +##### bool starts_with_bom(std::string_view s) Available in version 3.2 and later. Requires a C++ 17 compliant compiler. 
@@ -973,34 +1539,10 @@ assert (!no_bbom); The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. -#### utf8::starts_with_bom - -Available in version 2.3 and later. - -Checks whether an octet sequence starts with a UTF-8 byte order mark (BOM) - -```cpp -template -bool starts_with_bom (octet_iterator it, octet_iterator end); -``` - -`octet_iterator`: an input iterator. -`it`: beginning of the octet sequence to check -`end`: pass-end of the sequence to check -Return value: `true` if the sequence starts with a UTF-8 byte order mark; `false` if not. - -Example of use: - -```cpp -unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf}; -bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark)); -assert (bbom == true); -``` - -The typical use of this function is to check the first three bytes of a file. If they form the UTF-8 BOM, we want to skip them before processing the actual UTF-8 encoded text. - + ### Types From utf8 Namespace + #### utf8::exception Available in version 2.3 and later. @@ -1022,6 +1564,7 @@ catch(const utf8::exception& utfcpp_ex) { } ``` + #### utf8::invalid_code_point Available in version 1.0 and later. @@ -1031,12 +1574,13 @@ Thrown by UTF8 CPP functions such as `advance` and `next` if an UTF-8 sequence r ```cpp class invalid_code_point : public exception { public: - uint32_t code_point() const; + utfchar32_t code_point() const; }; ``` Member function `code_point()` can be used to determine the invalid code point that caused the exception to be thrown. + #### utf8::invalid_utf8 Available in version 1.0 and later. 
@@ -1046,12 +1590,13 @@ Thrown by UTF8 CPP functions such as `next` and `prior` if an invalid UTF-8 sequ ```cpp class invalid_utf8 : public exception { public: - uint8_t utf8_octet() const; + utfchar8_t utf8_octet() const; }; ``` Member function `utf8_octet()` can be used to determine the beginning of the byte sequence that caused the exception to be thrown. + #### utf8::invalid_utf16 Available in version 1.0 and later. @@ -1061,12 +1606,13 @@ Thrown by UTF8 CPP function `utf16to8` if an invalid UTF-16 sequence is detected ```cpp class invalid_utf16 : public exception { public: - uint16_t utf16_word() const; + utfchar16_t utf16_word() const; }; ``` Member function `utf16_word()` can be used to determine the UTF-16 code unit that caused the exception to be thrown. + #### utf8::not_enough_room Available in version 1.0 and later. @@ -1077,6 +1623,7 @@ Thrown by UTF8 CPP functions such as `next` if the end of the decoded UTF-8 sequ class not_enough_room : public exception {}; ``` + #### utf8::iterator Available in version 2.0 and later. @@ -1088,6 +1635,7 @@ template class iterator; ``` + ##### Member functions `iterator();` the deafult constructor; the underlying octet_iterator is constructed with its default constructor. @@ -1096,7 +1644,7 @@ class iterator; `octet_iterator base () const;` returns the underlying octet_iterator. -`uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. +`utfchar32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. `bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal. @@ -1140,8 +1688,10 @@ std::string s = "example"; utf8::iterator i (s.begin(), s.begin(), s.end()); ``` + ### Functions From utf8::unchecked Namespace + #### utf8::unchecked::append Available in version 1.0 and later. 
@@ -1150,7 +1700,7 @@ Encodes a 32 bit code point as a UTF-8 sequence of octets and appends the sequen ```cpp template -octet_iterator append(uint32_t cp, octet_iterator result); +octet_iterator append(utfchar32_t cp, octet_iterator result); ``` `cp`: A 32 bit integer representing a code point to append to the sequence. @@ -1167,6 +1717,35 @@ assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0); This is a faster but less safe version of `utf8::append`. It does not check for validity of the supplied code point, and may produce an invalid UTF-8 sequence. + +#### utf8::unchecked::append16 + +Available in version 4.0 and later. + +Encodes a 32 bit code point as a UTF-16 sequence of words and appends the sequence to a UTF-16 string. + +```cpp +template +word_iterator append16(utfchar32_t cp, word_iterator result); +``` + +`cp`: A 32 bit integer representing a code point to append to the sequence. +`result`: An output iterator to the place in the sequence where to append the code point. +Return value: An iterator pointing to the place after the newly appended sequence. + +Example of use: + +```cpp +unsigned short u[5] = {0,0}; +utf8::unchecked::append16(0x0448, u); +assert(u[0] == 0x0448); +assert(u[1] == 0x0000); +``` + +This is a faster but less safe version of `utf8::append16`. It does not check for validity of the supplied code point, and may produce an invalid UTF-16 sequence. + + + #### utf8::unchecked::next Available in version 1.0 and later. @@ -1175,7 +1754,7 @@ Given the iterator to the beginning of a UTF-8 sequence, it returns the code poi ```cpp template -uint32_t next(octet_iterator& it); +utfchar32_t next(octet_iterator& it); ``` `it`: a reference to an iterator pointing to the beginning of an UTF-8 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. @@ -1193,6 +1772,39 @@ assert (w == twochars + 3); This is a faster but less safe version of `utf8::next`. 
It does not check for validity of the supplied UTF-8 sequence. + +#### utf8::unchecked::next16 + +Available in version 4.0 and later. + +Given the iterator to the beginning of the UTF-16 sequence, it returns the code point and moves the iterator to the next position. + +```cpp +template +utfchar32_t next16(word_iterator& it); +``` + +`word_iterator`: an input iterator. +`it`: a reference to an iterator pointing to the beginning of a UTF-16 encoded code point. After the function returns, it is incremented to point to the beginning of the next code point. + +Return value: the 32 bit representation of the processed UTF-16 code point. + +Example of use: + +```cpp +const unsigned short u[3] = {0x65e5, 0xd800, 0xdf46}; +const unsigned short* w = u; +int cp = unchecked::next16(w); +assert (cp == 0x65e5); +assert (w == u + 1); +``` + +This function is typically used to iterate through a UTF-16 encoded string. + +This is a faster but less safe version of `utf8::next16`. It does not check for validity of the supplied UTF-16 sequence. + + + #### utf8::unchecked::peek_next Available in version 2.1 and later. @@ -1201,7 +1813,7 @@ Given the iterator to the beginning of a UTF-8 sequence, it returns the code poi ```cpp template -uint32_t peek_next(octet_iterator it); +utfchar32_t peek_next(octet_iterator it); ``` `it`: an iterator pointing to the beginning of an UTF-8 encoded code point. @@ -1219,6 +1831,7 @@ assert (w == twochars); This is a faster but less safe version of `utf8::peek_next`. It does not check for validity of the supplied UTF-8 sequence. + #### utf8::unchecked::prior Available in version 1.02 and later. @@ -1227,7 +1840,7 @@ Given a reference to an iterator pointing to an octet in a UTF-8 sequence, it de ```cpp template -uint32_t prior(octet_iterator& it); +utfchar32_t prior(octet_iterator& it); ``` `it`: a reference pointing to an octet within a UTF-8 encoded string. After the function returns, it is decremented to point to the beginning of the previous code point. 
@@ -1245,6 +1858,7 @@ assert (w == twochars); This is a faster but less safe version of `utf8::prior`. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking. + #### utf8::unchecked::advance Available in version 1.0 and later. @@ -1270,6 +1884,7 @@ assert (w == twochars + 5); This is a faster but less safe version of `utf8::advance`. It does not check for validity of the supplied UTF-8 sequence and offers no boundary checking. + #### utf8::unchecked::distance Available in version 1.0 and later. @@ -1295,6 +1910,7 @@ assert (dist == 2); This is a faster but less safe version of `utf8::distance`. It does not check for validity of the supplied UTF-8 sequence. + #### utf8::unchecked::utf16to8 Available in version 1.0 and later. @@ -1322,6 +1938,7 @@ assert (utf8result.size() == 10); This is a faster but less safe version of `utf8::utf16to8`. It does not check for validity of the supplied UTF-16 sequence. + #### utf8::unchecked::utf8to16 Available in version 1.0 and later. @@ -1350,6 +1967,7 @@ assert (utf16result[3] == 0xdd1e); This is a faster but less safe version of `utf8::utf8to16`. It does not check for validity of the supplied UTF-8 sequence. + #### utf8::unchecked::utf32to8 Available in version 1.0 and later. @@ -1377,6 +1995,7 @@ assert (utf8result.size() == 9); This is a faster but less safe version of `utf8::utf32to8`. It does not check for validity of the supplied UTF-32 sequence. + #### utf8::unchecked::utf8to32 Available in version 1.0 and later. @@ -1404,6 +2023,7 @@ assert (utf32result.size() == 2); This is a faster but less safe version of `utf8::utf8to32`. It does not check for validity of the supplied UTF-8 sequence. + #### utf8::unchecked::replace_invalid Available in version 3.1 and later. @@ -1412,7 +2032,7 @@ Replaces all invalid UTF-8 sequences within a string with a replacement marker. 
```cpp template <typename octet_iterator, typename output_iterator> -output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement); +output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement); template <typename octet_iterator, typename output_iterator> output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out); ``` @@ -1441,8 +2061,10 @@ assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.end(), Unlike `utf8::replace_invalid`, this function does not verify validity of the replacement marker. + ### Types From utf8::unchecked Namespace + #### utf8::iterator Available in version 2.0 and later. @@ -1454,6 +2076,7 @@ template <typename octet_iterator> class iterator; ``` + ##### Member functions `iterator();` the default constructor; the underlying octet_iterator is constructed with its default constructor. @@ -1462,7 +2085,7 @@ class iterator; `octet_iterator base () const;` returns the underlying octet_iterator. -`uint32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. +`utfchar32_t operator * () const;` decodes the utf-8 sequence the underlying octet_iterator is pointing to and returns the code point. `bool operator == (const iterator& rhs) const;` returns `true` if the two underlying iterators are equal. @@ -1499,9 +2122,3 @@ assert (*un_it == 0x10346); This is an unchecked version of `utf8::iterator`. It is faster in many cases, but offers no validity or range checks. -## Links - -1. [The Unicode Consortium](http://www.unicode.org/). -2. [ICU Library](http://icu.sourceforge.net/). -3. [UTF-8 at Wikipedia](http://en.wikipedia.org/wiki/UTF-8) -4.
[UTF-8 and Unicode FAQ for Unix/Linux](http://www.cl.cam.ac.uk/~mgk25/unicode.html) diff --git a/samples/docsample.cpp b/samples/docsample.cpp deleted file mode 100644 index 6533887..0000000 --- a/samples/docsample.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include "../source/utf8.h" -#include -#include -#include -#include - - -using namespace std; - -int main(int argc, char** argv) -{ - if (argc != 2) { - cout << "\nUsage: docsample filename\n"; - return 0; - } - const char* test_file_path = argv[1]; - // Open the test file (must be UTF-8 encoded) - ifstream fs8(test_file_path); - if (!fs8.is_open()) { - cout << "Could not open " << test_file_path << endl; - return 0; - } - - unsigned line_count = 1; - string line; - // Play with all the lines in the file - while (getline(fs8, line)) { - // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function) -#if __cplusplus >= 201103L // C++ 11 or later - auto end_it = utf8::find_invalid(line.begin(), line.end()); -#else - string::iterator end_it = utf8::find_invalid(line.begin(), line.end()); -#endif // C++ 11 - if (end_it != line.end()) { - cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n"; - cout << "This part is fine: " << string(line.begin(), end_it) << "\n"; - } - // Get the line length (at least for the valid part) - ptrdiff_t length = utf8::distance(line.begin(), end_it); - cout << "Length of line " << line_count << " is " << length << "\n"; - - // Convert it to utf-16 -#if __cplusplus >= 201103L // C++ 11 or later - u16string utf16line = utf8::utf8to16(line); -#else - vector utf16line; - utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line)); -#endif // C++ 11 - // And back to utf-8; -#if __cplusplus >= 201103L // C++ 11 or later - string utf8line = utf8::utf16to8(utf16line); -#else - string utf8line; - utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line)); -#endif // C++ 11 - // Confirm that the conversion went OK: - if (utf8line != 
string(line.begin(), end_it)) - cout << "Error in UTF-16 conversion at line: " << line_count << "\n"; - - line_count++; - } - - return 0; -} diff --git a/source/utf8.h b/source/utf8.h index 82b13f5..b513530 100644 --- a/source/utf8.h +++ b/source/utf8.h @@ -28,6 +28,18 @@ DEALINGS IN THE SOFTWARE. #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +/* +To control the C++ language version used by the library, you can define UTF_CPP_CPLUSPLUS macro +and set it to one of the values used by the __cplusplus predefined macro. + +For instance, + #define UTF_CPP_CPLUSPLUS 199711L +will cause the UTF-8 CPP library to use only types and language features available in the C++ 98 standard. +Some library features will be disabled. + +If you leave UTF_CPP_CPLUSPLUS undefined, it will be internally assigned to __cplusplus. +*/ + #include "utf8/checked.h" #include "utf8/unchecked.h" diff --git a/source/utf8/checked.h b/source/utf8/checked.h index 512dcc2..98949f8 100644 --- a/source/utf8/checked.h +++ b/source/utf8/checked.h @@ -39,28 +39,28 @@ namespace utf8 // Exceptions that may be thrown from the library functions. 
class invalid_code_point : public exception { - uint32_t cp; + utfchar32_t cp; public: - invalid_code_point(uint32_t codepoint) : cp(codepoint) {} + invalid_code_point(utfchar32_t codepoint) : cp(codepoint) {} virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } - uint32_t code_point() const {return cp;} + utfchar32_t code_point() const {return cp;} }; class invalid_utf8 : public exception { - uint8_t u8; + utfchar8_t u8; public: - invalid_utf8 (uint8_t u) : u8(u) {} - invalid_utf8 (char c) : u8(static_cast(c)) {} + invalid_utf8 (utfchar8_t u) : u8(u) {} + invalid_utf8 (char c) : u8(static_cast(c)) {} virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} + utfchar8_t utf8_octet() const {return u8;} }; class invalid_utf16 : public exception { - uint16_t u16; + utfchar16_t u16; public: - invalid_utf16 (uint16_t u) : u16(u) {} + invalid_utf16 (utfchar16_t u) : u16(u) {} virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} + utfchar16_t utf16_word() const {return u16;} }; class not_enough_room : public exception { @@ -71,7 +71,7 @@ namespace utf8 /// The library API - functions intended to be called by the users template - octet_iterator append(uint32_t cp, octet_iterator result) + octet_iterator append(utfchar32_t cp, octet_iterator result) { if (!utf8::internal::is_code_point_valid(cp)) throw invalid_code_point(cp); @@ -79,8 +79,22 @@ namespace utf8 return internal::append(cp, result); } + inline void append(utfchar32_t cp, std::string& s) + { + append(cp, std::back_inserter(s)); + } + + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + return internal::append16(cp, result); + } + template - output_iterator replace_invalid(octet_iterator start, 
octet_iterator end, output_iterator out, uint32_t replacement) + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) { while (start != end) { octet_iterator sequence_start = start; @@ -115,14 +129,28 @@ namespace utf8 template inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + static const utfchar32_t replacement_marker = utf8::internal::mask16(0xfffd); return utf8::replace_invalid(start, end, out, replacement_marker); } + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + template - uint32_t next(octet_iterator& it, octet_iterator end) + utfchar32_t next(octet_iterator& it, octet_iterator end) { - uint32_t cp = 0; + utfchar32_t cp = 0; internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); switch (err_code) { case internal::UTF8_OK : @@ -132,21 +160,31 @@ namespace utf8 case internal::INVALID_LEAD : case internal::INCOMPLETE_SEQUENCE : case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(static_cast(*it)); + throw invalid_utf8(static_cast(*it)); case internal::INVALID_CODE_POINT : throw invalid_code_point(cp); } return cp; } + template + utfchar32_t next16(word_iterator& it, word_iterator end) + { + utfchar32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next16(it, end, cp); + if (err_code == internal::NOT_ENOUGH_ROOM) + throw not_enough_room(); + return cp; + } + template - uint32_t peek_next(octet_iterator it, octet_iterator end) + utfchar32_t peek_next(octet_iterator it, octet_iterator end) { 
return utf8::next(it, end); } template - uint32_t prior(octet_iterator& it, octet_iterator start) + utfchar32_t prior(octet_iterator& it, octet_iterator start) { // can't do much if it == start if (it == start) @@ -189,23 +227,23 @@ namespace utf8 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) { while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); + utfchar32_t cp = utf8::internal::mask16(*start++); // Take care of surrogate pairs first if (utf8::internal::is_lead_surrogate(cp)) { if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); + const utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); if (utf8::internal::is_trail_surrogate(trail_surrogate)) cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; else - throw invalid_utf16(static_cast(trail_surrogate)); + throw invalid_utf16(static_cast(trail_surrogate)); } else - throw invalid_utf16(static_cast(cp)); + throw invalid_utf16(static_cast(cp)); } // Lone trail surrogate else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast(cp)); + throw invalid_utf16(static_cast(cp)); result = utf8::append(cp, result); } @@ -216,13 +254,13 @@ namespace utf8 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) { while (start < end) { - uint32_t cp = utf8::next(start, end); + const utfchar32_t cp = utf8::next(start, end); if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); } else - *result++ = static_cast(cp); + *result++ = static_cast(cp); } return result; } @@ -252,9 +290,9 @@ namespace utf8 octet_iterator range_start; octet_iterator range_end; public: - typedef uint32_t value_type; - typedef 
uint32_t* pointer; - typedef uint32_t& reference; + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; typedef std::ptrdiff_t difference_type; typedef std::bidirectional_iterator_tag iterator_category; iterator () {} @@ -268,7 +306,7 @@ namespace utf8 } // the default "big three" are OK octet_iterator base () const { return it; } - uint32_t operator * () const + utfchar32_t operator * () const { octet_iterator temp = it; return utf8::next(temp, range_end); @@ -309,7 +347,9 @@ namespace utf8 } // namespace utf8 -#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later +#include "cpp20.h" +#elif UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later #include "cpp17.h" #elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later #include "cpp11.h" diff --git a/source/utf8/core.h b/source/utf8/core.h index 34371ee..4494c53 100644 --- a/source/utf8/core.h +++ b/source/utf8/core.h @@ -29,6 +29,8 @@ DEALINGS IN THE SOFTWARE. #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 #include +#include +#include // Determine the C++ standard version. // If the user defines UTF_CPP_CPLUSPLUS, use that. @@ -49,12 +51,20 @@ DEALINGS IN THE SOFTWARE. namespace utf8 { - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. 
- // These typedefs have the same names as ones from cstdint, or boost/cstdint - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; +// The typedefs for 8-bit, 16-bit and 32-bit code units +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #if UTF_CPP_CPLUSPLUS >= 202002L // C++ 20 or later + typedef char8_t utfchar8_t; + #else // C++ 11/14/17 + typedef unsigned char utfchar8_t; + #endif + typedef char16_t utfchar16_t; + typedef char32_t utfchar32_t; +#else // C++ 98/03 + typedef unsigned char utfchar8_t; + typedef unsigned short utfchar16_t; + typedef unsigned int utfchar32_t; +#endif // C++ 11 or later // Helper code - not intended to be directly called by the library users. May be changed at any time namespace internal @@ -62,61 +72,62 @@ namespace internal // Unicode constants // Leading (high) surrogates: 0xd800 - 0xdbff // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) - const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN + const utfchar16_t LEAD_SURROGATE_MIN = 0xd800u; + const utfchar16_t LEAD_SURROGATE_MAX = 0xdbffu; + const utfchar16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const utfchar16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const utfchar16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const utfchar32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; + const utfchar32_t CODE_POINT_MAX = 0x0010ffffu; template - inline uint8_t mask8(octet_type oc) + inline utfchar8_t mask8(octet_type oc) { - return static_cast(0xff & oc); + return 
static_cast(0xff & oc); } template - inline uint16_t mask16(u16_type oc) + inline utfchar16_t mask16(u16_type oc) { - return static_cast(0xffff & oc); + return static_cast(0xffff & oc); } + template inline bool is_trail(octet_type oc) { return ((utf8::internal::mask8(oc) >> 6) == 0x2); } - template - inline bool is_lead_surrogate(u16 cp) + inline bool is_lead_surrogate(utfchar32_t cp) { return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); } - template - inline bool is_trail_surrogate(u16 cp) + inline bool is_trail_surrogate(utfchar32_t cp) { return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); } - template - inline bool is_surrogate(u16 cp) + inline bool is_surrogate(utfchar32_t cp) { return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); } - template - inline bool is_code_point_valid(u32 cp) + inline bool is_code_point_valid(utfchar32_t cp) { return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); } + inline bool is_in_bmp(utfchar32_t cp) + { + return cp < utfchar32_t(0x10000); + } + template - inline typename std::iterator_traits::difference_type - sequence_length(octet_iterator lead_it) + int sequence_length(octet_iterator lead_it) { - uint8_t lead = utf8::internal::mask8(*lead_it); + const utfchar8_t lead = utf8::internal::mask8(*lead_it); if (lead < 0x80) return 1; else if ((lead >> 5) == 0x6) @@ -129,8 +140,7 @@ namespace internal return 0; } - template - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + inline bool is_overlong_sequence(utfchar32_t cp, int length) { if (cp < 0x80) { if (length != 1) @@ -144,7 +154,6 @@ namespace internal if (length != 3) return true; } - return false; } @@ -152,7 +161,7 @@ namespace internal /// Helper for get_sequence_x template - utf_error increase_safely(octet_iterator& it, octet_iterator end) + utf_error increase_safely(octet_iterator& it, const octet_iterator end) { if (++it == end) return NOT_ENOUGH_ROOM; @@ -163,11 +172,11 @@ namespace internal 
return UTF8_OK; } - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} /// get_sequence_x functions decode utf-8 sequences of the length x template - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -178,7 +187,7 @@ namespace internal } template - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -193,7 +202,7 @@ namespace internal } template - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -212,7 +221,7 @@ namespace internal } template - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -237,7 +246,7 @@ namespace internal #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR template - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + utf_error validate_next(octet_iterator& it, octet_iterator end, utfchar32_t& code_point) { if (it == end) return NOT_ENOUGH_ROOM; @@ -246,10 +255,9 @@ namespace internal // Of course, it does not make much sense with i.e. 
stream iterators octet_iterator original_it = it; - uint32_t cp = 0; + utfchar32_t cp = 0; // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); + const int length = utf8::internal::sequence_length(it); // Get trail octets and calculate the code point utf_error err = UTF8_OK; @@ -293,15 +301,51 @@ namespace internal template inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; + utfchar32_t ignored; return utf8::internal::validate_next(it, end, ignored); } + template + utf_error validate_next16(word_iterator& it, word_iterator end, utfchar32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + word_iterator original_it = it; + + utf_error err = UTF8_OK; + + const utfchar16_t first_word = *it++; + if (!is_surrogate(first_word)) { + code_point = first_word; + return UTF8_OK; + } + else { + if (it == end) + err = NOT_ENOUGH_ROOM; + else if (is_lead_surrogate(first_word)) { + const utfchar16_t second_word = *it++; + if (is_trail_surrogate(second_word)) { + code_point = (first_word << 10) + second_word + SURROGATE_OFFSET; + return UTF8_OK; + } else + err = INCOMPLETE_SEQUENCE; + + } else { + err = INVALID_LEAD; + } + } + // error branch + it = original_it; + return err; + } + // Internal implementation of both checked and unchecked append() function // This function will be invoked by the overloads below, as they will know // the octet_type. 
template - octet_iterator append(uint32_t cp, octet_iterator result) { + octet_iterator append(utfchar32_t cp, octet_iterator result) { if (cp < 0x80) // one octet *(result++) = static_cast(cp); else if (cp < 0x800) { // two octets @@ -325,7 +369,7 @@ namespace internal // One of the following overloads will be invoked from the API calls // A simple (but dangerous) case: the caller appends byte(s) to a char array - inline char* append(uint32_t cp, char* result) { + inline char* append(utfchar32_t cp, char* result) { return append(cp, result); } @@ -333,17 +377,49 @@ namespace internal // i.e. append(cp, std::back_inserter(str)); template std::back_insert_iterator append - (uint32_t cp, std::back_insert_iterator result) { + (utfchar32_t cp, std::back_insert_iterator result) { return append, typename container_type::value_type>(cp, result); } // The caller uses some other kind of output operator - not covered above // Note that in this case we are not able to determine octet_type - // so we assume it's uint_8; that can cause a conversion warning if we are wrong. + // so we assume it's utfchar8_t; that can cause a conversion warning if we are wrong. template - octet_iterator append(uint32_t cp, octet_iterator result) { - return append(cp, result); + octet_iterator append(utfchar32_t cp, octet_iterator result) { + return append(cp, result); + } + + // Internal implementation of both checked and unchecked append16() function + // This function will be invoked by the overloads below, as they will know + // the word_type. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + if (is_in_bmp(cp)) + *(result++) = static_cast(cp); + else { + // Code points from the supplementary planes are encoded via surrogate pairs + *(result++) = static_cast(LEAD_OFFSET + (cp >> 10)); + *(result++) = static_cast(TRAIL_SURROGATE_MIN + (cp & 0x3FF)); + } + return result; + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. 
append16(cp, std::back_inserter(str)); + template + std::back_insert_iterator append16 + (utfchar32_t cp, std::back_insert_iterator result) { + return append16, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine word_type + // so we assume it's utfchar16_t; that can cause a conversion warning if we are wrong. + template + word_iterator append16(utfchar32_t cp, word_iterator result) { + return append16(cp, result); } } // namespace internal @@ -351,7 +427,7 @@ namespace internal /// The library API - functions intended to be called by the users // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + const utfchar8_t bom[] = {0xef, 0xbb, 0xbf}; template octet_iterator find_invalid(octet_iterator start, octet_iterator end) @@ -365,12 +441,36 @@ namespace internal return result; } + inline const char* find_invalid(const char* str) + { + const char* end = str + std::strlen(str); + return find_invalid(str, end); + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? 
std::string::npos : static_cast(invalid - s.begin()); + } + template inline bool is_valid(octet_iterator start, octet_iterator end) { return (utf8::find_invalid(start, end) == end); } + inline bool is_valid(const char* str) + { + return (*(utf8::find_invalid(str)) == '\0'); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + + template inline bool starts_with_bom (octet_iterator it, octet_iterator end) { @@ -379,7 +479,12 @@ namespace internal ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) ); - } + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } } // namespace utf8 #endif // header guard diff --git a/source/utf8/cpp11.h b/source/utf8/cpp11.h index 2366f12..691633c 100644 --- a/source/utf8/cpp11.h +++ b/source/utf8/cpp11.h @@ -29,14 +29,12 @@ DEALINGS IN THE SOFTWARE. #define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 #include "checked.h" -#include namespace utf8 { - - inline void append(char32_t cp, std::string& s) + inline void append16(utfchar32_t cp, std::u16string& s) { - append(uint32_t(cp), std::back_inserter(s)); + append16(cp, std::back_inserter(s)); } inline std::string utf16to8(const std::u16string& s) @@ -66,37 +64,6 @@ namespace utf8 utf8to32(s.begin(), s.end(), std::back_inserter(result)); return result; } - - inline std::size_t find_invalid(const std::string& s) - { - std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); - return (invalid == s.end()) ? 
std::string::npos : static_cast(invalid - s.begin()); - } - - inline bool is_valid(const std::string& s) - { - return is_valid(s.begin(), s.end()); - } - - inline std::string replace_invalid(const std::string& s, char32_t replacement) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); - return result; - } - - inline std::string replace_invalid(const std::string& s) - { - std::string result; - replace_invalid(s.begin(), s.end(), std::back_inserter(result)); - return result; - } - - inline bool starts_with_bom(const std::string& s) - { - return starts_with_bom(s.begin(), s.end()); - } - } // namespace utf8 #endif // header guard diff --git a/source/utf8/cpp17.h b/source/utf8/cpp17.h index 32a77ce..6e2fcc2 100644 --- a/source/utf8/cpp17.h +++ b/source/utf8/cpp17.h @@ -28,17 +28,10 @@ DEALINGS IN THE SOFTWARE. #ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 #define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 -#include "checked.h" -#include +#include "cpp11.h" namespace utf8 { - - inline void append(char32_t cp, std::string& s) - { - append(uint32_t(cp), std::back_inserter(s)); - } - inline std::string utf16to8(std::u16string_view s) { std::string result; diff --git a/source/utf8/cpp20.h b/source/utf8/cpp20.h new file mode 100644 index 0000000..07b61d0 --- /dev/null +++ b/source/utf8/cpp20.h @@ -0,0 +1,124 @@ +// Copyright 2022 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following 
disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_207e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "cpp17.h" + +namespace utf8 +{ + inline std::u8string utf16tou8(const std::u16string& s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf16tou8(std::u16string_view s) + { + std::u8string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::u8string_view& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u8string utf32tou8(const std::u32string_view& s) + { + std::u8string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string& s) 
+ { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::u8string_view& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::u8string& s) + { + std::u8string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast(invalid - s.begin()); + } + + inline bool is_valid(const std::u8string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::u8string replace_invalid(const std::u8string& s, char32_t replacement) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::u8string replace_invalid(const std::u8string& s) + { + std::u8string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::u8string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/source/utf8/unchecked.h b/source/utf8/unchecked.h index 7981839..6f928b7 100644 --- a/source/utf8/unchecked.h +++ b/source/utf8/unchecked.h @@ -35,13 +35,19 @@ namespace utf8 namespace unchecked { template - octet_iterator append(uint32_t cp, octet_iterator result) + octet_iterator append(utfchar32_t cp, octet_iterator result) { return internal::append(cp, result); } + template + word_iterator append16(utfchar32_t cp, word_iterator result) + { + return internal::append16(cp, result); + } + template - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, utfchar32_t replacement) { while (start != end) { octet_iterator sequence_start = start; @@ 
-52,17 +58,17 @@ namespace utf8 *out++ = *it; break; case internal::NOT_ENOUGH_ROOM: - out = utf8::unchecked::append (replacement, out); + out = utf8::unchecked::append(replacement, out); start = end; break; case internal::INVALID_LEAD: - out = utf8::unchecked::append (replacement, out); + out = utf8::unchecked::append(replacement, out); ++start; break; case internal::INCOMPLETE_SEQUENCE: case internal::OVERLONG_SEQUENCE: case internal::INVALID_CODE_POINT: - out = utf8::unchecked::append (replacement, out); + out = utf8::unchecked::append(replacement, out); ++start; // just one replacement mark for the sequence while (start != end && utf8::internal::is_trail(*start)) @@ -76,16 +82,29 @@ namespace utf8 template inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + static const utfchar32_t replacement_marker = utf8::internal::mask16(0xfffd); return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); } + inline std::string replace_invalid(const std::string& s, utfchar32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + template - uint32_t next(octet_iterator& it) + utfchar32_t next(octet_iterator& it) { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); - switch (length) { + utfchar32_t cp = utf8::internal::mask8(*it); + switch (utf8::internal::sequence_length(it)) { case 1: break; case 2: @@ -112,13 +131,22 @@ namespace utf8 } template - uint32_t peek_next(octet_iterator it) + utfchar32_t peek_next(octet_iterator it) { return utf8::unchecked::next(it); } + template + 
utfchar32_t next16(word_iterator& it) + { + utfchar32_t cp = utf8::internal::mask16(*it++); + if (utf8::internal::is_lead_surrogate(cp)) + return (cp << 10) + *it++ + utf8::internal::SURROGATE_OFFSET; + return cp; + } + template - uint32_t prior(octet_iterator& it) + utfchar32_t prior(octet_iterator& it) { while (utf8::internal::is_trail(*(--it))) ; octet_iterator temp = it; @@ -126,7 +154,7 @@ namespace utf8 } template - void advance (octet_iterator& it, distance_type n) + void advance(octet_iterator& it, distance_type n) { const distance_type zero(0); if (n < zero) { @@ -142,7 +170,7 @@ namespace utf8 template typename std::iterator_traits::difference_type - distance (octet_iterator first, octet_iterator last) + distance(octet_iterator first, octet_iterator last) { typename std::iterator_traits::difference_type dist; for (dist = 0; first < last; ++dist) @@ -151,15 +179,15 @@ namespace utf8 } template - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + octet_iterator utf16to8(u16bit_iterator start, u16bit_iterator end, octet_iterator result) { while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); + utfchar32_t cp = utf8::internal::mask16(*start++); if (start == end) return result; // Take care of surrogate pairs first if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); + utfchar32_t trail_surrogate = utf8::internal::mask16(*start++); cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; } result = utf8::unchecked::append(cp, result); @@ -168,22 +196,22 @@ namespace utf8 } template - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + u16bit_iterator utf8to16(octet_iterator start, octet_iterator end, u16bit_iterator result) { while (start < end) { - uint32_t cp = utf8::unchecked::next(start); + utfchar32_t cp = utf8::unchecked::next(start); if (cp > 0xffff) { //make a surrogate pair - *result++ = 
static_cast((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); } else - *result++ = static_cast(cp); + *result++ = static_cast(cp); } return result; } template - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + octet_iterator utf32to8(u32bit_iterator start, u32bit_iterator end, octet_iterator result) { while (start != end) result = utf8::unchecked::append(*(start++), result); @@ -192,7 +220,7 @@ namespace utf8 } template - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + u32bit_iterator utf8to32(octet_iterator start, octet_iterator end, u32bit_iterator result) { while (start < end) (*result++) = utf8::unchecked::next(start); @@ -205,16 +233,16 @@ namespace utf8 class iterator { octet_iterator it; public: - typedef uint32_t value_type; - typedef uint32_t* pointer; - typedef uint32_t& reference; + typedef utfchar32_t value_type; + typedef utfchar32_t* pointer; + typedef utfchar32_t& reference; typedef std::ptrdiff_t difference_type; typedef std::bidirectional_iterator_tag iterator_category; iterator () {} explicit iterator (const octet_iterator& octet_it): it(octet_it) {} // the default "big three" are OK octet_iterator base () const { return it; } - uint32_t operator * () const + utfchar32_t operator * () const { octet_iterator temp = it; return utf8::unchecked::next(temp); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f3ce258..8a00a6a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,15 +1,20 @@ -add_executable(negative ${PROJECT_SOURCE_DIR}/tests/negative.cpp) -add_executable(cpp11 ${PROJECT_SOURCE_DIR}/tests/test_cpp11.cpp) -add_executable(cpp17 ${PROJECT_SOURCE_DIR}/tests/test_cpp17.cpp) -add_executable(apitests 
${PROJECT_SOURCE_DIR}/tests/apitests.cpp) +cmake_minimum_required (VERSION 3.5) +project(utfcpptests LANGUAGES CXX) +enable_testing() + +add_library(${PROJECT_NAME} INTERFACE) + +include_directories("${PROJECT_SOURCE_DIR}/../source") + +add_executable(negative negative.cpp) +add_executable(cpp11 test_cpp11.cpp) +add_executable(cpp17 test_cpp17.cpp) +add_executable(cpp20 test_cpp20.cpp) +add_executable(apitests apitests.cpp) + +add_executable(noexceptionstests noexceptionstests.cpp) -add_executable(noexceptionstests ${PROJECT_SOURCE_DIR}/tests/noexceptionstests.cpp) -target_link_libraries(negative PRIVATE utf8::cpp) -target_link_libraries(cpp11 PRIVATE utf8::cpp) -target_link_libraries(cpp17 PRIVATE utf8::cpp) -target_link_libraries(apitests PRIVATE utf8::cpp) -target_link_libraries(noexceptionstests PRIVATE utf8::cpp) target_compile_options(${PROJECT_NAME} INTERFACE $<$:/W4> @@ -35,9 +40,17 @@ set_target_properties(cpp17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) -add_test(negative_test negative ${PROJECT_SOURCE_DIR}/tests/test_data/utf8_invalid.txt) +set_target_properties(cpp20 + PROPERTIES + CXX_STANDARD 20 + CXX_STANDARD_REQUIRED YES + CXX_EXTENSIONS NO) + + +add_test(negative_test negative ${PROJECT_SOURCE_DIR}/test_data/utf8_invalid.txt) add_test(cpp11_test cpp11) add_test(cpp17_test cpp17) +add_test(cpp20_test cpp20) add_test(api_test apitests) add_test(noexceptions_test noexceptionstests) diff --git a/tests/docker/Dockerfile b/tests/docker/Dockerfile index 9df3717..dcdd47d 100644 --- a/tests/docker/Dockerfile +++ b/tests/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM debian:buster-slim +FROM gcc:12.2 RUN apt-get update \ && apt-get install -y make g++ cmake git \ diff --git a/tests/test_checked_api.h b/tests/test_checked_api.h index 3a7067b..54e9cf8 100644 --- a/tests/test_checked_api.h +++ b/tests/test_checked_api.h @@ -47,6 +47,22 @@ TEST(CheckedAPITests, test_append) EXPECT_EQ (c[1], 0); } +TEST(CheckedAPITests, test_append16) +{ + utfchar16_t u[5] = {0,0}; + 
append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + TEST(CheckedAPITests, test_next) { const char* twochars = "\xe6\x97\xa5\xd1\x88"; @@ -71,6 +87,19 @@ TEST(CheckedAPITests, test_next) EXPECT_EQ (w, threechars + 9); } +TEST(CheckedAPITests, test_next16) +{ + const utfchar16_t u[3] = {0x65e5, 0xd800, 0xdf46}; + const utfchar16_t* w = u; + utf8::utfchar32_t cp = next16(w, w + 3); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, u + 1); + + cp = next16(w, w + 2); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, u + 3); +} + TEST(CheckedAPITests, test_peek_next) { const char* const cw = "\xe6\x97\xa5\xd1\x88"; @@ -171,7 +200,9 @@ TEST(CheckedAPITests, test_replace_invalid) TEST(CheckedAPITests, test_find_invalid) { char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; - char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + const char* invalid = find_invalid(utf_invalid, utf_invalid + 6); + EXPECT_EQ (invalid, utf_invalid + 5); + invalid = find_invalid(utf_invalid); EXPECT_EQ (invalid, utf_invalid + 5); } @@ -180,9 +211,13 @@ TEST(CheckedAPITests, test_is_valid) char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa"; bool bvalid = is_valid(utf_invalid, utf_invalid + 6); EXPECT_FALSE (bvalid); + bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9); EXPECT_TRUE (bvalid); + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); } TEST(CheckedAPITests, test_starts_with_bom) diff --git a/tests/test_cpp11.cpp b/tests/test_cpp11.cpp index ee4ddd8..ee3518a 100644 --- a/tests/test_cpp11.cpp +++ b/tests/test_cpp11.cpp @@ -37,6 +37,14 @@ TEST(CPP11APITests, test_append) EXPECT_EQ (u.length(), 4); } +TEST(CPP11APITests, test_append16) +{ + u16string u; + 
append16(0x0448, u); + EXPECT_EQ (u[0], char16_t(0x0448)); + EXPECT_EQ (u.length(), 1); +} + TEST(CPP11APITests, test_utf16to8) { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; diff --git a/tests/test_cpp17.cpp b/tests/test_cpp17.cpp index 4b87816..a38e6f7 100644 --- a/tests/test_cpp17.cpp +++ b/tests/test_cpp17.cpp @@ -10,8 +10,8 @@ using namespace std; TEST(CPP17APITests, test_utf16to8) { u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; - u16string_view utf16stringview(u16string); - string u = utf16to8(utf16string); + u16string_view utf16stringview(utf16string); + string u = utf16to8(utf16stringview); EXPECT_EQ (u.size(), 10); } diff --git a/tests/test_cpp20.cpp b/tests/test_cpp20.cpp new file mode 100644 index 0000000..50dbe30 --- /dev/null +++ b/tests/test_cpp20.cpp @@ -0,0 +1,77 @@ +#include "../extern/ftest/ftest.h" +#define UTF_CPP_CPLUSPLUS 202002L +#include "utf8.h" +#include +using namespace utf8; +using namespace std; + +TEST(CPP20APITests, test_utf16tou8) +{ + u16string utf16string = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e}; + u16string_view utf16stringview{utf16string}; + u8string u = utf16tou8(utf16string); + EXPECT_EQ (u.size(), 10); + u = utf16tou8(utf16stringview); + EXPECT_EQ (u.size(), 10); +} + +TEST(CPP20APITests, tes20t_utf8to16) +{ + u8string utf8_with_surrogates{u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"}; + u16string utf16result = utf8to16(utf8_with_surrogates); + EXPECT_EQ (utf16result.size(), 4); + EXPECT_EQ (utf16result[2], 0xd834); + EXPECT_EQ (utf16result[3], 0xdd1e); +} + +TEST(CPP20APITests, test_utf32tou8) +{ + u32string utf32string = {0x448, 0x65E5, 0x10346}; + u32string_view utf32stringview{utf32string}; + u8string utf8result = utf32tou8(utf32stringview); + EXPECT_EQ (utf8result.size(), 9); +} + +TEST(CPP20APITests, test_utf8to32) +{ + u8string twochars = u8"\xe6\x97\xa5\xd1\x88"; + u32string utf32result = utf8to32(twochars); + EXPECT_EQ (utf32result.size(), 2); +} + +TEST(CPP20APITests, 
test_find_invalid) +{ + u8string utf_invalid = u8"\xe6\x97\xa5\xd1\x88\xfa"; + auto invalid = find_invalid(utf_invalid); + EXPECT_EQ (invalid, 5); +} + +TEST(CPP20APITests, test_is_valid) +{ + u8string utf_invalid = u8"\xe6\x97\xa5\xd1\x88\xfa"; + bool bvalid = is_valid(utf_invalid); + EXPECT_FALSE (bvalid); + u8string utf8_with_surrogates = u8"\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e"; + bvalid = is_valid(utf8_with_surrogates); + EXPECT_TRUE (bvalid); +} + +TEST(CPP20APITests, test_replace_invalid) +{ + u8string invalid_sequence = u8"a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z"; + u8string replace_invalid_result = replace_invalid(invalid_sequence, u8'?'); + bool bvalid = is_valid(replace_invalid_result); + EXPECT_TRUE (bvalid); + const u8string fixed_invalid_sequence = u8"a????z"; + EXPECT_EQ(fixed_invalid_sequence, replace_invalid_result); +} + +TEST(CPP20APITests, test_starts_with_bom) +{ + u8string byte_order_mark = u8"\xef\xbb\xbf"; + bool bbom = starts_with_bom(byte_order_mark); + EXPECT_TRUE (bbom); + u8string threechars = u8"\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88"; + bool no_bbom = starts_with_bom(threechars); + EXPECT_FALSE (no_bbom); +} diff --git a/tests/test_unchecked_api.h b/tests/test_unchecked_api.h index 66d0400..aa0cf69 100644 --- a/tests/test_unchecked_api.h +++ b/tests/test_unchecked_api.h @@ -40,6 +40,22 @@ TEST(UnCheckedAPITests, test_append) EXPECT_EQ (u[4], 0); } +TEST(UnCheckedAPITests, test_append16) +{ + unsigned short u[5] = {0,0}; + utf8::unchecked::append16(0x0448, u); + EXPECT_EQ (u[0], 0x0448); + EXPECT_EQ (u[1], 0x0000); + + utf8::unchecked::append16(0x65e5, u); + EXPECT_EQ (u[0], 0x65e5); + EXPECT_EQ (u[1], 0x0000); + + utf8::unchecked::append16(0x10346, u); + EXPECT_EQ (u[0], 0xd800); + EXPECT_EQ (u[1], 0xdf46); +} + TEST(UnCheckedAPITests, test_next) { const char* twochars = "\xe6\x97\xa5\xd1\x88"; @@ -64,6 +80,19 @@ TEST(UnCheckedAPITests, test_next) EXPECT_EQ (w, threechars + 9); } +TEST(UnCheckedAPITests, test_next16) +{ + const 
utf8::utfchar16_t u[3] = {0x65e5, 0xd800, 0xdf46}; + const utf8::utfchar16_t* w = u; + utf8::utfchar32_t cp = utf8::unchecked::next16(w); + EXPECT_EQ (cp, 0x65e5); + EXPECT_EQ (w, u + 1); + + cp = utf8::unchecked::next16(w); + EXPECT_EQ (cp, 0x10346); + EXPECT_EQ (w, u + 3); +} + TEST(UnCheckedAPITests, test_peek_next) { const char* const cw = "\xe6\x97\xa5\xd1\x88"; diff --git a/utf8cppConfig.cmake.in b/utf8cppConfig.cmake.in index fd3480b..9c15f36 100644 --- a/utf8cppConfig.cmake.in +++ b/utf8cppConfig.cmake.in @@ -1,8 +1,4 @@ @PACKAGE_INIT@ -include("${CMAKE_CURRENT_LIST_DIR}/utf8cppTargets.cmake") -check_required_components( "utf8cpp" ) - -if(NOT TARGET utf8::cpp) - add_library(utf8::cpp ALIAS utf8cpp) -endif() +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") +check_required_components("@PROJECT_NAME@")