diff --git a/CMakeLists.txt b/CMakeLists.txt index d70c8ba..7999803 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,30 +113,9 @@ if (USE_BOOST_OPTIONAL) add_definitions("-DJSONV_OPTIONAL_USE_BOOST=1") endif() -######################## -# Locale Configuration # -######################## - -if (CMAKE_COMPILER_IS_GNUCC) - if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.0") - set(USE_BOOST_LOCALE_DEFAULT ON) - else() - set(USE_BOOST_LOCALE_DEFAULT OFF) - endif() -else() - set(USE_BOOST_LOCALE_DEFAULT OFF) -endif() - -option(USE_BOOST_LOCALE - "Should Boost.Locale be used for character encoding conversions (as opposed to the standard library)?" - ${USE_BOOST_LOCALE_DEFAULT} - ) -if (USE_BOOST_LOCALE) - add_definitions("-DJSONV_CHAR_CONVERT_USE_BOOST_LOCALE=1") - list(APPEND REQUIRED_BOOST_LIBRARIES "locale") -else() - add_definitions("-DJSONV_CHAR_CONVERT_USE_BOOST_LOCALE=0") -endif() +########################## +# Coverage Configuration # +########################## option(COVERAGE "Enable building with code coverage." diff --git a/include/jsonv/value.hpp b/include/jsonv/value.hpp index 5e97f82..22739e4 100644 --- a/include/jsonv/value.hpp +++ b/include/jsonv/value.hpp @@ -451,9 +451,8 @@ class JSONV_PUBLIC value **/ value(const char* value); - /** Create a \c kind::string with the given \a value. Keep in mind that it will be converted to and stored as a - * UTF-8 encoded string. - **/ + /// Create a \c kind::string with the given \a value. Keep in mind that it will be converted to and stored as a + /// UTF-8 encoded string. value(const std::wstring& value); /** Create a \c kind::string with the given \a value. Keep in mind that it will be converted to and stored as a diff --git a/src/jsonv/char_convert.cpp b/src/jsonv/char_convert.cpp index 5302287..5b80dd2 100644 --- a/src/jsonv/char_convert.cpp +++ b/src/jsonv/char_convert.cpp @@ -1,6 +1,6 @@ /** \file * - * Copyright (c) 2012-2014 by Travis Gockel. All rights reserved. 
+ * Copyright (c) 2012-2018 by Travis Gockel. All rights reserved. * * This program is free software: you can redistribute it and/or modify it under the terms of the Apache License * as published by the Apache Software Foundation, either version 2 of the License, or (at your option) any later @@ -10,28 +10,11 @@ **/ #include "char_convert.hpp" -#include "detail/fixed_map.hpp" - -/** \def JSONV_CHAR_CONVERT_USE_BOOST_LOCALE - * Should JSON Voorhees use Boost.Locale to perform character conversions instead of the C++ Standard Library's - * \c codecvt? This must be set in GCC versions before 5, since \c codecvt is not supported. -**/ -#ifndef JSONV_CHAR_CONVERT_USE_BOOST_LOCALE -# if defined __clang__ && (__clang_major__ < 4 || (__clang_major__ == 3 && __clang_minor__ < 5)) -# define JSONV_CHAR_CONVERT_USE_BOOST_LOCALE 1 -# elif defined __GNUC__ && __GNUC__ < 5 -# define JSONV_CHAR_CONVERT_USE_BOOST_LOCALE 1 -# elif defined _MSC_VER && _MSC_VER < 1900 -# define JSONV_CHAR_CONVERT_USE_BOOST_LOCALE 1 -# else -# define JSONV_CHAR_CONVERT_USE_BOOST_LOCALE 0 -# endif -#endif - #include #include #include #include +#include #include #include #include @@ -39,11 +22,7 @@ #include #include -#if JSONV_CHAR_CONVERT_USE_BOOST_LOCALE -#include -#else -#include -#endif +#include "detail/fixed_map.hpp" namespace jsonv { @@ -589,44 +568,153 @@ string_decode_fn get_string_decoder(parse_options::encoding encoding) }; } -#if JSONV_CHAR_CONVERT_USE_BOOST_LOCALE - std::wstring convert_to_wide(string_view source) { - return boost::locale::conv::to_utf(source.data(), source.data() + source.size(), "utf-8"); -} + // Step 1: Determine the codepoints from the source; malformed input throws std::range_error (NOTE(review): overlong forms such as C0 80 are not rejected) + std::u32string unicode_buff(source.size(), U'\0'); // heap buffer: a variable-length array is not standard C++ and can overflow the stack + std::size_t unicode_idx = 0; + std::size_t large_codes = 0; -std::string convert_to_narrow(const std::wstring& source) -{ - return boost::locale::conv::from_utf(source, "utf-8"); -} + for (std::size_t source_idx = 0; source_idx < source.size(); /* inline */) + { + auto next_source = [&] () 
-> char32_t { return static_cast<unsigned char>(source.at(source_idx++)); }; // via unsigned char so bytes >= 0x80 are not sign-extended -std::string convert_to_narrow(const wchar_t* source) -{ - return boost::locale::conv::from_utf(source, "utf-8"); -} + char32_t codepoint; + std::size_t steps; + + auto c = next_source(); + if (c <= 0x7f) + { + codepoint = c; + steps = 0; + } + else if (c <= 0xbf) + { + throw std::range_error("Invalid UTF-8: Invalid character"); + } + else if (c <= 0xdf) + { + codepoint = c & 0x1f; + steps = 1; + } + else if (c <= 0xef) + { + codepoint = c & 0x0f; + steps = 2; + } + else if (c <= 0xf7) + { + codepoint = c & 0x07; + steps = 3; + } + else + { + throw std::range_error("Invalid UTF-8: Invalid character"); + } -#else + if (source_idx + steps > source.size()) + throw std::range_error("Invalid UTF-8: encoding sequence extends past end of source"); -std::wstring convert_to_wide(string_view source) -{ - std::wstring_convert> converter; - return converter.from_bytes(source.data(), source.data() + source.size()); + for (std::size_t step = 0; step < steps; ++step) + { + auto in_c = next_source(); + if (in_c < 0x80 || in_c > 0xbf) + throw std::range_error("Invalid UTF-8: invalid character"); + + codepoint = (codepoint << 6) | (in_c & 0x3fU); + } + + if (codepoint >= 0xd800U && codepoint <= 0xdfffU) + throw std::range_error("Invalid UTF-8: surrogate code point is not a Unicode character"); + + if (codepoint > 0x10ffffU) + throw std::range_error("Invalid UTF-8: code point is too large"); + + unicode_buff[unicode_idx++] = codepoint; + if (codepoint > 0xffffU) + ++large_codes; + } + + // Step 2: Fill the string from codepoints (always UTF-16 code units: a code point above 0xffff becomes a surrogate pair, whatever the width of wchar_t) + std::wstring out; + out.reserve(unicode_idx + large_codes); + + for (std::size_t idx = 0; idx < unicode_idx; ++idx) + { + char32_t code_point = unicode_buff[idx]; + if (code_point <= 0xffffU) + { + out += wchar_t(code_point); + } + else + { + uint16_t high, low; + utf16_create_surrogates(code_point, &high, &low); + out += wchar_t(high); + out += wchar_t(low); + } + } + return out; } -std::string 
convert_to_narrow(const std::wstring& source) +static std::string convert_to_narrow(const wchar_t* source_data, std::size_t source_size) { - std::wstring_convert> converter; - return converter.to_bytes(source); + // Step 1: Extract codepoints from the source; a bad surrogate pair throws std::range_error + std::u32string unicode_buff(source_size, U'\0'); // heap buffer: a variable-length array is not standard C++ and can overflow the stack + std::size_t unicode_idx = 0; + std::size_t out_chars = 0; + + for (std::size_t source_idx = 0; source_idx < source_size; /* inline */) + { + auto next_source = [&] () -> char32_t { return static_cast<char32_t>(source_data[source_idx++]); }; + + char32_t codepoint; + auto c = next_source(); + + // normal (full-width mask so a 32-bit unit such as U+1D800 is not mistaken for a high surrogate; NOTE(review): an unpaired low surrogate 0xdc00-0xdfff also takes this branch and is passed through unchanged) + if ((c & 0xfffffc00U) != 0xd800U) + { + codepoint = c; + } + // surrogate start + else + { + if (source_idx >= source_size) // next_source() already moved source_idx onto the low surrogate's slot + throw std::range_error("Invalid UTF-16: surrogate extends past end of string"); + + auto c_lo = next_source(); + if (!utf16_combine_surrogates(c, c_lo, &codepoint)) + throw std::range_error("Invalid UTF-16: invalid surrogate pair"); + } + + unicode_buff[unicode_idx++] = codepoint; + out_chars += (codepoint <= 0x007fU) ? 1U + : (codepoint <= 0x07ffU) ? 2U + : (codepoint <= 0xffffU) ? 3U + : 4U; + } + + // Step 2: Fill the string from codepoints (out_chars is the exact UTF-8 length, so one allocation suffices) + std::string out; + out.reserve(out_chars); + + for (std::size_t idx = 0U; idx < unicode_idx; ++idx) + { + utf8_append_code(out, unicode_buff[idx]); + } + + return out; } std::string convert_to_narrow(const wchar_t* source) { - std::wstring_convert> converter; - return converter.to_bytes(source); + return convert_to_narrow(source, wcslen(source)); } -#endif +std::string convert_to_narrow(const std::wstring& source) +{ + return convert_to_narrow(source.c_str(), source.size()); +} } }