From 9be0c1f7100070b6f75a29c28ba7a93c6e9f75a6 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 14 Nov 2024 13:28:22 -0700 Subject: [PATCH 01/14] utf8.c: Replace macros by more compact equivalents There are shortcuts available that cut these 8 names to 2. --- utf8.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/utf8.c b/utf8.c index 0782eabe11f4..4dc691ed7949 100644 --- a/utf8.c +++ b/utf8.c @@ -1605,14 +1605,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) - && ((flags & ( UTF8_DISALLOW_NONCHAR - |UTF8_DISALLOW_SURROGATE - |UTF8_DISALLOW_SUPER - |UTF8_DISALLOW_PERL_EXTENDED - |UTF8_WARN_NONCHAR - |UTF8_WARN_SURROGATE - |UTF8_WARN_SUPER - |UTF8_WARN_PERL_EXTENDED)))) + && ((flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_WARN_ILLEGAL_INTERCHANGE)))) { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ From a51ae5e530aec09402df913ed2d06771ef4eac51 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 14 Nov 2024 13:41:59 -0700 Subject: [PATCH 02/14] utf8.c: Move most important conditional to be first It turns out that the information generated in this block is only needed if the final conditional in this complicated group of them is true, which checks if the caller wants anything special for certain classes of code points. Because that final condition is subsidiary, the block was getting executed just to be thrown away. --- utf8.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/utf8.c b/utf8.c index 4dc691ed7949..b76b06fab8d0 100644 --- a/utf8.c +++ b/utf8.c @@ -1591,10 +1591,12 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } /* Here, we have found all the possible problems, except for when the input - * is for a problematic code point not allowed by the input parameters. 
*/ - + * is for a problematic code point not allowed by the input parameters. + * Check now for those parameters */ + if ( (flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_WARN_ILLEGAL_INTERCHANGE)) /* uv is valid for overlongs */ - if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) + && ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) || ( UNLIKELY(possible_problems) @@ -1604,9 +1606,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * code */ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) - || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) - && ((flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_WARN_ILLEGAL_INTERCHANGE)))) + || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))) { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ From 12285fd71b2703e5832175873e4809d6db9c516a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 14 Nov 2024 13:51:42 -0700 Subject: [PATCH 03/14] utf8.c: Split conditionals As a first step in simplifying this overly complicated series of conditionals, pull out the first one into a separate 'if'. The next commits will do more. --- utf8.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/utf8.c b/utf8.c index b76b06fab8d0..4f96f409a422 100644 --- a/utf8.c +++ b/utf8.c @@ -1593,10 +1593,11 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* Here, we have found all the possible problems, except for when the input * is for a problematic code point not allowed by the input parameters. * Check now for those parameters */ - if ( (flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_WARN_ILLEGAL_INTERCHANGE)) + if (flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_WARN_ILLEGAL_INTERCHANGE)) + { /* uv is valid for overlongs */ - && ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) + if ( ( LIKELY(! 
(possible_problems & ~UTF8_GOT_LONG)) && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) || ( UNLIKELY(possible_problems) @@ -1606,8 +1607,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * code */ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) - || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))) - { + || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) + { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) { @@ -1645,6 +1646,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* We need a complete well-formed UTF-8 character to discern * non-characters, so can't look for them here */ } + } } ready_to_handle_errors: From 3d86fdf6fb66c2e9ad090338bf853df8ade2140f Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 14 Nov 2024 14:43:55 -0700 Subject: [PATCH 04/14] utf8.c: Further simplify a complex conditional This hoists a clause in a complex conditional to the 'if' statement above it, converting that to two conditionals from one, while decreasing the number in the much larger interior 'if' by 1. This is in preparation for further simplifications in the next few commits. --- utf8.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/utf8.c b/utf8.c index 4f96f409a422..950dece88985 100644 --- a/utf8.c +++ b/utf8.c @@ -1593,19 +1593,18 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* Here, we have found all the possible problems, except for when the input * is for a problematic code point not allowed by the input parameters. * Check now for those parameters */ - if (flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_WARN_ILLEGAL_INTERCHANGE)) + if ( flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE + |UTF8_WARN_ILLEGAL_INTERCHANGE) + + /* if overflow, we know without looking further that this + * is a non-Unicode code point, which we deal with below in + * the overflow handling code */ + && LIKELY(! 
(possible_problems & UTF8_GOT_OVERFLOW))) { /* uv is valid for overlongs */ if ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) || ( UNLIKELY(possible_problems) - - /* if overflow, we know without looking further - * precisely which of the problematic types it is, - * and we deal with those in the overflow handling - * code */ - && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)) && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) { From 857fe5630dab3677f79d57436265fb7da758959e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 14 Nov 2024 14:00:24 -0700 Subject: [PATCH 05/14] utf8.c: Further simplify complex conditional This splits these into an if clause, and an else clause --- utf8.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/utf8.c b/utf8.c index 950dece88985..601d2e24a626 100644 --- a/utf8.c +++ b/utf8.c @@ -1601,16 +1601,11 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * the overflow handling code */ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) { - /* uv is valid for overlongs */ - if ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG)) - && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) - || ( UNLIKELY(possible_problems) - && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) - || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))))) - { /* If there were no malformations, or the only malformation is an * overlong, 'uv' is valid */ - if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) { + if ( LIKELY(! 
(possible_problems & ~UTF8_GOT_LONG)) + && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) + { if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { possible_problems |= UTF8_GOT_SURROGATE; } @@ -1621,9 +1616,12 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, possible_problems |= UTF8_GOT_NONCHAR; } } - else { /* Otherwise, need to look at the source UTF-8, possibly - adjusted to be non-overlong */ - + else if ( UNLIKELY(possible_problems) + /* Otherwise, need to look at the source UTF-8, + * possibly adjusted to be non-overlong */ + && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) + || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))) + { if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) > UTF_START_BYTE_110000_)) { @@ -1645,7 +1643,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* We need a complete well-formed UTF-8 character to discern * non-characters, so can't look for them here */ } - } } ready_to_handle_errors: From 8a4d5f93d430b3b18995033a427d0f6e6ada8ec4 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Thu, 14 Nov 2024 15:07:05 -0700 Subject: [PATCH 06/14] utf8.c: Swap order of blocks This makes things a bit simpler, but mainly leads to further simplifications in the next commits. --- utf8.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/utf8.c b/utf8.c index 601d2e24a626..2acf2b57f575 100644 --- a/utf8.c +++ b/utf8.c @@ -1601,27 +1601,13 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * the overflow handling code */ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) { - /* If there were no malformations, or the only malformation is an - * overlong, 'uv' is valid */ - if ( LIKELY(! 
(possible_problems & ~UTF8_GOT_LONG)) - && isUNICODE_POSSIBLY_PROBLEMATIC(uv)) - { - if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { - possible_problems |= UTF8_GOT_SURROGATE; - } - else if (UNLIKELY(UNICODE_IS_SUPER(uv))) { - possible_problems |= UTF8_GOT_SUPER; - } - else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) { - possible_problems |= UTF8_GOT_NONCHAR; - } - } - else if ( UNLIKELY(possible_problems) - /* Otherwise, need to look at the source UTF-8, - * possibly adjusted to be non-overlong */ - && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) - || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))) - { + if (UNLIKELY(possible_problems & ~UTF8_GOT_LONG)) { + + /* Here, there is a malformation other than overlong, we need to + look at the source UTF-8, possibly adjusted to be non-overlong */ + if ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) + || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) + { if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) > UTF_START_BYTE_110000_)) { @@ -1642,7 +1628,24 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* We need a complete well-formed UTF-8 character to discern * non-characters, so can't look for them here */ + } } + else + + /* Here there were no malformations, or the only malformation is an + * overlong, 'uv' is valid, and the 'if' above made sure that it + * could be problematic */ + if (isUNICODE_POSSIBLY_PROBLEMATIC(uv)) { + if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { + possible_problems |= UTF8_GOT_SURROGATE; + } + else if (UNLIKELY(UNICODE_IS_SUPER(uv))) { + possible_problems |= UTF8_GOT_SUPER; + } + else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) { + possible_problems |= UTF8_GOT_NONCHAR; + } + } } ready_to_handle_errors: From 8a3b341d326a8d6114b2e00acb806f4d250aac8a Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 15 Nov 2024 06:43:28 -0700 Subject: [PATCH 07/14] utf8.c: Check specially for perl-extended UTF-8 More rigorous testing of the overlong malformation, yet to be committed, showed that this needs to be handled specially. 
This commit does part of that. Perl extended UTF-8 means you are using a start byte not recognized by any UTF-8 standard. Suppose it is an overlong sequence that reduces down to something representable using standard UTF-8. The string still used non-standard UTF-8 to get there, so should still be called out when the input parameters to this function ask for that. This commit is a first step towards that. --- utf8.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/utf8.c b/utf8.c index 2acf2b57f575..fee73178dfd3 100644 --- a/utf8.c +++ b/utf8.c @@ -1601,6 +1601,15 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * the overflow handling code */ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) { + /* By examining just the first byte, we can see if this is using + * non-standard UTF-8. Even if it is an overlong that reduces to a + * small code point, it is still using this Perl invention, so mark it + * as such */ + if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) { + possible_problems |= UTF8_GOT_SUPER; + } + else { + /* See if the input has malformations besides possibly overlong */ if (UNLIKELY(possible_problems & ~UTF8_GOT_LONG)) { /* Here, there is a malformation other than overlong, we need to @@ -1646,6 +1655,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, possible_problems |= UTF8_GOT_NONCHAR; } } + } } ready_to_handle_errors: From 88bb717a6f2eeef24e1f60495ea03c763aa6ff65 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 14:32:02 -0700 Subject: [PATCH 08/14] utf8.c: Remove intermediate value By not overriding the computed value of malformed input until later in the function, we can eliminate this temporary variable. This paves the way to a much bigger simplification in the next commit. 
--- utf8.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/utf8.c b/utf8.c index fee73178dfd3..d79b3d4e0dc6 100644 --- a/utf8.c +++ b/utf8.c @@ -1377,7 +1377,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, U8 * adjusted_s0; U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this routine; see [perl #130921] */ - UV uv_so_far; dTHX; PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; @@ -1421,7 +1420,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, avail_len = 0; discard_errors = 0; adjusted_s0 = (U8 *) s0; - uv_so_far = 0; if (errors) { *errors = 0; @@ -1534,16 +1532,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * A convenience macro that matches either of the too-short conditions. */ # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) - if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) { - uv_so_far = uv; - uv = UNICODE_REPLACEMENT; - } - /* Check for overflow. The algorithm requires us to not look past the end * of the current character, even if partial, so the upper limit is 's' */ if (UNLIKELY(does_utf8_overflow(s0, s) >= ALMOST_CERTAINLY_OVERFLOWS)) { possible_problems |= UTF8_GOT_OVERFLOW; - uv = UNICODE_REPLACEMENT; } /* Check for overlong. If no problems so far, 'uv' is the correct code @@ -1566,7 +1558,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * cases */ && LIKELY(! 
(possible_problems & UTF8_GOT_OVERFLOW))) { - UV min_uv = uv_so_far; + UV min_uv = uv; STRLEN i; /* Here, the input is both overlong and is missing some trailing @@ -1714,6 +1706,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * extended UTF-8, but we handle all three cases here */ possible_problems &= ~(UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED); *errors |= UTF8_GOT_OVERFLOW; + uv = UNICODE_REPLACEMENT; /* But the API says we flag all errors found */ if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) { @@ -1807,6 +1800,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, case UTF8_GOT_SHORT: *errors |= UTF8_GOT_SHORT; + uv = UNICODE_REPLACEMENT; if (! (flags & UTF8_ALLOW_SHORT)) { disallowed = TRUE; @@ -1829,6 +1823,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, case UTF8_GOT_NON_CONTINUATION: *errors |= UTF8_GOT_NON_CONTINUATION; + uv = UNICODE_REPLACEMENT; if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) { disallowed = TRUE; From 2286cf06ff009f415c5439fdea945d03fe7d9b27 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 14:37:57 -0700 Subject: [PATCH 09/14] utf8.c: Combine two blocks It turns out that the work being done in the first block is only used in the second block. If that block doesn't get executed, the first block's effort is thrown away. So fold the first block into the second. This results in a bunch of temporaries that were used to communicate between the blocks being able to be removed. More detailed comments are added. --- utf8.c | 116 ++++++++++++++++++++++++--------------------------------- 1 file changed, 49 insertions(+), 67 deletions(-) diff --git a/utf8.c b/utf8.c index d79b3d4e0dc6..65f7ef03a57d 100644 --- a/utf8.c +++ b/utf8.c @@ -1371,12 +1371,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, U32 discard_errors; /* Used to save branches when 'errors' is NULL; this gets set and discarded */ - /* The below are used only if there is both an overlong malformation and a - * too short one. 
Otherwise the first two are set to 's0' and 'send', and - * the third not used at all */ - U8 * adjusted_s0; - U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this - routine; see [perl #130921] */ dTHX; PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; @@ -1419,7 +1413,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, expectlen = 0; avail_len = 0; discard_errors = 0; - adjusted_s0 = (U8 *) s0; if (errors) { *errors = 0; @@ -1549,37 +1542,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) { possible_problems |= UTF8_GOT_LONG; - - if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT) - - /* The calculation in the 'true' branch of this 'if' - * below won't work if overflows, and isn't needed - * anyway. Further below we handle all overflow - * cases */ - && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) - { - UV min_uv = uv; - STRLEN i; - - /* Here, the input is both overlong and is missing some trailing - * bytes. There is no single code point it could be for, but there - * may be enough information present to determine if what we have - * so far is for an unallowed code point, such as for a surrogate. - * The code further below has the intelligence to determine this, - * but just for non-overlong UTF-8 sequences. What we do here is - * calculate the smallest code point the input could represent if - * there were no too short malformation. Then we compute and save - * the UTF-8 for that, which is what the code below looks at - * instead of the raw input. It turns out that the smallest such - * code point is all we need. 
*/ - for (i = curlen; i < expectlen; i++) { - min_uv = UTF8_ACCUMULATE(min_uv, - I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE)); - } - - adjusted_s0 = temp_char_buf; - (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0); - } } /* Here, we have found all the possible problems, except for when the input @@ -1604,38 +1566,58 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* See if the input has malformations besides possibly overlong */ if (UNLIKELY(possible_problems & ~UTF8_GOT_LONG)) { - /* Here, there is a malformation other than overlong, we need to - look at the source UTF-8, possibly adjusted to be non-overlong */ - if ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0) - || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) - { - if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0) - > UTF_START_BYTE_110000_)) - { - possible_problems |= UTF8_GOT_SUPER; - } - else if (curlen > 1) { - if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0) - == UTF_START_BYTE_110000_ - && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)) - >= UTF_FIRST_CONT_BYTE_110000_)) - { - possible_problems |= UTF8_GOT_SUPER; - } - else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) { - possible_problems |= UTF8_GOT_SURROGATE; - } - } + /* Here, the input is malformed in some way besides possibly + * overlong, except it doesn't overflow. If you look at the + * code above, to get here, it must be a too short string, + * possibly overlong besides. */ + assert(possible_problems & UTF8_GOT_TOO_SHORT); - /* We need a complete well-formed UTF-8 character to discern - * non-characters, so can't look for them here */ + /* There is no single code point it could be for, but there may + * be enough information present to determine if what we have + * so far would, if filled out completely, be for one of these + * problematic code points we are being asked to check for. + * + * The range of surrogates is + * ASCII platforms EBCDIC I8 + * "\xed\xa0\x80" "\xf1\xb6\xa0\xa0" + * to "\xed\xbf\xbf".
"\xf1\xb7\xbf\xbf" + * + * (Continuation byte range): + * \x80 to \xbf \xa0 to \xbf + * + * In both cases, if we have the first two bytes, we can tell + * if it is a surrogate or not. If we have only one byte, we + * can't tell, so we have to assume it isn't a surrogate. + * + * It is more complicated for supers due to the possibility of + * overlongs. For example, in ASCII, the first non-Unicode code + * point is represented by the sequence \xf4\x90\x80\x80, so + * \xf8\x80\x80\x81\x81 looks like it is for a much bigger code + * point. But it in fact is an overlong representation of the + * letter "A". + * + * So what we do is calculate the smallest code point the input + * could represent if there were no too short malformation. + * This is done by pretending the input was filled out to its + * full length with occurrences of the smallest continuation + * byte. For surrogates we could just look at the bytes, but + * this single algorithm works for both those and supers. + * + * To determine if a code point is a non-character, we need all + * bytes, so this effort is wasted if the caller is looking for + * just those, but that is unlikely; the two official Unicode + * restrictions include the other two. */ + for (unsigned i = curlen; i < expectlen; i++) { + uv = UTF8_ACCUMULATE(uv, + I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE)); + } } - } - else - /* Here there were no malformations, or the only malformation is an - * overlong, 'uv' is valid, and the 'if' above made sure that it - * could be problematic */ + /* Here 'uv' is as valid as it can get. Perhaps it was valid all + * along because there were no malformations, or the only + * malformation is an overlong (which allows it to be fully + * computed). Or it may have been "cured" as best it can by the + * loop just above.
*/ if (isUNICODE_POSSIBLY_PROBLEMATIC(uv)) { if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) { possible_problems |= UTF8_GOT_SURROGATE; From 953bbd916aa891a0036af051f79d1ffa0f4bf3ac Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 05:11:22 -0700 Subject: [PATCH 10/14] utf8.c: Don't throw away work Don't execute this loop if it would be pointless. --- utf8.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/utf8.c b/utf8.c index 65f7ef03a57d..9325f9ae874e 100644 --- a/utf8.c +++ b/utf8.c @@ -1564,7 +1564,9 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } else { /* See if the input has malformations besides possibly overlong */ - if (UNLIKELY(possible_problems & ~UTF8_GOT_LONG)) { + if ( UNLIKELY(possible_problems & ~UTF8_GOT_LONG) + && LIKELY(flags & ~(UTF8_DISALLOW_NONCHAR|UTF8_WARN_NONCHAR))) + { /* Here, the input is malformed in some way besides possibly * overlong, except it doesn't overflow. If you look at the @@ -1576,6 +1578,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * be enough information present to determine if what we have * so far would, if filled out completely, be for one of these * problematic code points we are being asked to check for. + * But to determine if a code point is a non-character, we need + * all bytes, so this effort would be wasted, hence the + * conditional above excludes this step if those are the only + * thing being checked for. * * The range of surrogates is * ASCII platforms EBCDIC I8 @@ -1601,12 +1607,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * This is done by pretending the input was filled out to its * full length with occurrences of the smallest continuation * byte. For surrogates we could just look at the bytes, but - * this single algorithm works for both those and supers. 
- * - * To determine if a code point is a non-character, we need all - * bytes, so this effort is wasted if the caller is looking for - * just those, but that is unlikely; the two official Unicode - * restrictions include the other two. */ + * this single algorithm works for both those and supers. */ for (unsigned i = curlen; i < expectlen; i++) { uv = UTF8_ACCUMULATE(uv, I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE)); From 7881d756ea48937ce4b254e0fdeb7c8ea5b0131c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 09:52:59 -0700 Subject: [PATCH 11/14] utf8n_to_uvchr_msgs_helper: Add assertion Make sure it isn't being called with unexpected input -- --- utf8.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utf8.c b/utf8.c index 9325f9ae874e..4cbaf8c32430 100644 --- a/utf8.c +++ b/utf8.c @@ -1459,6 +1459,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, expectlen = UTF8SKIP(s); uv = *s; + /* This is a helper function; invariants should have been handled before + * calling it */ + assert(! NATIVE_BYTE_IS_INVARIANT(*s0)); + /* A well-formed UTF-8 character, as the vast majority of calls to this * function will be for, has this expected length. For efficiency, set * things up here to return it. It will be overridden only in those rare From 7f8a862e114f3c44c5d53d9f05a6dc40882754c8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 10:00:49 -0700 Subject: [PATCH 12/14] utf8n_to_uvchr_msgs_helper: Don't throw away work Admittedly not much work, but I realized in code reading that there are function exits that ignore this initialization. 
Instead move the initialization to later, where it is actually needed --- utf8.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/utf8.c b/utf8.c index 4cbaf8c32430..44a5b9dfdf86 100644 --- a/utf8.c +++ b/utf8.c @@ -1456,8 +1456,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } /* We now know we can examine the first byte of the input */ - expectlen = UTF8SKIP(s); - uv = *s; + expectlen = UTF8SKIP(s0); /* This is a helper function; invariants should have been handled before * calling it */ @@ -1472,7 +1471,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, } /* A continuation character can't start a valid sequence */ - if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { + if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) { possible_problems |= UTF8_GOT_CONTINUATION; curlen = 1; uv = UNICODE_REPLACEMENT; @@ -1487,7 +1486,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits * that indicate the number of bytes in the character's whole UTF-8 * sequence, leaving just the bits that are part of the value. */ - uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); + uv = NATIVE_UTF8_TO_I8(*s0) & UTF_START_MASK(expectlen); /* Setup the loop end point, making sure to not look past the end of the * input string, and flag it as too short if the size isn't big enough. */ From c9a6111a94663b90ac7008c4068651fb4f6853e2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 15:37:38 -0700 Subject: [PATCH 13/14] utf8.c: White-space only Remove excess indentation --- utf8.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utf8.c b/utf8.c index 44a5b9dfdf86..3562e771e50b 100644 --- a/utf8.c +++ b/utf8.c @@ -1526,7 +1526,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * separate. * * A convenience macro that matches either of the too-short conditions. 
*/ -# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) +#define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION) /* Check for overflow. The algorithm requires us to not look past the end * of the current character, even if partial, so the upper limit is 's' */ @@ -1538,8 +1538,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * point value. Simply see if it is expressible in fewer bytes. Otherwise * we must look at the UTF-8 byte sequence itself to see if it is for an * overlong */ - if ( ( LIKELY(! possible_problems) - && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) + if ( ( LIKELY(! possible_problems) + && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) || ( UNLIKELY(possible_problems) && ( UNLIKELY(! UTF8_IS_START(*s0)) || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) From bfac0e34222e12a1f9145f2819973809c7c1ae2e Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 17 Nov 2024 15:37:59 -0700 Subject: [PATCH 14/14] utf8n_to_uvchr_msgs_helper(): Refactor expression More rigorous testing of the overlong malformation, yet to be committed, showed that this didn't work as intended. The UTF8_IS_START() macro excludes start bytes that always lead to overlong sequences. Fortunately the logic caused that to be mostly bypassed. But this commit fixes it all. --- utf8.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/utf8.c b/utf8.c index 3562e771e50b..9d4b4bc13a51 100644 --- a/utf8.c +++ b/utf8.c @@ -1534,15 +1534,19 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, possible_problems |= UTF8_GOT_OVERFLOW; } +/* Is the first byte of 's' a start byte in the UTF-8 encoding system, not + * excluding starting an overlong sequence? */ +#define UTF8_IS_SYNTACTIC_START_BYTE(s) (NATIVE_TO_I8(*s) >= 0xC0) + /* Check for overlong. If no problems so far, 'uv' is the correct code - * point value. Simply see if it is expressible in fewer bytes.
Otherwise - * we must look at the UTF-8 byte sequence itself to see if it is for an - * overlong */ + * point value. Simply see if it is expressible in fewer bytes. But if + * there are other malformations, we may still be able to tell if this + * is an overlong by looking at the UTF-8 byte sequence itself */ if ( ( LIKELY(! possible_problems) - && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) - || ( UNLIKELY(possible_problems) - && ( UNLIKELY(! UTF8_IS_START(*s0)) - || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) + && UNLIKELY(expectlen > OFFUNISKIP(uv))) + || ( UNLIKELY(possible_problems) + && UTF8_IS_SYNTACTIC_START_BYTE(s0) + && UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))) { possible_problems |= UTF8_GOT_LONG; }