Perl · khwilliamson · Nov 24, 2024 · Nov 14, 2024 · Nov 14, 2024 · Nov 14, 2024
diff --git a/utf8.c b/utf8.c
@@ -1371,13 +1371,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
     U32 discard_errors;   /* Used to save branches when 'errors' is NULL; this
                              gets set and discarded */
 
-    /* The below are used only if there is both an overlong malformation and a
-     * too short one.  Otherwise the first two are set to 's0' and 'send', and
-     * the third not used at all */
-    U8 * adjusted_s0;
-    U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
-                                            routine; see [perl #130921] */
-    UV uv_so_far;
     dTHX;
 
     PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
@@ -1420,8 +1413,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
     expectlen = 0;
     avail_len = 0;
     discard_errors = 0;
-    adjusted_s0 = (U8 *) s0;
-    uv_so_far = 0;
 
     if (errors) {
         *errors = 0;
@@ -1465,8 +1456,11 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
     }
 
     /* We now know we can examine the first byte of the input */
-    expectlen = UTF8SKIP(s);
-    uv = *s;
+    expectlen = UTF8SKIP(s0);
+
+    /* This is a helper function; invariants should have been handled before
+     * calling it */
+    assert(! NATIVE_BYTE_IS_INVARIANT(*s0));
 
     /* A well-formed UTF-8 character, as the vast majority of calls to this
      * function will be for, has this expected length.  For efficiency, set
@@ -1477,7 +1471,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
     }
 
     /* A continuation character can't start a valid sequence */
-    if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
+    if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) {
         possible_problems |= UTF8_GOT_CONTINUATION;
         curlen = 1;
         uv = UNICODE_REPLACEMENT;
@@ -1492,7 +1486,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
     /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
      * that indicate the number of bytes in the character's whole UTF-8
      * sequence, leaving just the bits that are part of the value.  */
-    uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+    uv = NATIVE_UTF8_TO_I8(*s0) & UTF_START_MASK(expectlen);
 
     /* Setup the loop end point, making sure to not look past the end of the
      * input string, and flag it as too short if the size isn't big enough. */
@@ -1532,124 +1526,117 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
      * separate.
      *
      * A convenience macro that matches either of the too-short conditions.  */
-#   define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
-
-    if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
-        uv_so_far = uv;
-        uv = UNICODE_REPLACEMENT;
-    }
+#define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
 
     /* Check for overflow.  The algorithm requires us to not look past the end
      * of the current character, even if partial, so the upper limit is 's' */
     if (UNLIKELY(does_utf8_overflow(s0, s) >= ALMOST_CERTAINLY_OVERFLOWS)) {
         possible_problems |= UTF8_GOT_OVERFLOW;
-        uv = UNICODE_REPLACEMENT;
     }
 
+/* Is the first byte of 's' a start byte in the UTF-8 encoding system, not
+ * excluding starting an overlong sequence? */
+#define UTF8_IS_SYNTACTIC_START_BYTE(s)  (NATIVE_TO_I8(*s) >= 0xC0)
+
     /* Check for overlong.  If no problems so far, 'uv' is the correct code
-     * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
-     * we must look at the UTF-8 byte sequence itself to see if it is for an
-     * overlong */
-    if (     (   LIKELY(! possible_problems)
-              && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
-        || (       UNLIKELY(possible_problems)
-            && (   UNLIKELY(! UTF8_IS_START(*s0))
-                || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
+     * point value.  Simply see if it is expressible in fewer bytes.  But if
+     * there are other malformations, we may still be able to tell if this
+     * is an overlong by looking at the UTF-8 byte sequence itself */
+    if (   (   LIKELY(! possible_problems)
+            && UNLIKELY(expectlen > OFFUNISKIP(uv)))
+        || (   UNLIKELY(possible_problems)
+            && UTF8_IS_SYNTACTIC_START_BYTE(s0)
+            && UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))
     {
         possible_problems |= UTF8_GOT_LONG;
-
-        if (   UNLIKELY(   possible_problems & UTF8_GOT_TOO_SHORT)
-
-                          /* The calculation in the 'true' branch of this 'if'
-                           * below won't work if overflows, and isn't needed
-                           * anyway.  Further below we handle all overflow
-                           * cases */
-            &&   LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
-        {
-            UV min_uv = uv_so_far;
-            STRLEN i;
-
-            /* Here, the input is both overlong and is missing some trailing
-             * bytes.  There is no single code point it could be for, but there
-             * may be enough information present to determine if what we have
-             * so far is for an unallowed code point, such as for a surrogate.
-             * The code further below has the intelligence to determine this,
-             * but just for non-overlong UTF-8 sequences.  What we do here is
-             * calculate the smallest code point the input could represent if
-             * there were no too short malformation.  Then we compute and save
-             * the UTF-8 for that, which is what the code below looks at
-             * instead of the raw input.  It turns out that the smallest such
-             * code point is all we need. */
-            for (i = curlen; i < expectlen; i++) {
-                min_uv = UTF8_ACCUMULATE(min_uv,
-                                I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
-            }
-
-            adjusted_s0 = temp_char_buf;
-            (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
-        }
     }
 
     /* Here, we have found all the possible problems, except for when the input
-     * is for a problematic code point not allowed by the input parameters. */
-
-                                /* uv is valid for overlongs */
-    if (   (   (      LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
-                   && isUNICODE_POSSIBLY_PROBLEMATIC(uv))
-            || (   UNLIKELY(possible_problems)
-
-                          /* if overflow, we know without looking further
-                           * precisely which of the problematic types it is,
-                           * and we deal with those in the overflow handling
-                           * code */
-                && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
-                && (   isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
-                    || UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
-        && ((flags & ( UTF8_DISALLOW_NONCHAR
-                      |UTF8_DISALLOW_SURROGATE
-                      |UTF8_DISALLOW_SUPER
-                      |UTF8_DISALLOW_PERL_EXTENDED
-                      |UTF8_WARN_NONCHAR
-                      |UTF8_WARN_SURROGATE
-                      |UTF8_WARN_SUPER
-                      |UTF8_WARN_PERL_EXTENDED))))
+     * is for a problematic code point not allowed by the input parameters.
+     * Check now for those parameters */
+    if (   flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+                    |UTF8_WARN_ILLEGAL_INTERCHANGE)
+
+                    /* if overflow, we know without looking further that this
+                     * is a non-Unicode code point, which we deal with below in
+                     * the overflow handling code */
+        && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
     {
-        /* If there were no malformations, or the only malformation is an
-         * overlong, 'uv' is valid */
-        if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
-            if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-                possible_problems |= UTF8_GOT_SURROGATE;
-            }
-            else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
-                possible_problems |= UTF8_GOT_SUPER;
-            }
-            else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
-                possible_problems |= UTF8_GOT_NONCHAR;
-            }
+        /* By examining just the first byte, we can see if this is using
+         * non-standard UTF-8.  Even if it is an overlong that reduces to a
+         * small code point, it is still using this Perl invention, so mark it
+         * as such */
+        if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
+            possible_problems |= UTF8_GOT_SUPER;
         }
-        else {  /* Otherwise, need to look at the source UTF-8, possibly
-                   adjusted to be non-overlong */
-
-            if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
-                                                    > UTF_START_BYTE_110000_))
+        else {
+            /* See if the input has malformations besides possibly overlong */
+            if (   UNLIKELY(possible_problems & ~UTF8_GOT_LONG)
+                && LIKELY(flags & ~(UTF8_DISALLOW_NONCHAR|UTF8_WARN_NONCHAR)))
             {
-                possible_problems |= UTF8_GOT_SUPER;
+
+                /* Here, the input is malformed in some way besides possibly
+                 * overlong, except it doesn't overflow.  If you look at the
+                 * code above, to get here, it must be a too short string,
+                 * possibly overlong besides. */
+                assert(possible_problems & UTF8_GOT_TOO_SHORT);
+
+                /* There is no single code point it could be for, but there may
+                 * be enough information present to determine if what we have
+                 * so far would, if filled out completely, be for one of these
+                 * problematic code points we are being asked to check for.
+                 * But to determine if a code point is a non-character, we need
+                 * all bytes, so this effort would be wasted, hence the
+                 * conditional above excludes this step if those are the only
+                 * things being checked for.
+                 *
+                 * The range of surrogates is
+                 *      ASCII platforms                  EBCDIC I8
+                 *      "\xed\xa0\x80"               "\xf1\xb6\xa0\xa0"
+                 * to   "\xed\xbf\xbf".              "\xf1\xb7\xbf\xbf"
+                 *
+                 * (Continuation byte range):
+                 *       \x80 to \xbf                     \xa0 to \xbf
+                 *
+                 * In both cases, if we have the first two bytes, we can tell
+                 * if it is a surrogate or not.  If we have only one byte, we
+                 * can't tell, so we have to assume it isn't a surrogate.
+                 *
+                 * It is more complicated for supers due to the possibility of
+                 * overlongs. For example, in ASCII, the first non-Unicode code
+                 * point is represented by the sequence \xf4\x90\x80\x80, so
+                 * \xf8\x80\x80\x81\x81 looks like it is for a much bigger code
+                 * point.  But it is in fact an overlong representation of the
+                 * letter "A".
+                 *
+                 * So what we do is calculate the smallest code point the input
+                 * could represent if there were no too short malformation.
+                 * This is done by pretending the input was filled out to its
+                 * full length with occurrences of the smallest continuation
+                 * byte.  For surrogates we could just look at the bytes, but
+                 * this single algorithm works for both those and supers. */
+                for (unsigned i = curlen; i < expectlen; i++) {
+                    uv = UTF8_ACCUMULATE(uv,
+                                I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
+                }
             }
-            else if (curlen > 1) {
-                if (UNLIKELY(   NATIVE_UTF8_TO_I8(*adjusted_s0)
-                                                == UTF_START_BYTE_110000_
-                             && NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
-                                                >= UTF_FIRST_CONT_BYTE_110000_))
-                {
+
+            /* Here 'uv' is as valid as it can get.  Perhaps it was valid all
+             * along because there were no malformations, or the only
+             * malformation is an overlong (which allows it to be fully
+             * computed).  Or it may have been "cured" as best it can by the
+             * loop just above. */
+            if (isUNICODE_POSSIBLY_PROBLEMATIC(uv)) {
+                if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                    possible_problems |= UTF8_GOT_SURROGATE;
+                }
+                else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
                     possible_problems |= UTF8_GOT_SUPER;
                 }
-                else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
-                    possible_problems |= UTF8_GOT_SURROGATE;
+                else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+                    possible_problems |= UTF8_GOT_NONCHAR;
                 }
             }
-
-            /* We need a complete well-formed UTF-8 character to discern
-             * non-characters, so can't look for them here */
         }
     }
 
@@ -1709,6 +1696,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
                  * extended UTF-8, but we handle all three cases here */
                 possible_problems &= ~(UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
                 *errors |= UTF8_GOT_OVERFLOW;
+                uv = UNICODE_REPLACEMENT;
 
                 /* But the API says we flag all errors found */
                 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
@@ -1802,6 +1790,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
 
               case UTF8_GOT_SHORT:
                 *errors |= UTF8_GOT_SHORT;
+                uv = UNICODE_REPLACEMENT;
 
                 if (! (flags & UTF8_ALLOW_SHORT)) {
                     disallowed = TRUE;
@@ -1824,6 +1813,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
 
               case UTF8_GOT_NON_CONTINUATION:
                 *errors |= UTF8_GOT_NON_CONTINUATION;
+                uv = UNICODE_REPLACEMENT;
 
                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                     disallowed = TRUE;