Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

utf8n_to_uvchr(): Simplify and fix some overlongs edge cases #22757

Merged
merged 14 commits into from
Nov 24, 2024
Merged
210 changes: 100 additions & 110 deletions utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -1371,13 +1371,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
gets set and discarded */

/* The below are used only if there is both an overlong malformation and a
* too short one. Otherwise the first two are set to 's0' and 'send', and
* the third not used at all */
U8 * adjusted_s0;
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
routine; see [perl #130921] */
UV uv_so_far;
dTHX;

PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
Expand Down Expand Up @@ -1420,8 +1413,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
expectlen = 0;
avail_len = 0;
discard_errors = 0;
adjusted_s0 = (U8 *) s0;
uv_so_far = 0;

if (errors) {
*errors = 0;
Expand Down Expand Up @@ -1465,8 +1456,11 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
}

/* We now know we can examine the first byte of the input */
expectlen = UTF8SKIP(s);
uv = *s;
expectlen = UTF8SKIP(s0);

/* This is a helper function; invariants should have been handled before
* calling it */
assert(! NATIVE_BYTE_IS_INVARIANT(*s0));

/* A well-formed UTF-8 character, as the vast majority of calls to this
* function will be for, has this expected length. For efficiency, set
Expand All @@ -1477,7 +1471,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
}

/* A continuation character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
if (UNLIKELY(UTF8_IS_CONTINUATION(*s0))) {
possible_problems |= UTF8_GOT_CONTINUATION;
curlen = 1;
uv = UNICODE_REPLACEMENT;
Expand All @@ -1492,7 +1486,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
/* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
* that indicate the number of bytes in the character's whole UTF-8
* sequence, leaving just the bits that are part of the value. */
uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
uv = NATIVE_UTF8_TO_I8(*s0) & UTF_START_MASK(expectlen);

/* Setup the loop end point, making sure to not look past the end of the
* input string, and flag it as too short if the size isn't big enough. */
Expand Down Expand Up @@ -1532,124 +1526,117 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
* separate.
*
* A convenience macro that matches either of the too-short conditions. */
# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)

if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
uv_so_far = uv;
uv = UNICODE_REPLACEMENT;
}
#define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)

/* Check for overflow. The algorithm requires us to not look past the end
* of the current character, even if partial, so the upper limit is 's' */
if (UNLIKELY(does_utf8_overflow(s0, s) >= ALMOST_CERTAINLY_OVERFLOWS)) {
possible_problems |= UTF8_GOT_OVERFLOW;
uv = UNICODE_REPLACEMENT;
}

/* Is the first byte of 's' a start byte in the UTF-8 encoding system, not
* excluding starting an overlong sequence?
*
* 's' is a pointer to the byte to examine; only that one byte is read. The
* byte is passed through NATIVE_TO_I8() before being compared. The macro
* parameter is parenthesized so that a pointer expression such as 'p + 1'
* dereferences the intended byte rather than expanding to '*p + 1'. */
#define UTF8_IS_SYNTACTIC_START_BYTE(s) (NATIVE_TO_I8(*(s)) >= 0xC0)

/* Check for overlong. If no problems so far, 'uv' is the correct code
* point value. Simply see if it is expressible in fewer bytes. Otherwise
* we must look at the UTF-8 byte sequence itself to see if it is for an
* overlong */
if ( ( LIKELY(! possible_problems)
&& UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
|| ( UNLIKELY(possible_problems)
&& ( UNLIKELY(! UTF8_IS_START(*s0))
|| (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
* point value. Simply see if it is expressible in fewer bytes. But if
* there are other malformations, we may still be able to tell if this
* is an overlong by looking at the UTF-8 byte sequence itself */
if ( ( LIKELY(! possible_problems)
&& UNLIKELY(expectlen > OFFUNISKIP(uv)))
|| ( UNLIKELY(possible_problems)
&& UTF8_IS_SYNTACTIC_START_BYTE(s0)
&& UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))
{
possible_problems |= UTF8_GOT_LONG;

if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)

/* The calculation in the 'true' branch of this 'if'
* below won't work if overflows, and isn't needed
* anyway. Further below we handle all overflow
* cases */
&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
{
UV min_uv = uv_so_far;
STRLEN i;

/* Here, the input is both overlong and is missing some trailing
* bytes. There is no single code point it could be for, but there
* may be enough information present to determine if what we have
* so far is for an unallowed code point, such as for a surrogate.
* The code further below has the intelligence to determine this,
* but just for non-overlong UTF-8 sequences. What we do here is
* calculate the smallest code point the input could represent if
* there were no too short malformation. Then we compute and save
* the UTF-8 for that, which is what the code below looks at
* instead of the raw input. It turns out that the smallest such
* code point is all we need. */
for (i = curlen; i < expectlen; i++) {
min_uv = UTF8_ACCUMULATE(min_uv,
I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
}

adjusted_s0 = temp_char_buf;
(void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
}
}

/* Here, we have found all the possible problems, except for when the input
* is for a problematic code point not allowed by the input parameters. */

/* uv is valid for overlongs */
if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
&& isUNICODE_POSSIBLY_PROBLEMATIC(uv))
|| ( UNLIKELY(possible_problems)

/* if overflow, we know without looking further
* precisely which of the problematic types it is,
* and we deal with those in the overflow handling
* code */
&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
&& ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
|| UNLIKELY(UTF8_IS_PERL_EXTENDED(s0)))))
&& ((flags & ( UTF8_DISALLOW_NONCHAR
|UTF8_DISALLOW_SURROGATE
|UTF8_DISALLOW_SUPER
|UTF8_DISALLOW_PERL_EXTENDED
|UTF8_WARN_NONCHAR
|UTF8_WARN_SURROGATE
|UTF8_WARN_SUPER
|UTF8_WARN_PERL_EXTENDED))))
* is for a problematic code point not allowed by the input parameters.
* Check now for those parameters */
if ( flags & ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_WARN_ILLEGAL_INTERCHANGE)

/* if overflow, we know without looking further that this
* is a non-Unicode code point, which we deal with below in
* the overflow handling code */
&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
{
/* If there were no malformations, or the only malformation is an
* overlong, 'uv' is valid */
if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
possible_problems |= UTF8_GOT_SURROGATE;
}
else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
possible_problems |= UTF8_GOT_SUPER;
}
else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
possible_problems |= UTF8_GOT_NONCHAR;
}
/* By examining just the first byte, we can see if this is using
* non-standard UTF-8. Even if it is an overlong that reduces to a
* small code point, it is still using this Perl invention, so mark it
* as such */
if (UNLIKELY(UTF8_IS_PERL_EXTENDED(s0))) {
possible_problems |= UTF8_GOT_SUPER;
}
else { /* Otherwise, need to look at the source UTF-8, possibly
adjusted to be non-overlong */

if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
> UTF_START_BYTE_110000_))
else {
/* See if the input has malformations besides possibly overlong */
if ( UNLIKELY(possible_problems & ~UTF8_GOT_LONG)
&& LIKELY(flags & ~(UTF8_DISALLOW_NONCHAR|UTF8_WARN_NONCHAR)))
{
possible_problems |= UTF8_GOT_SUPER;

/* Here, the input is malformed in some way besides possibly
* overlong, except it doesn't overflow. If you look at the
* code above, to get here, it must be a too short string,
* possibly overlong besides. */
assert(possible_problems & UTF8_GOT_TOO_SHORT);

/* There is no single code point it could be for, but there may
* be enough information present to determine if what we have
* so far would, if filled out completely, be for one of these
* problematic code points we are being asked to check for.
* But to determine if a code point is a non-character, we need
* all bytes, so this effort would be wasted, hence the
* conditional above excludes this step if those are the only
* things being checked for.
*
* The range of surrogates is
* ASCII platforms EBCDIC I8
* "\xed\xa0\x80" "\xf1\xb6\xa0\xa0"
* to "\xed\xbf\xbf". "\xf1\xb7\xbf\xbf"
*
* (Continuation byte range):
* \x80 to \xbf \xa0 to \xbf
*
* In both cases, if we have the first two bytes, we can tell
* if it is a surrogate or not. If we have only one byte, we
* can't tell, so we have to assume it isn't a surrogate.
*
* It is more complicated for supers due to the possibility of
* overlongs. For example, in ASCII, the first non-Unicode code
* point is represented by the sequence \xf4\x90\x80\x80, so
* \xf8\x80\x80\x80\x41 looks like it is for a much bigger code
* point. But it in fact is an overlong representation of the
* letter "A".
*
* So what we do is calculate the smallest code point the input
* could represent if there were no too short malformation.
* This is done by pretending the input was filled out to its
* full length with occurrences of the smallest continuation
* byte. For surrogates we could just look at the bytes, but
* this single algorithm works for both those and supers. */
for (unsigned i = curlen; i < expectlen; i++) {
uv = UTF8_ACCUMULATE(uv,
I8_TO_NATIVE_UTF8(UTF_MIN_CONTINUATION_BYTE));
}
}
else if (curlen > 1) {
if (UNLIKELY( NATIVE_UTF8_TO_I8(*adjusted_s0)
== UTF_START_BYTE_110000_
&& NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1))
>= UTF_FIRST_CONT_BYTE_110000_))
{

/* Here 'uv' is as valid as it can get. Perhaps it was valid all
* along because there were no malformations, or the only
* malformation is an overlong (which allows it to be fully
* computed). Or it may have been "cured" as best it can by the
* loop just above. */
if (isUNICODE_POSSIBLY_PROBLEMATIC(uv)) {
if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
possible_problems |= UTF8_GOT_SURROGATE;
}
else if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
possible_problems |= UTF8_GOT_SUPER;
}
else if (UNLIKELY(is_SURROGATE_utf8(adjusted_s0))) {
possible_problems |= UTF8_GOT_SURROGATE;
else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
possible_problems |= UTF8_GOT_NONCHAR;
}
}

/* We need a complete well-formed UTF-8 character to discern
* non-characters, so can't look for them here */
}
}

Expand Down Expand Up @@ -1709,6 +1696,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
* extended UTF-8, but we handle all three cases here */
possible_problems &= ~(UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
*errors |= UTF8_GOT_OVERFLOW;
uv = UNICODE_REPLACEMENT;

/* But the API says we flag all errors found */
if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
Expand Down Expand Up @@ -1802,6 +1790,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,

case UTF8_GOT_SHORT:
*errors |= UTF8_GOT_SHORT;
uv = UNICODE_REPLACEMENT;

if (! (flags & UTF8_ALLOW_SHORT)) {
disallowed = TRUE;
Expand All @@ -1824,6 +1813,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,

case UTF8_GOT_NON_CONTINUATION:
*errors |= UTF8_GOT_NON_CONTINUATION;
uv = UNICODE_REPLACEMENT;

if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
disallowed = TRUE;
Expand Down
Loading