Scalar implementation of the UTF-8 GetPointerToFirstInvalidByte. No test, no benchmark.
simdutf · Nov 14, 2023 · f478fd0 · f478fd0
1 parent 719b85f
commit f478fd0
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 7 deletions.
diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs
@@ -14,12 +14,12 @@ namespace SimdUnicodeBenchmarks
     // for a standard benchmark
     public class Checker
     {
-        List<char[]> names;
-        List<byte[]> AsciiBytes;
-        List<char[]> nonAsciichars;
-        public List<byte[]> nonAsciiBytes; // Declare at the class level
+        List<char[]> names = new List<char[]>();
+        List<byte[]> AsciiBytes = new List<byte[]>();
+        List<char[]> nonAsciichars = new List<char[]>();
+        public List<byte[]> nonAsciiBytes = new List<byte[]>(); // Declare at the class level
 
-        List<bool> results;
+        List<bool> results = new List<bool>();
 
         public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
         {

diff --git a/src/Ascii.cs b/src/Ascii.cs
@@ -102,9 +102,9 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
                         {
                             Vector128<ushort> raw1 = Sse41.LoadDquVector128((ushort*)pStart + i);
                             Vector128<ushort> raw2 = Sse41.LoadDquVector128((ushort*)pStart + i + 8);
-                            
+
                             total = Sse2.Or(total, raw1);
-                            total = Sse2.Or(total, raw2); 
+                            total = Sse2.Or(total, raw2);
                         }
 
                         Vector128<ushort> b127 = Vector128.Create((ushort)127);

diff --git a/src/UTF8.cs b/src/UTF8.cs
@@ -8,5 +8,82 @@ namespace SimdUnicode
 {
     public static class UTF8
     {
+        /// <summary>
+        /// Scalar UTF-8 validator: scans the buffer from the start and locates the first byte
+        /// sequence that is not well-formed UTF-8.
+        /// </summary>
+        /// <param name="pInputBuffer">
+        /// Pointer to the first byte to validate. The caller must guarantee that at least
+        /// <paramref name="inputLength"/> bytes are readable from this address.
+        /// </param>
+        /// <param name="inputLength">Number of bytes to validate.</param>
+        /// <returns>
+        /// A pointer to the lead byte of the first invalid sequence (truncated sequence, bad
+        /// continuation byte, overlong encoding, surrogate, value above U+10FFFF, or a byte that
+        /// cannot start a sequence), or <c>pInputBuffer + inputLength</c> when no invalid
+        /// sequence is found.
+        /// </returns>
+        public static unsafe byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
+        {
+            int pos = 0;
+            while (pos < inputLength)
+            {
+                // Fast path: if the next 16 bytes are all ASCII (no high bit set), skip them.
+                // The bound is written as a subtraction so it cannot overflow for large lengths.
+                if (inputLength - pos >= 16)
+                {
+                    // The loads must be taken at the current position, not at the buffer start;
+                    // otherwise later 16-byte windows would be skipped without being inspected.
+                    ulong v1 = *(ulong*)(pInputBuffer + pos);
+                    ulong v2 = *(ulong*)(pInputBuffer + pos + 8);
+
+                    if (((v1 | v2) & 0x8080808080808080UL) == 0)
+                    {
+                        pos += 16;
+                        continue;
+                    }
+                }
+
+                // Skip individual ASCII bytes until we hit a byte with the high bit set.
+                byte firstByte = pInputBuffer[pos];
+                while (firstByte < 0b10000000)
+                {
+                    if (++pos == inputLength) { return pInputBuffer + inputLength; }
+                    firstByte = pInputBuffer[pos];
+                }
+
+                // Bytes left in the buffer starting at the lead byte (always >= 1 here).
+                int remaining = inputLength - pos;
+                int sequenceLength;
+                uint codePoint;
+
+                if ((firstByte & 0b11100000) == 0b11000000)
+                {
+                    // Two-byte sequence: 110xxxxx 10xxxxxx.
+                    sequenceLength = 2;
+                    if (remaining < sequenceLength) { return pInputBuffer + pos; }
+                    if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
+
+                    codePoint = ((uint)(firstByte & 0b00011111) << 6) |
+                                (uint)(pInputBuffer[pos + 1] & 0b00111111);
+
+                    // Reject overlong encodings; 11 payload bits cannot exceed 0x7FF.
+                    if (codePoint < 0x80) { return pInputBuffer + pos; }
+                }
+                else if ((firstByte & 0b11110000) == 0b11100000)
+                {
+                    // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
+                    sequenceLength = 3;
+                    if (remaining < sequenceLength) { return pInputBuffer + pos; }
+                    if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
+                    if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
+
+                    codePoint = ((uint)(firstByte & 0b00001111) << 12) |
+                                ((uint)(pInputBuffer[pos + 1] & 0b00111111) << 6) |
+                                (uint)(pInputBuffer[pos + 2] & 0b00111111);
+
+                    // Reject overlong encodings and UTF-16 surrogates (U+D800..U+DFFF);
+                    // 16 payload bits cannot exceed 0xFFFF.
+                    if (codePoint < 0x800 || (0xd7ff < codePoint && codePoint < 0xe000))
+                    {
+                        return pInputBuffer + pos;
+                    }
+                }
+                else if ((firstByte & 0b11111000) == 0b11110000)
+                {
+                    // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+                    sequenceLength = 4;
+                    if (remaining < sequenceLength) { return pInputBuffer + pos; }
+                    if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
+                    if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
+                    if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
+
+                    codePoint = ((uint)(firstByte & 0b00000111) << 18) |
+                                ((uint)(pInputBuffer[pos + 1] & 0b00111111) << 12) |
+                                ((uint)(pInputBuffer[pos + 2] & 0b00111111) << 6) |
+                                (uint)(pInputBuffer[pos + 3] & 0b00111111);
+
+                    // Reject overlong encodings and values beyond the Unicode range.
+                    if (codePoint <= 0xffff || 0x10ffff < codePoint) { return pInputBuffer + pos; }
+                }
+                else
+                {
+                    // Stray continuation byte (10xxxxxx) or a byte that can never start a sequence (11111xxx).
+                    return pInputBuffer + pos;
+                }
+
+                pos += sequenceLength;
+            }
+
+            return pInputBuffer + inputLength;
+        }
     }
 }