Skip to content

Commit

Permalink
Scalar implementation of the UTF-8 GetPointerToFirstInvalidByte. No t…
Browse files Browse the repository at this point in the history
…est, no benchmark.
  • Loading branch information
lemire committed Nov 14, 2023
1 parent 719b85f commit f478fd0
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 7 deletions.
10 changes: 5 additions & 5 deletions benchmark/Benchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ namespace SimdUnicodeBenchmarks
// for a standard benchmark
public class Checker
{
List<char[]> names;
List<byte[]> AsciiBytes;
List<char[]> nonAsciichars;
public List<byte[]> nonAsciiBytes; // Declare at the class level
List<char[]> names = new List<char[]>();
List<byte[]> AsciiBytes = new List<byte[]>();
List<char[]> nonAsciichars = new List<char[]>();
public List<byte[]> nonAsciiBytes = new List<byte[]>(); // Declare at the class level

List<bool> results;
List<bool> results = new List<bool>();

public static bool RuntimeIsAsciiApproach(ReadOnlySpan<char> s)
{
Expand Down
4 changes: 2 additions & 2 deletions src/Ascii.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
{
Vector128<ushort> raw1 = Sse41.LoadDquVector128((ushort*)pStart + i);
Vector128<ushort> raw2 = Sse41.LoadDquVector128((ushort*)pStart + i + 8);

total = Sse2.Or(total, raw1);
total = Sse2.Or(total, raw2);
total = Sse2.Or(total, raw2);
}

Vector128<ushort> b127 = Vector128.Create((ushort)127);
Expand Down
77 changes: 77 additions & 0 deletions src/UTF8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,82 @@ namespace SimdUnicode
{
public static class UTF8
{
    /// <summary>
    /// Scalar UTF-8 validator: scans <paramref name="inputLength"/> bytes starting at
    /// <paramref name="pInputBuffer"/> and locates the first ill-formed sequence.
    /// </summary>
    /// <param name="pInputBuffer">Pointer to the first byte to validate.</param>
    /// <param name="inputLength">Number of bytes to validate.</param>
    /// <returns>
    /// A pointer to the lead byte of the first invalid or truncated sequence, or
    /// <c>pInputBuffer + inputLength</c> when every byte belongs to a well-formed sequence.
    /// </returns>
    public static unsafe byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
    {
        int pos = 0;
        while (pos < inputLength)
        {
            // Fast path: if the next 16 bytes are all ASCII (no high bit set), skip them.
            // The comparison is written as a subtraction so it cannot overflow for large buffers.
            if (inputLength - pos >= 16)
            {
                // The loads must be relative to the current position, not the buffer start.
                ulong low = *(ulong*)(pInputBuffer + pos);
                ulong high = *(ulong*)(pInputBuffer + pos + 8);
                if (((low | high) & 0x8080808080808080) == 0)
                {
                    pos += 16;
                    continue;
                }
            }

            // Consume ASCII bytes one at a time until a non-ASCII lead byte shows up.
            byte firstByte = pInputBuffer[pos];
            while (firstByte < 0b10000000)
            {
                if (++pos == inputLength) { return pInputBuffer + inputLength; }
                firstByte = pInputBuffer[pos];
            }

            int remaining = inputLength - pos;
            int sequenceLength;
            uint codePoint;

            if ((firstByte & 0b11100000) == 0b11000000)
            {
                // Two-byte sequence: 110xxxxx 10xxxxxx.
                sequenceLength = 2;
                if (remaining < sequenceLength) { return pInputBuffer + pos; }
                if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }

                codePoint = (uint)(firstByte & 0b00011111) << 6 |
                            (uint)(pInputBuffer[pos + 1] & 0b00111111);

                // Reject overlong encodings; 11 payload bits can never exceed 0x7ff.
                if (codePoint < 0x80) { return pInputBuffer + pos; }
            }
            else if ((firstByte & 0b11110000) == 0b11100000)
            {
                // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
                sequenceLength = 3;
                if (remaining < sequenceLength) { return pInputBuffer + pos; }
                if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
                if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }

                codePoint = (uint)(firstByte & 0b00001111) << 12 |
                            (uint)(pInputBuffer[pos + 1] & 0b00111111) << 6 |
                            (uint)(pInputBuffer[pos + 2] & 0b00111111);

                // Reject overlong encodings and UTF-16 surrogates (U+D800..U+DFFF);
                // 16 payload bits can never exceed 0xffff.
                if (codePoint < 0x800 || (0xd7ff < codePoint && codePoint < 0xe000))
                {
                    return pInputBuffer + pos;
                }
            }
            else if ((firstByte & 0b11111000) == 0b11110000)
            {
                // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
                sequenceLength = 4;
                if (remaining < sequenceLength) { return pInputBuffer + pos; }
                if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
                if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }
                if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; }

                codePoint = (uint)(firstByte & 0b00000111) << 18 |
                            (uint)(pInputBuffer[pos + 1] & 0b00111111) << 12 |
                            (uint)(pInputBuffer[pos + 2] & 0b00111111) << 6 |
                            (uint)(pInputBuffer[pos + 3] & 0b00111111);

                // Reject overlong encodings and values beyond U+10FFFF.
                if (codePoint <= 0xffff || 0x10ffff < codePoint) { return pInputBuffer + pos; }
            }
            else
            {
                // A stray continuation byte (10xxxxxx) or an invalid lead byte (11111xxx).
                return pInputBuffer + pos;
            }

            pos += sequenceLength;
        }

        return pInputBuffer + inputLength;
    }
}
}

0 comments on commit f478fd0

Please sign in to comment.