diff --git a/README.md b/README.md index aff57e3..c0a7709 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ This is a fast C# library to validate UTF-8 strings. ## Motivation We seek to speed up the `Utf8Utility.GetPointerToFirstInvalidByte` function from the C# runtime library. -[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually. +[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually. The C# runtime +function is well optimized and it makes use of advanced CPU instructions. Nevertheless, we propose +an alternative that can be several times faster. Specifically, we provide the function `SimdUnicode.UTF8.GetPointerToFirstInvalidByte` which is a faster drop-in replacement: @@ -35,7 +37,7 @@ We apply the algorithm used by Node.js, Bun, Oracle GraalVM, by the PHP interpre ## Requirements -We recommend you install .NET 8: https://dotnet.microsoft.com/en-us/download/dotnet/8.0 +We recommend you install .NET 8 or later: https://dotnet.microsoft.com/en-us/download/dotnet/8.0 ## Running tests @@ -74,8 +76,6 @@ Or to target specific categories: dotnet test --filter "Category=scalar" ``` - - ## Running Benchmarks To run the benchmarks, run the following command: @@ -98,6 +98,28 @@ cd benchmark sudo dotnet run -c Release ``` +## Results (x64) + +To be completed. + +## Results (ARM) + +On an Apple M2 system, our validation function is roughly 1.4 to 3.7 times +faster than the standard library, depending on the data set. 
+ +| data set | SimdUnicode speed (GB/s) | .NET speed (GB/s) | +|:----------------|:-----------|:--------------------------| +| Arabic-Lipsum | 6.7 | 3.5 | +| Chinese-Lipsum | 6.7 | 4.8 | +| Emoji-Lipsum | 6.7 | 2.5 | +| Hebrew-Lipsum | 6.7 | 3.5 | +| Hindi-Lipsum | 6.8 | 3.0 | +| Japanese-Lipsum | 6.8 | 4.6  | +| Korean-Lipsum | 6.6 | 1.8 | +| Latin-Lipsum | 87 | 38 | +| Russian-Lipsum | 6.7 | 2.6 | + + ## Building the library ``` @@ -139,7 +161,7 @@ You can print the content of a vector register like so: ## More reading -- https://github.com/dotnet/coreclr/pull/21948/files#diff-2a22774bd6bff8e217ecbb3a41afad033ce0ca0f33645e9d8f5bdf7c9e3ac248 +- [Add optimized UTF-8 validation and transcoding apis, hook them up to UTF8Encoding](https://github.com/dotnet/coreclr/pull/21948/files#diff-2a22774bd6bff8e217ecbb3a41afad033ce0ca0f33645e9d8f5bdf7c9e3ac248) - https://github.com/dotnet/runtime/issues/41699 - https://learn.microsoft.com/en-us/dotnet/standard/design-guidelines/ - https://learn.microsoft.com/en-us/dotnet/csharp/fundamentals/coding-style/coding-conventions diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs index 3f2a303..e3af605 100644 --- a/benchmark/Benchmark.cs +++ b/benchmark/Benchmark.cs @@ -5,6 +5,7 @@ using BenchmarkDotNet.Configs; using BenchmarkDotNet.Reports; using BenchmarkDotNet.Filters; +using BenchmarkDotNet.Jobs; using System.Text; using System.Runtime; using System.Runtime.InteropServices; @@ -272,10 +273,23 @@ public unsafe void SIMDUtf8ValidationRealDataSse() } public class Program { - static void Main(string[] args) => BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, DefaultConfig.Instance - .WithSummaryStyle(SummaryStyle.Default.WithMaxParameterColumnWidth(100))); - + static void Main(string[] args) + { + if (args.Length == 0) + { + args = new string[] { "--filter", "*" }; + } + var job = Job.Default + .WithWarmupCount(1) + .WithMinIterationCount(2) + .WithMaxIterationCount(10) + .AsDefault(); + 
BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, DefaultConfig.Instance.AddJob(job).WithSummaryStyle(SummaryStyle.Default.WithMaxParameterColumnWidth(100))); + } } + + // } + } diff --git a/benchmark/UTF8_runtime.cs b/benchmark/UTF8_runtime.cs index 0530d65..1a31b76 100644 --- a/benchmark/UTF8_runtime.cs +++ b/benchmark/UTF8_runtime.cs @@ -9,22 +9,8 @@ using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; -// Changes made from the Runtime (most of the stuff in the runtime is behind some private/internal class or some such. The path of least resistance was to copy paste. -// Copy pasted from: https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,ca18aaa87cbdbfe8 -// https://source.dot.net/#System.Text.Encodings.Web/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs,7d9353a7dd29c82b -// https://source.dot.net/#System.Text.Encodings.Web/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeDebug.cs,ac87f1ec2c614dce -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,0ca89dfb3ec2a5da -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,87a9969a4e35fdde -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,ff5eb1221e0665eb -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,068a221b81a99840 -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,74fc59ef51d5afa8 -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,7fc454ccbfa2fc31 -// 
https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,70ca16f03c3cc41b -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,5a3126ae65f71ef2 -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,6ee61246657067fb -// https://source.dot.net/#System.Private.CoreLib/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs,706384069881fe22 -// - +// Copy pasted from the System/Text/Unicode/Utf8Utility.Helpers.cs and associated files. +// Important: copyright belongs to the .NET Foundation. namespace DotnetRuntime { diff --git a/src/UTF8.cs b/src/UTF8.cs index e38cb34..edffb54 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -24,7 +24,7 @@ public static class UTF8 public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment) { - if (AdvSimd.Arm64.IsSupported) + if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) { return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); } @@ -707,7 +707,6 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); } -// public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { int processedLength = 0; @@ -851,7 +850,6 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust { // We have an ASCII block, no need to process it, but // we need to check if the 
previous block was incomplete. - // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { int off = processedLength >= 3 ? processedLength - 3 : processedLength;