From 7581aa74923d47957a9ce7ae805e0da52bb27bd9 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 4 Dec 2024 16:52:33 -0700 Subject: [PATCH] UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan methods, #1024 --- src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs | 51 +++++--- src/Lucene.Net/Util/BytesRef.cs | 20 +++ src/Lucene.Net/Util/UnicodeUtil.cs | 122 ++++++++++++++---- 3 files changed, 152 insertions(+), 41 deletions(-) diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs index bab16426f7..cdace9c1cc 100644 --- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs +++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs @@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs public static class BlockTreeTermsWriter { /// - /// Suggested default value for the - /// minItemsInBlock parameter to + /// Suggested default value for the + /// minItemsInBlock parameter to /// . /// public const int DEFAULT_MIN_BLOCK_SIZE = 25; /// - /// Suggested default value for the - /// maxItemsInBlock parameter to + /// Suggested default value for the + /// maxItemsInBlock parameter to /// . /// public const int DEFAULT_MAX_BLOCK_SIZE = 48; @@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long /// to set state. It is *optional* and can be used when overriding the WriteHeader(), /// WriteIndexHeader(). It only matters in the case where the state /// is required inside of any of those methods that is passed in to the subclass constructor. - /// + /// /// When passed to the constructor, it is set to the protected field m_subclassState before /// any of the above methods are called where it is available for reading when overriding the above methods. - /// + /// /// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so. 
-        /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, 
+        /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
         /// so the overrides of those methods won't specifically need to use this field (although they could for consistency).
         ///
         [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
@@ -468,7 +468,20 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f
 
             public override string ToString()
             {
-                return "BLOCK: " + Prefix.Utf8ToString();
+                return $"BLOCK: {Prefix.Utf8ToString()}";
+            }
+
+            #nullable enable
+            public bool TryToString([NotNullWhen(true)] out string? result) // ToString() counterpart that reports a failed Prefix decode through its return value
+            {
+                if (!Prefix.TryUtf8ToString(out string? decodedPrefix))
+                {
+                    result = null;
+                    return false;
+                }
+
+                result = $"BLOCK: {decodedPrefix}";
+                return true;
             }
 
             // LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions
@@ -476,12 +489,11 @@ public override string ToString()
             // to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors.
             // This struct defers formatting the string until it is actually used as a parameter
             // in string.Format().
-            private struct PendingBlocksFormatter // For assert
+            private readonly struct PendingBlocksFormatter // For assert
             {
-#pragma warning disable IDE0044 // Add readonly modifier
-                private IList blocks;
-#pragma warning restore IDE0044 // Add readonly modifier
-                public PendingBlocksFormatter(IList blocks)
+                private readonly IList? blocks;
+
+                public PendingBlocksFormatter(IList? blocks)
                 {
                     this.blocks = blocks; // May be null
                 }
@@ -500,17 +512,17 @@ public override string ToString() // For assert
                     it.MoveNext();
                     while (true)
                     {
-                        var e = it.Current;
+                        var e = it.Current ??
throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above.");
                         // There is a chance that the Prefix will contain invalid UTF8,
                         // so we catch that and use the alternative way of displaying it
-                        try
+                        if (e.TryToString(out string? eString))
                         {
-                            sb.Append(e.ToString());
+                            sb.Append(eString);
                         }
-                        catch (IndexOutOfRangeException)
+                        else
                         {
                             sb.Append("BLOCK: ");
-                            sb.Append(e.Prefix.ToString());
+                            sb.Append(e.Prefix);
                         }
                         if (!it.MoveNext())
                         {
@@ -520,6 +532,7 @@ public override string ToString() // For assert
                     }
                 }
             }
+            #nullable restore
 
             public void CompileIndex(IList floorBlocks, RAMOutputStream scratchBytes)
             {
@@ -1351,4 +1364,4 @@ protected override void Dispose(bool disposing)
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
index b381f31eb8..9ea8566c33 100644
--- a/src/Lucene.Net/Util/BytesRef.cs
+++ b/src/Lucene.Net/Util/BytesRef.cs
@@ -243,6 +243,26 @@ public string Utf8ToString()
             return @ref.ToString();
         }
 
+        #nullable enable
+        /// <summary>
+        /// Attempts to decode the stored bytes as UTF-8, handing the decoded
+        /// <see cref="string"/> back through <paramref name="result"/>.
+        /// </summary>
+        /// <param name="result">The decoded string when this method returns <c>true</c>; otherwise, <c>null</c>.</param>
+        /// <returns><c>true</c> if the bytes could be decoded; otherwise, <c>false</c>.</returns>
+        public bool TryUtf8ToString([NotNullWhen(true)] out string? result)
+        {
+            if (!UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? decoded))
+            {
+                result = null;
+                return false;
+            }
+
+            result = decoded.ToString();
+            return true;
+        }
+        #nullable restore
+
         ///
         /// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65]
         public override string ToString()
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 5974af1a16..65dd2fabc9 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -3,6 +3,7 @@
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Support;
 using System;
+using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Text;
 
@@ -123,13 +124,13 @@ public static class UnicodeUtil
         private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
 
         ///
-        /// Encode characters from a , starting at
+        /// Encode characters from a (with generic type argument ) , starting at
         /// and ending at . After encoding, result.Offset will always be 0.
         ///
         /// is null.
         // TODO: broken if incoming result.offset != 0
         // LUCENENET specific overload
-        public static void UTF16toUTF8(Span source, BytesRef result)
+        public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
         {
             // LUCENENET: Added guard clause
             if (result is null)
@@ -200,7 +201,7 @@ public static void UTF16toUTF8(Span source, BytesRef result)
         }
 
         ///
-        /// Encode characters from a , starting at
+        /// Encode characters from a (with generic type argument ) , starting at
         /// for chars. After encoding, result.Offset will always be 0.
         ///
         /// or is null.
@@ -212,11 +213,9 @@ public static void UTF16toUTF8(Span source, BytesRef result)
         /// and refer to a location outside of .
/// // TODO: broken if incoming result.offset != 0 - public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) + public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result) { // LUCENENET: Added guard clauses - if (source is null) - throw new ArgumentNullException(nameof(source)); if (result is null) throw new ArgumentNullException(nameof(result)); if (offset < 0) @@ -633,7 +632,7 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl return true; } - public static bool ValidUTF16String(char[] s, int size) + public static bool ValidUTF16String(ReadOnlySpan s, int size) { for (int i = 0; i < size; i++) { @@ -828,16 +827,16 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); /// - /// Cover JDK 1.5 API. Create a String from an array of . + /// Cover JDK 1.5 API. Create a String from a span of . /// - /// The code array. - /// The start of the text in the code point array. + /// The code point span. + /// The start of the text in the code point span. /// The number of code points. /// a String representing the code points between offset and count. /// If an invalid code point is encountered. /// If the offset or count are out of bounds. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static string NewString(int[] codePoints, int offset, int count) + public static string NewString(ReadOnlySpan codePoints, int offset, int count) { // LUCENENET: Character.ToString() was optimized to use the stack for arrays // of codepoints 256 or less, so it performs better than using ToCharArray(). @@ -849,26 +848,26 @@ public static string NewString(int[] codePoints, int offset, int count) /// /// LUCENENET specific. /// - /// The code array. - /// The start of the text in the code point array. + /// The code span. 
+ /// The start of the text in the code point span. /// The number of code points. /// a char array representing the code points between offset and count. // LUCENENET NOTE: This code was originally in the NewString() method (above). // It has been refactored from the original to remove the exception throw/catch and - // instead proactively resizes the array instead of relying on excpetions + copy operations - public static char[] ToCharArray(int[] codePoints, int offset, int count) + // instead proactively resizes the array instead of relying on exceptions + copy operations + public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int count) { if (count < 0) { throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } - const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 + const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 // LUCENENET: as a first approximation, assume each codepoint // is 2 characters (since it cannot be longer than this) int arrayLength = count * 2; - // LUCENENET: if we go over the threashold, count the number of + // LUCENENET: if we go over the threshold, count the number of // chars we will need so we can allocate the precise amount of memory - if (count > countThreashold) + if (count > countThreshold) { arrayLength = 0; for (int r = offset, e = offset + count; r < e; ++r) @@ -951,15 +950,18 @@ public static string ToHexString(string s) } /// - /// Interprets the given byte array as UTF-8 and converts to UTF-16. The will be extended if + /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. 
/// /// NOTE: Full characters are read, even if this reads past the length passed (and /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// // TODO: broken if chars.offset != 0 - public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) + public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars) { int out_offset = chars.Offset = 0; char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length); @@ -1001,9 +1003,85 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha chars.Length = out_offset - chars.Offset; } + #nullable enable + /// + /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . + /// + /// NOTE: Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// + public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? 
chars) + { + CharsRef result = new CharsRef(length); + int out_offset = 0; + char[] @out = result.Chars; + int limit = offset + length; + while (offset < limit) + { + if (utf8.Length <= offset) + { + chars = null; + return false; + } + + int b = utf8[offset++] & 0xff; + if (b < 0xc0) + { + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); + @out[out_offset++] = (char)b; + } + else if (b < 0xe0) + { + if (utf8.Length <= offset) + { + chars = null; + return false; + } + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + } + else if (b < 0xf0) + { + if (utf8.Length <= offset + 1) + { + chars = null; + return false; + } + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); + offset += 2; + } + else + { + if (utf8.Length <= offset + 2) + { + chars = null; + return false; + } + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); + int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); + offset += 3; + if (ch < UNI_MAX_BMP) + { + @out[out_offset++] = (char)ch; + } + else + { + int chHalf = ch - 0x0010000; + @out[out_offset++] = (char)((chHalf >> 10) + 0xD800); + @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00); + } + } + } + result.Length = out_offset; + chars = result; + return true; + } + #nullable restore + /// - /// Utility method for - /// + /// Utility method for + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) {