diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
index bb8e736a7c..be98e7a3d4 100644
--- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
+++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -327,5 +327,18 @@ public virtual void TestUTF8UTF16CharsRef()
Assert.AreEqual(cRef.ToString(), unicode);
}
}
+
+        [Test]
+        [LuceneNetSpecific] // this is a Lucene.NET specific method
+        public void TestTryUTF8toUTF16()
+        {
+            // Arrange: encode a random string as UTF-8 and wrap the bytes in a BytesRef.
+            string expected = TestUtil.RandomRealisticUnicodeString(Random);
+            var encoded = IOUtils.CHARSET_UTF_8.GetBytes(expected);
+            var utf8 = new BytesRef(encoded);
+
+            // Act: decode through the Try* API under test.
+            bool decoded = UnicodeUtil.TryUTF8toUTF16(utf8, out var chars);
+
+            // Assert: decoding reported success and round-trips to the original string.
+            Assert.IsTrue(decoded);
+            Assert.AreEqual(expected, chars?.ToString());
+        }
}
}
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 65dd2fabc9..3069ef0379 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -6,6 +6,7 @@
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Text;
+#nullable enable
namespace Lucene.Net.Util
{
@@ -108,7 +109,10 @@ public static class UnicodeUtil
///
/// WARNING: this is not a valid UTF8 Term
///
- public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }); // TODO this is unrelated here find a better place for it
+ public static readonly BytesRef BIG_TERM = new BytesRef(new byte[]
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }); // TODO this is unrelated here find a better place for it
public const int UNI_SUR_HIGH_START = 0xD800;
public const int UNI_SUR_HIGH_END = 0xDBFF;
@@ -121,7 +125,8 @@ public static class UnicodeUtil
private const long HALF_SHIFT = 10;
private const long HALF_MASK = 0x3FFL;
- private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
+ private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint -
+ (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
///
/// Encode characters from a (with generic type argument ) , starting at
@@ -149,6 +154,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
{
@out = result.Bytes = new byte[maxLen];
}
+
result.Offset = 0;
while (i < end)
@@ -189,6 +195,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -196,12 +203,13 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
@out[upto++] = 0xBD;
}
}
+
//assert matches(source, offset, length, out, upto);
result.Length = upto;
}
///
- /// Encode characters from a (with generic type argument ) , starting at
+ /// Encode characters from a , starting at
/// for chars. After encoding, result.Offset will always be 0.
///
/// or is null.
@@ -213,6 +221,31 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
/// and refer to a location outside of .
///
// TODO: broken if incoming result.offset != 0
+        public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
+        {
+            // LUCENENET: Added guard clauses
+            // A null array is rejected here; wrapping it in a span first would hide the null.
+            ReadOnlySpan<char> chars = source ?? throw new ArgumentNullException(nameof(source));
+
+            // Range validation of offset/length is performed by the span-based overload.
+            UTF16toUTF8(chars, offset, length, result);
+        }
+
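+        /// <summary>
+        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
+        /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+        /// <para/>
+        /// -or-
+        /// <para/>
+        /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+        /// </exception>
+        /// <remarks>
+        /// LUCENENET specific overload.
+        /// </remarks>
+        // TODO: broken if incoming result.offset != 0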
public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result)
{
// LUCENENET: Added guard clauses
@@ -223,7 +256,8 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length
if (length < 0)
throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
if (offset > source.Length - length) // Checks for int overflow
- throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+ throw new ArgumentOutOfRangeException(nameof(length),
+ $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
int upto = 0;
int i = offset;
@@ -235,6 +269,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length
{
@out = result.Bytes = new byte[maxLen];
}
+
result.Offset = 0;
while (i < end)
@@ -275,6 +310,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -282,6 +318,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length
@out[upto++] = 0xBD;
}
}
+
//assert matches(source, offset, length, out, upto);
result.Length = upto;
}
@@ -311,7 +348,8 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
if (length < 0)
throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
if (offset > source.Length - length) // Checks for int overflow
- throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+ throw new ArgumentOutOfRangeException(nameof(length),
+ $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
int end = offset + length;
@@ -362,6 +400,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -369,6 +408,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
@out[upto++] = 0xBD;
}
}
+
//assert matches(s, offset, length, out, upto);
result.Length = upto;
}
@@ -400,7 +440,8 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
if (length < 0)
throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
if (offset > source.Length - length) // Checks for int overflow
- throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+ throw new ArgumentOutOfRangeException(nameof(length),
+ $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
int end = offset + length;
@@ -451,6 +492,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -458,6 +500,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
@out[upto++] = 0xBD;
}
}
+
//assert matches(s, offset, length, out, upto);
result.Length = upto;
}
@@ -535,19 +578,19 @@ public static bool ValidUTF16String(ICharSequence s)
// Valid surrogate pair
}
else
- // Unmatched high surrogate
+ // Unmatched high surrogate
{
return false;
}
}
else
- // Unmatched high surrogate
+ // Unmatched high surrogate
{
return false;
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
+ // Unmatched low surrogate
{
return false;
}
@@ -556,7 +599,8 @@ public static bool ValidUTF16String(ICharSequence s)
return true;
}
- public static bool ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
+ public static bool
+ ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
{
int size = s.Length;
for (int i = 0; i < size; i++)
@@ -573,19 +617,19 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec
// Valid surrogate pair
}
else
- // Unmatched high surrogate
+ // Unmatched high surrogate
{
return false;
}
}
else
- // Unmatched high surrogate
+ // Unmatched high surrogate
{
return false;
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
+ // Unmatched low surrogate
{
return false;
}
@@ -594,7 +638,9 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec
return true;
}
- public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
+ public static bool
+ ValidUTF16String(
+ StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
{
int size = s.Length;
for (int i = 0; i < size; i++)
@@ -611,19 +657,19 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
// Valid surrogate pair
}
else
- // Unmatched high surrogate
+ // Unmatched high surrogate
{
return false;
}
}
else
- // Unmatched high surrogate
+ // Unmatched high surrogate
{
return false;
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
+ // Unmatched low surrogate
{
return false;
}
@@ -632,6 +678,8 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
return true;
}
+        /// <summary>
+        /// Array convenience overload: views <paramref name="s"/> as a span and forwards it,
+        /// together with <paramref name="size"/>, to the span-based <c>ValidUTF16String</c> overload.
+        /// </summary>
+        /// <param name="s"> The chars to inspect. </param>
+        /// <param name="size"> The number of leading chars the span overload iterates over. </param>
+        /// <returns> The result of the span-based overload. </returns>
+        public static bool ValidUTF16String(char[] s, int size)
+        {
+            ReadOnlySpan<char> chars = s.AsSpan();
+            return ValidUTF16String(chars, size);
+        }
+
public static bool ValidUTF16String(ReadOnlySpan s, int size)
{
for (int i = 0; i < size; i++)
@@ -658,7 +706,7 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size)
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
+ // Unmatched low surrogate
{
return false;
}
@@ -676,10 +724,13 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size)
/* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF)
* means illegal prefix. see RFC 2279 for details */
internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength();
- private static int[] LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+
+ private static int[]
+ LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
- int v = int.MinValue;
- return new int[] {
+ const int v = int.MinValue;
+ return new int[]
+ {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -720,12 +771,31 @@ public static int CodePointCount(BytesRef utf8)
for (; pos < limit; codePointCount++)
{
int v = bytes[pos] & 0xFF;
- if (v < /* 0xxx xxxx */ 0x80) { pos += 1; continue; }
- if (v >= /* 110x xxxx */ 0xc0)
+ if (v < /* 0xxx xxxx */ 0x80)
+ {
+ pos += 1;
+ continue;
+ }
+
+ if (v >= /* 110x xxxx */ 0xc0)
{
- if (v < /* 111x xxxx */ 0xe0) { pos += 2; continue; }
- if (v < /* 1111 xxxx */ 0xf0) { pos += 3; continue; }
- if (v < /* 1111 1xxx */ 0xf8) { pos += 4; continue; }
+ if (v < /* 111x xxxx */ 0xe0)
+ {
+ pos += 2;
+ continue;
+ }
+
+ if (v < /* 1111 xxxx */ 0xf0)
+ {
+ pos += 3;
+ continue;
+ }
+
+ if (v < /* 1111 1xxx */ 0xf8)
+ {
+ pos += 4;
+ continue;
+ }
// fallthrough, consider 5 and 6 byte sequences invalid.
}
@@ -756,6 +826,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
{
utf32.Int32s = new int[utf8.Length];
}
+
int utf32Count = 0;
int utf8Upto = utf8.Offset;
int[] ints = utf32.Int32s;
@@ -795,6 +866,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
{
v = v << 6 | bytes[utf8Upto++] & 63;
}
+
ints[utf32Count++] = v;
}
@@ -824,7 +896,25 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
///
/// Value that all lead surrogate starts with.
- private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+ private const int LEAD_SURROGATE_OFFSET_ =
+ LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+
+        /// <summary>
+        /// Cover JDK 1.5 API. Create a String from an array of <paramref name="codePoints"/>.
+        /// </summary>
+        /// <param name="codePoints"> The code point array. </param>
+        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="count"> The number of code points. </param>
+        /// <returns> a String representing the code points between offset and count. </returns>
+        /// <remarks>
+        /// An exception is raised if an invalid code point is encountered or if the offset or count
+        /// are out of bounds. NOTE(review): the concrete exception types come from
+        /// <c>Character.ToString</c> and are not visible here - confirm before documenting them with cref.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static string NewString(int[] codePoints, int offset, int count)
+            // LUCENENET: Character.ToString() was optimized to use the stack for arrays
+            // of codepoints 256 or less, so it performs better than using ToCharArray().
+            => Character.ToString(codePoints, offset, count);
///
/// Cover JDK 1.5 API. Create a String from a span of .
@@ -843,6 +933,23 @@ public static string NewString(ReadOnlySpan codePoints, int offset, int cou
return Character.ToString(codePoints, offset, count);
}
+        /// <summary>
+        /// Generates char array that represents the provided input code points.
+        /// <para/>
+        /// LUCENENET specific.
+        /// </summary>
+        /// <param name="codePoints"> The code array. </param>
+        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="count"> The number of code points. </param>
+        /// <returns> a char array representing the code points between offset and count. </returns>
+        // LUCENENET NOTE: This code was originally in the NewString() method (above).
+        // It has been refactored from the original to remove the exception throw/catch and
+        // instead proactively resizes the array instead of relying on exceptions + copy operations
+        public static char[] ToCharArray(int[] codePoints, int offset, int count)
+        {
+            // This array overload only adapts the argument; the conversion itself is done by the span overload.
+            ReadOnlySpan<int> span = codePoints.AsSpan();
+            return ToCharArray(span, offset, count);
+        }
+
///
/// Generates char array that represents the provided input code points.
///
@@ -949,6 +1056,20 @@ public static string ToHexString(string s)
return sb.ToString();
}
+        /// <summary>
+        /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <paramref name="chars"/> will be extended if
+        /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+        /// <para/>
+        /// NOTE: Full characters are read, even if this reads past the length passed (and
+        /// can result in an exception if invalid UTF-8 is passed).
+        /// Explicit checks for valid UTF-8 are not performed.
+        /// </summary>
+        /// <remarks>
+        /// This overload views <paramref name="utf8"/> as a span and forwards to the span-based overload.
+        /// </remarks>
+        // TODO: broken if chars.offset != 0
+        public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
+            => UTF8toUTF16(utf8.AsSpan(), offset, length, chars);
+
///
/// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if
/// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
@@ -958,7 +1079,7 @@ public static string ToHexString(string s)
/// Explicit checks for valid UTF-8 are not performed.
///
///
- /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[].
+ /// LUCENENET specific overload.
///
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars)
@@ -1003,7 +1124,6 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length,
chars.Length = out_offset - chars.Offset;
}
- #nullable enable
///
/// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new .
///
@@ -1077,7 +1197,6 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt
chars = result;
return true;
}
- #nullable restore
///
/// Utility method for
@@ -1085,7 +1204,15 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{
- UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars);
+ UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars); // AsSpan() targets the span overload directly instead of going through the byte[] overload, which only forwards to it
+ }
+
+        /// <summary>
+        /// Utility method for the span-based <c>TryUTF8toUTF16</c> overload with a <see cref="BytesRef"/>:
+        /// forwards the <c>Bytes</c>, <c>Offset</c> and <c>Length</c> of <paramref name="bytesRef"/>.
+        /// </summary>
+        /// <param name="bytesRef"> The UTF-8 bytes to convert. </param>
+        /// <param name="chars"> Receives whatever the span-based overload assigns. </param>
+        /// <returns> The result of the span-based overload. </returns>
+        // NOTE(review): the span overload assigns its result to the out parameter before returning true;
+        // consider annotating chars with [NotNullWhen(true)] if that attribute is available on every
+        // target framework - confirm before changing the signature.
+        public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars)
+        {
+            ReadOnlySpan<byte> utf8 = bytesRef.Bytes.AsSpan();
+            return TryUTF8toUTF16(utf8, bytesRef.Offset, bytesRef.Length, out chars);
+        }
}
}