Skip to content

Commit

Permalink
More C# idiomatic utf8random
Browse files Browse the repository at this point in the history
  • Loading branch information
Nick-Nuon committed Nov 16, 2023
1 parent 36d121d commit fe4e0cd
Showing 1 changed file with 33 additions and 58 deletions.
91 changes: 33 additions & 58 deletions test/helpers/randomutf8.cs
Original file line number Diff line number Diff line change
@@ -1,35 +1,29 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

/// <summary>
/// Test helper that produces pseudo-random, well-formed UTF-8 byte sequences.
/// Each generated character is encoded on 1, 2, 3 or 4 bytes; the encoded length
/// is chosen at random according to the relative weights given to the constructor.
/// The output is fully determined by the seed.
/// </summary>
public class RandomUtf8
{
    // Maximum number of bytes a UTF-8 encoded code point can occupy (per the standard).
    private const int maxByteLength = 4;

    // Relative weight of each encoded length: index 0 is 1-byte, index 3 is 4-byte.
    // The weights do not need to sum to 1; they are normalised when a length is picked.
    private readonly double[] probabilities;

    // Internal random number generator. Replaced when Generate(int, long) reseeds.
    private Random gen;

    /// <summary>
    /// Creates a generator with the given seed and per-length weights.
    /// </summary>
    /// <param name="seed">Seed for the internal generator (reinterpreted as a signed 32-bit value).</param>
    /// <param name="prob_1byte">Relative weight of 1-byte (ASCII) characters.</param>
    /// <param name="prob_2bytes">Relative weight of 2-byte characters (U+0080..U+07FF).</param>
    /// <param name="prob_3bytes">Relative weight of 3-byte characters (U+0800..U+FFFF, surrogates excluded).</param>
    /// <param name="prob_4bytes">Relative weight of 4-byte characters (U+10000..U+10FFFF).</param>
    public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, int prob_4bytes)
    {
        // unchecked: seeds above int.MaxValue wrap instead of throwing in checked builds.
        gen = new Random(unchecked((int)seed));
        probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes };
    }

    /// <summary>
    /// Generates random, valid UTF-8 of at most <paramref name="outputBytes"/> bytes.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the length of the returned array.</param>
    /// <returns>
    /// A well-formed UTF-8 byte array. Generation stops as soon as the next character
    /// would not fit, so the result can be up to 3 bytes shorter than requested.
    /// A non-positive request yields an empty array.
    /// </returns>
    public byte[] Generate(int outputBytes)
    {
        // Presize the buffer; clamp so that a negative request still returns an empty array.
        var result = new List<byte>(Math.Max(outputBytes, 0));

        while (result.Count < outputBytes)
        {
            int byteCount = PickRandomByteCount();
            int codePoint = GenerateCodePoint(byteCount);
            byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint));

            // Never split a character: stop rather than exceed the requested length.
            if (result.Count + utf8Bytes.Length > outputBytes)
            {
                break;
            }

            result.AddRange(utf8Bytes);
        }

        return result.ToArray();
    }

    /// <summary>
    /// Same as <see cref="Generate(int)"/>, but also returns the number of bytes produced.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the length of the returned array.</param>
    /// <returns>The generated bytes and their count (always equal to the array length).</returns>
    public (byte[] utf8, int count) GenerateCounted(int outputBytes)
    {
        byte[] utf8 = Generate(outputBytes);
        return (utf8, utf8.Length);
    }

    /// <summary>
    /// Reseeds the internal generator, then generates as <see cref="Generate(int)"/> does.
    /// </summary>
    /// <param name="outputBytes">Upper bound on the length of the returned array.</param>
    /// <param name="seed">New seed; only the low 32 bits are used.</param>
    /// <returns>A well-formed UTF-8 byte array of at most <paramref name="outputBytes"/> bytes.</returns>
    public byte[] Generate(int outputBytes, long seed)
    {
        gen = new Random(unchecked((int)seed));
        return Generate(outputBytes);
    }

    // Returns a random Unicode scalar value whose UTF-8 encoding is exactly byteCount bytes long.
    // Random.Next(min, max) excludes max, hence the "+ 1" on every inclusive upper bound.
    private int GenerateCodePoint(int byteCount)
    {
        switch (byteCount)
        {
            case 1:
                // ASCII: U+0000..U+007F
                return gen.Next(0x0000, 0x007F + 1);
            case 2:
                // U+0080..U+07FF
                return gen.Next(0x0080, 0x07FF + 1);
            case 3:
                // U+0800..U+FFFF minus the surrogate block U+D800..U+DFFF, which is not
                // encodable. The two sub-ranges are picked with equal probability.
                if (gen.NextDouble() < 0.5)
                {
                    return gen.Next(0x0800, 0xD7FF + 1);
                }
                else
                {
                    return gen.Next(0xE000, 0xFFFF + 1);
                }
            case 4:
                // Supplementary planes: U+10000..U+10FFFF (upper bound made inclusive).
                return gen.Next(0x010000, 0x10FFFF + 1);
            default:
                throw new InvalidOperationException($"Invalid byte count: {byteCount}");
        }
    }

    // Picks an encoded length (1..4) at random, proportionally to the configured weights.
    private int PickRandomByteCount()
    {
        double total = probabilities.Sum();
        double randomValue = gen.NextDouble() * total;

        // Degenerate weights (nothing to choose from): fall back to single-byte characters.
        if (total <= 0.0)
        {
            return 1;
        }

        double cumulative = 0.0;
        for (int i = 0; i < maxByteLength; i++)
        {
            cumulative += probabilities[i];

            // Strict comparison: a draw of exactly 0.0 must not select a zero-weight length.
            if (randomValue < cumulative)
            {
                return i + 1;
            }
        }

        // Only reachable through floating-point rounding.
        return maxByteLength;
    }
}

0 comments on commit fe4e0cd

Please sign in to comment.