Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize offsetGet #506

Merged
merged 4 commits into from
Sep 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/Tools/CustomJsonSerializer.php
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class CustomJsonSerializer extends JsonSerializer
'viewOptions',
'eventOptions',
'userOptions',
'asciiMap',
];

/**
Expand Down
186 changes: 15 additions & 171 deletions src/UtfString.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@

use function mb_check_encoding;
use function mb_strlen;
use function mb_substr;
use function ord;
use function strlen;
use function substr;

/**
* Implementation for UTF-8 strings.
Expand Down Expand Up @@ -68,128 +71,15 @@
*/
public $charLen = 0;

/**
 * A map from single-byte strings to their ASCII code.
 *
 * It is consulted by getCharLength() so that the common case (an ASCII byte)
 * is resolved with an array lookup instead of a call to ord($byte).
 * The initial table lists NUL, the whitespace controls HT..CR and the codes 32..127;
 * any other byte is added by getCharLength() the first time it is seen.
 *
 * Note that PHP stores the digit keys '0'..'9' as integer keys, which is why
 * the key type below is int|string.
 *
 * Source: https://www.freecodecamp.org/news/ascii-table-hex-to-ascii-value-character-code-chart-2/
 *
 * @var array<int|string,int>
 */
protected static $asciiMap = [
"\0" => 0, // (00000000) NUL Null
"\t" => 9, // (00001001) HT Horizontal Tab
"\n" => 10, // (00001010) LF Newline / Line Feed
"\v" => 11, // (00001011) VT Vertical Tab
"\f" => 12, // (00001100) FF Form Feed
"\r" => 13, // (00001101) CR Carriage Return
' ' => 32, // (00100000) SP Space
'!' => 33, // (00100001) ! Exclamation mark
'"' => 34, // (00100010) " Double quote
'#' => 35, // (00100011) # Number
'$' => 36, // (00100100) $ Dollar
'%' => 37, // (00100101) % Percent
'&' => 38, // (00100110) & Ampersand
'\'' => 39, // (00100111) ' Single quote
'(' => 40, // (00101000) ( Left parenthesis
')' => 41, // (00101001) ) Right parenthesis
'*' => 42, // (00101010) * Asterisk
'+' => 43, // (00101011) + Plus
',' => 44, // (00101100) , Comma
'-' => 45, // (00101101) - Minus
'.' => 46, // (00101110) . Period
'/' => 47, // (00101111) / Slash
'0' => 48, // (00110000) 0 Zero
'1' => 49, // (00110001) 1 One
'2' => 50, // (00110010) 2 Two
'3' => 51, // (00110011) 3 Three
'4' => 52, // (00110100) 4 Four
'5' => 53, // (00110101) 5 Five
'6' => 54, // (00110110) 6 Six
'7' => 55, // (00110111) 7 Seven
'8' => 56, // (00111000) 8 Eight
'9' => 57, // (00111001) 9 Nine
':' => 58, // (00111010) : Colon
';' => 59, // (00111011) ; Semicolon
'<' => 60, // (00111100) < Less than
'=' => 61, // (00111101) = Equal sign
'>' => 62, // (00111110) > Greater than
'?' => 63, // (00111111) ? Question mark
'@' => 64, // (01000000) @ At sign
'A' => 65, // (01000001) A Uppercase A
'B' => 66, // (01000010) B Uppercase B
'C' => 67, // (01000011) C Uppercase C
'D' => 68, // (01000100) D Uppercase D
'E' => 69, // (01000101) E Uppercase E
'F' => 70, // (01000110) F Uppercase F
'G' => 71, // (01000111) G Uppercase G
'H' => 72, // (01001000) H Uppercase H
'I' => 73, // (01001001) I Uppercase I
'J' => 74, // (01001010) J Uppercase J
'K' => 75, // (01001011) K Uppercase K
'L' => 76, // (01001100) L Uppercase L
'M' => 77, // (01001101) M Uppercase M
'N' => 78, // (01001110) N Uppercase N
'O' => 79, // (01001111) O Uppercase O
'P' => 80, // (01010000) P Uppercase P
'Q' => 81, // (01010001) Q Uppercase Q
'R' => 82, // (01010010) R Uppercase R
'S' => 83, // (01010011) S Uppercase S
'T' => 84, // (01010100) T Uppercase T
'U' => 85, // (01010101) U Uppercase U
'V' => 86, // (01010110) V Uppercase V
'W' => 87, // (01010111) W Uppercase W
'X' => 88, // (01011000) X Uppercase X
'Y' => 89, // (01011001) Y Uppercase Y
'Z' => 90, // (01011010) Z Uppercase Z
'[' => 91, // (01011011) [ Left square bracket
'\\' => 92, // (01011100) \ Backslash
']' => 93, // (01011101) ] Right square bracket
'^' => 94, // (01011110) ^ Caret / circumflex
'_' => 95, // (01011111) _ Underscore
'`' => 96, // (01100000) ` Grave / accent
'a' => 97, // (01100001) a Lowercase a
'b' => 98, // (01100010) b Lowercase b
'c' => 99, // (01100011) c Lowercase c
'd' => 100, // (01100100) d Lowercase d
'e' => 101, // (01100101) e Lowercase e
'f' => 102, // (01100110) f Lowercase f
'g' => 103, // (01100111) g Lowercase g
'h' => 104, // (01101000) h Lowercase h
'i' => 105, // (01101001) i Lowercase i
'j' => 106, // (01101010) j Lowercase j
'k' => 107, // (01101011) k Lowercase k
'l' => 108, // (01101100) l Lowercase l
'm' => 109, // (01101101) m Lowercase m
'n' => 110, // (01101110) n Lowercase n
'o' => 111, // (01101111) o Lowercase o
'p' => 112, // (01110000) p Lowercase p
'q' => 113, // (01110001) q Lowercase q
'r' => 114, // (01110010) r Lowercase r
's' => 115, // (01110011) s Lowercase s
't' => 116, // (01110100) t Lowercase t
'u' => 117, // (01110101) u Lowercase u
'v' => 118, // (01110110) v Lowercase v
'w' => 119, // (01110111) w Lowercase w
'x' => 120, // (01111000) x Lowercase x
'y' => 121, // (01111001) y Lowercase y
'z' => 122, // (01111010) z Lowercase z
'{' => 123, // (01111011) { Left curly bracket
'|' => 124, // (01111100) | Vertical bar
'}' => 125, // (01111101) } Right curly bracket
'~' => 126, // (01111110) ~ Tilde
"\x7f" => 127, // (01111111) DEL Delete
];

/**
* @param string $str the string
*/
public function __construct($str)
{
$this->str = $str;
$this->byteLen = mb_strlen($str, '8bit');

Check warning on line 80 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "MBString": --- Original +++ New @@ @@ public function __construct($str) { $this->str = $str; - $this->byteLen = mb_strlen($str, '8bit'); + $this->byteLen = strlen($str); if (!mb_check_encoding($str, 'UTF-8')) { $this->charLen = 0; } else {
if (! mb_check_encoding($str, 'UTF-8')) {
$this->charLen = 0;

Check warning on line 82 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "DecrementInteger": --- Original +++ New @@ @@ $this->str = $str; $this->byteLen = mb_strlen($str, '8bit'); if (!mb_check_encoding($str, 'UTF-8')) { - $this->charLen = 0; + $this->charLen = -1; } else { $this->charLen = mb_strlen($str, 'UTF-8'); }
} else {
$this->charLen = mb_strlen($str, 'UTF-8');
}
Expand All @@ -202,7 +92,7 @@
*/
public function offsetExists($offset): bool
{
return ($offset >= 0) && ($offset < $this->charLen);

Check warning on line 95 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "GreaterThanOrEqualTo": --- Original +++ New @@ @@ */ public function offsetExists($offset) : bool { - return $offset >= 0 && $offset < $this->charLen; + return $offset > 0 && $offset < $this->charLen; } /** * Gets the character at given offset.
}

/**
Expand All @@ -212,37 +102,37 @@
*/
public function offsetGet($offset): string|null
{
// This function moves the internal byte and character pointer to the requested offset.
// This function is part of hot code so the aim is to do the following
// operations as efficiently as possible.
// UTF-8 character encoding is a variable length encoding that encodes Unicode
// characters in 1-4 bytes. Thus we fetch 4 bytes from the current offset and then use mb_substr
// to get the first UTF-8 character in it. We then use strlen to get the character's size in bytes.
if (($offset < 0) || ($offset >= $this->charLen)) {
return null;
}

$delta = $offset - $this->charIdx;

if ($delta > 0) {

Check warning on line 117 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "GreaterThan": --- Original +++ New @@ @@ return null; } $delta = $offset - $this->charIdx; - if ($delta > 0) { + if ($delta >= 0) { // Fast forwarding. $this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta)); $this->charIdx += $delta;
// Fast forwarding.
while ($delta-- > 0) {
$this->byteIdx += static::getCharLength($this->str[$this->byteIdx]);
++$this->charIdx;
}
$this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta));

Check warning on line 119 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "IncrementInteger": --- Original +++ New @@ @@ $delta = $offset - $this->charIdx; if ($delta > 0) { // Fast forwarding. - $this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta)); + $this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 5 * $delta), 0, $delta)); $this->charIdx += $delta; } elseif ($delta < 0) { // Rewinding.
$this->charIdx += $delta;
} elseif ($delta < 0) {

Check warning on line 121 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "LessThan": --- Original +++ New @@ @@ // Fast forwarding. $this->byteIdx += strlen(mb_substr(substr($this->str, $this->byteIdx, 4 * $delta), 0, $delta)); $this->charIdx += $delta; - } elseif ($delta < 0) { + } elseif ($delta <= 0) { // Rewinding. while ($delta++ < 0) { // We rewind byte by byte and only count characters that are not continuation bytes,
// Rewinding.
while ($delta++ < 0) {
// We rewind byte by byte and only count characters that are not continuation bytes,
// i.e. ASCII characters and first octets of multibyte characters
do {
$byte = ord($this->str[--$this->byteIdx]);
williamdes marked this conversation as resolved.
Show resolved Hide resolved
} while (($byte >= 128) && ($byte < 192));

Check warning on line 128 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "GreaterThanOrEqualTo": --- Original +++ New @@ @@ // i.e. ASCII characters and first octets of multibyte characters do { $byte = ord($this->str[--$this->byteIdx]); - } while ($byte >= 128 && $byte < 192); + } while ($byte > 128 && $byte < 192); --$this->charIdx; } }

Check warning on line 128 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "LessThan": --- Original +++ New @@ @@ // i.e. ASCII characters and first octets of multibyte characters do { $byte = ord($this->str[--$this->byteIdx]); - } while ($byte >= 128 && $byte < 192); + } while ($byte >= 128 && $byte <= 192); --$this->charIdx; } }

--$this->charIdx;
}
}

$bytesCount = static::getCharLength($this->str[$this->byteIdx]);

$ret = '';
for ($i = 0; $bytesCount-- > 0; ++$i) {
$ret .= $this->str[$this->byteIdx + $i];
}

return $ret;
// Fetch the first Unicode character within the next 4 bytes in the string.
return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1);

Check warning on line 135 in src/UtfString.php

View workflow job for this annotation

GitHub Actions / Mutation tests with PHP 8.1

Escaped Mutant for Mutator "IncrementInteger": --- Original +++ New @@ @@ } } // Fetch the first Unicode character within the next 4 bytes in the string. - return mb_substr(substr($this->str, $this->byteIdx, 4), 0, 1); + return mb_substr(substr($this->str, $this->byteIdx, 5), 0, 1); } /** * Sets the value of a character.
}

/**
Expand Down Expand Up @@ -270,52 +160,6 @@
throw new Exception('Not implemented.');
}

/**
 * Determines how many bytes a UTF-8 character occupies, judging by its first byte.
 *
 * RFC 3629 limits UTF-8 characters to 4 bytes; this implementation nevertheless
 * also reports the unofficial 5- and 6-byte forms.
 *
 * @see https://tools.ietf.org/html/rfc3629
 *
 * @param string $byte the byte to be analyzed
 *
 * @return int the length in bytes (1 to 6) implied by the given byte
 */
public static function getCharLength($byte): int
{
    // Most queries consist of ASCII characters, so the code is looked up in the
    // ASCII map first; ord($byte) is only called for bytes not cached yet, and
    // its result is stored so the next lookup for the same byte is a cache hit.
    $code = static::$asciiMap[$byte] ??= ord($byte);

    // The first matching upper bound decides the length.
    return match (true) {
        $code < 128 => 1,
        $code < 224 => 2,
        $code < 240 => 3,
        $code < 248 => 4,
        $code < 252 => 5, // unofficial
        default => 6, // unofficial
    };
}

/**
* Returns the length in characters of the string.
*/
Expand Down
30 changes: 6 additions & 24 deletions tests/Misc/UtfStringTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
use PHPUnit\Framework\Attributes\DataProvider;
use Throwable;

use function chr;

class UtfStringTest extends TestCase
{
/**
Expand Down Expand Up @@ -55,27 +53,6 @@ public function testUnset(): void
unset($str[0]);
}

/**
 * Verifies the byte length reported by UtfString::getCharLength() for two
 * sample first bytes out of each length class (1 to 6 bytes).
 */
public function testGetCharLength(): void
{
    // First byte => expected character length in bytes.
    $expectedLengths = [
        0x00 => 1, // 00000000
        0x7F => 1, // 01111111

        0xC0 => 2, // 11000000
        0xDF => 2, // 11011111

        0xE0 => 3, // 11100000
        0xEF => 3, // 11101111

        0xF0 => 4, // 11110000
        0xF7 => 4, // 11110111

        0xF8 => 5, // 11111000
        0xFB => 5, // 11111011

        0xFC => 6, // 11111100
        0xFD => 6, // 11111101
    ];

    foreach ($expectedLengths as $firstByte => $expectedLength) {
        $this->assertEquals($expectedLength, UtfString::getCharLength(chr($firstByte)));
    }
}

public function testToString(): void
{
$str = new UtfString(self::TEST_PHRASE);
Expand Down Expand Up @@ -112,7 +89,7 @@ public static function utf8StringsProvider(): array
'č',
],
'emoji' => [
'😂😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
'🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯',
'😂',
'😋',
],
Expand All @@ -121,6 +98,11 @@ public static function utf8StringsProvider(): array
null,
null,
],
'random' => [
'xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞',
'ℾ',
'⅞',
],
];
}
}
54 changes: 22 additions & 32 deletions tests/benchmarks/UtfStringBench.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

use PhpMyAdmin\SqlParser\UtfString;

use function chr;
use function file_get_contents;

class UtfStringBench
Expand All @@ -19,8 +18,7 @@ class UtfStringBench
* @Iterations(20)
* @Revs(4)
* @OutputTimeUnit("milliseconds")
* @Assert("mode(variant.time.avg) < 100 milliseconds +/- 10%")
* @Assert("mode(variant.time.avg) > 30 milliseconds +/- 10%")
* @Assert("mode(variant.time.avg) < 40 milliseconds +/- 10%")
*/
public function benchBuildUtfString(): void
{
Expand All @@ -30,38 +28,30 @@ public function benchBuildUtfString(): void
}
}

/**
* @BeforeMethods("setUp")
* @Iterations(2)
* @Revs(2)
* @OutputTimeUnit("microseconds")
* @Assert("mode(variant.time.avg) < 800 microseconds +/- 20%")
* @Assert("mode(variant.time.avg) > 100 microseconds +/- 10%")
*/
public function benchGetCharLength(): void
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you benchmark the function and set a baseline, please?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should it not remain the same? How do I run this benchmark?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, since it's a more complete function and the old function was removed, maybe everything should be updated then.
But having benchmark data could help.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a new commit. Is this what you had in mind? These are the results I get:

    benchBuildUtfString.....................I19 ✔ Mo6.052ms (±3.59%)
    benchUtfStringRandomAccessWithUnicode...I19 ✔ Mo63.459μs (±10.50%)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's good to have checked at all times.
As for the results, I do not know; if the benchmark had been added earlier we could compare, but that's totally okay, never mind :)

{
UtfString::getCharLength(chr(0x00)); // 00000000
UtfString::getCharLength(chr(0x7F)); // 01111111

UtfString::getCharLength(chr(0xC0)); // 11000000
UtfString::getCharLength(chr(0xDF)); // 11011111

UtfString::getCharLength(chr(0xE0)); // 11100000
UtfString::getCharLength(chr(0xEF)); // 11101111

UtfString::getCharLength(chr(0xF0)); // 11110000
UtfString::getCharLength(chr(0xF7)); // 11110111

UtfString::getCharLength(chr(0xF8)); // 11111000
UtfString::getCharLength(chr(0xFB)); // 11111011

UtfString::getCharLength(chr(0xFC)); // 11111100
UtfString::getCharLength(chr(0xFD)); // 11111101
}

/**
 * Reads the LICENSE.txt file located two directories above this file and
 * keeps its contents in $this->testContents.
 */
public function setUp(): void
{
    // The (string) cast turns a failed read (false) into an empty string.
    $this->testContents = (string) file_get_contents(__DIR__ . '/../../LICENSE.txt');
}

/**
 * @Iterations(20)
 * @Revs(4)
 * @OutputTimeUnit("microseconds")
 * @Assert("mode(variant.time.avg) < 120 microseconds +/- 10%")
 */
public function benchUtfStringRandomAccessWithUnicode(): void
{
    // Sample text mixing ASCII, accented letters, emoji and other symbols.
    // The literal is single-quoted, so the \xNN sequences on the fourth line
    // remain literal backslash text rather than raw bytes.
    // NOTE(review): confirm whether raw (non UTF-8) bytes were intended there.
    $utfString = new UtfString('abcdefghijklmnopqrstuvwxyz
áéíóúýěřťǔǐǒǎšďȟǰǩľžčǚň
🦋😄😃😀😊😉😍😘😚😗😂👿😮😨😱😠😡😤😖😆😋👯
P\xf8\xed\xb9ern\xec \xbelu\xbbou\xe8k\xfd k\xf3d \xfap\xecl \xef\xe1belsk\xe9 k\xf3dy
xℤⅿↈⅬ⅀ↆℜℝ⅗ℾ℧ⅰℓⅯⅵⅣ⅒21⅞');

    // Two forward jumps followed by two backward jumps, ending at the start.
    foreach ([10, 100, 20, 0] as $offset) {
        $utfString->offsetGet($offset);
    }
}
}
Loading