Skip to content

Commit

Permalink
Add StringUtil::toUTF8()
Browse files Browse the repository at this point in the history
  • Loading branch information
spawnia committed Feb 20, 2024
1 parent f8fb22c commit 444753a
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ See [GitHub releases](https://github.com/mll-lab/php-utils/releases).

## Unreleased

### Added

- Add `StringUtil::toUTF8()`

## v1.11.0

### Added
Expand Down
6 changes: 3 additions & 3 deletions src/QxManager/FilledRow.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,9 @@ public function __construct(
string $targetName,
string $signalCh1,
string $signalCh2,
int $referenceCopies = null,
string $wellNotes = null,
string $rdqConversionFactor = null
?int $referenceCopies = null,
?string $wellNotes = null,
?string $rdqConversionFactor = null
) {
$this->targetName = $targetName;
$this->signalCh1 = $signalCh1;
Expand Down
71 changes: 71 additions & 0 deletions src/StringUtil.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@

final class StringUtil
{
/** Byte order mark (BOM) that may prefix UTF-8 encoded text, see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8 */
public const UTF_8_BOM = "\xEF\xBB\xBF";

/** Byte order mark (BOM) of big-endian UTF-16, see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16 */
public const UTF_16_BIG_ENDIAN_BOM = "\xFE\xFF";

/** Byte order mark (BOM) of little-endian UTF-16, see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16 */
public const UTF_16_LITTLE_ENDIAN_BOM = "\xFF\xFE";

/** Byte order mark (BOM) of big-endian UTF-32, see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32 */
public const UTF_32_BIG_ENDIAN_BOM = "\x00\x00\xFE\xFF";

/**
 * Byte order mark (BOM) of little-endian UTF-32, see https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32
 *
 * Note that its first two bytes equal the little-endian UTF-16 BOM.
 */
public const UTF_32_LITTLE_ENDIAN_BOM = "\xFF\xFE\x00\x00";

/** @param iterable<string|null> $parts */
public static function joinNonEmpty(string $glue, iterable $parts): string
{
Expand Down Expand Up @@ -82,6 +97,62 @@ public static function normalizeLineEndings(string $input, string $to = "\r\n"):
return \Safe\preg_replace("/\r\n|\r|\n/", $to, $input);
}

/**
 * Convert a string of unknown encoding (UTF-8, UTF-16LE, Windows-1252, ...) to UTF-8.
 *
 * The encoding is detected strictly via mb_detect_encoding() first;
 * when that fails, it is guessed from a byte order mark or a byte-range heuristic.
 */
public static function toUTF8(string $string): string
{
    $detected = mb_detect_encoding($string, null, true);
    $sourceEncoding = $detected === false
        ? self::guessEncoding($string)
        : $detected;

    $utf8 = \Safe\mb_convert_encoding($string, 'UTF-8', $sourceEncoding);
    assert(is_string($utf8), 'because a string was passed to mb_convert_encoding');

    return $utf8;
}

/**
 * Guess the encoding of a text from its byte order mark (BOM) or, failing that, from its byte ranges.
 *
 * Without a BOM, the only encodings this can return are Windows-1252 and ISO-8859-1.
 *
 * @throws \Exception when the text has no known BOM and contains bytes that prevent a reliable guess
 */
private static function guessEncoding(string $text): string
{
    // @see https://www.php.net/manual/en/function.mb-detect-encoding.php#91051
    $first3 = substr($text, 0, 3);
    if ($first3 === self::UTF_8_BOM) {
        return 'UTF-8';
    }

    // The UTF-32 BOMs are 4 bytes long, so 4 bytes must be compared.
    // UTF-32 has to be checked before UTF-16, because the UTF-32LE BOM starts with the UTF-16LE BOM.
    $first4 = substr($text, 0, 4);
    if ($first4 === self::UTF_32_BIG_ENDIAN_BOM) {
        return 'UTF-32BE';
    }
    if ($first4 === self::UTF_32_LITTLE_ENDIAN_BOM) {
        return 'UTF-32LE';
    }

    $first2 = substr($text, 0, 2);
    if ($first2 === self::UTF_16_BIG_ENDIAN_BOM) {
        return 'UTF-16BE';
    }
    if ($first2 === self::UTF_16_LITTLE_ENDIAN_BOM) {
        return 'UTF-16LE';
    }

    // https://kence.org/2019/11/27/detecting-windows-1252-encoding
    // If the string contains characters in ranges that are either control characters
    // or invalid for ISO-8859-1 or CP-1252, we are unable to reliably guess.
    if (\Safe\preg_match('/[\x00-\x08\x0E-\x1F\x81\x8D\x8F\x90\x9D]/', $text) !== 0) {
        throw new \Exception("Can not determine UTF encoding of text: {$text}");
    }

    // If we get here, we're going to assume it's either Windows-1252 or ISO-8859-1.
    // If the string contains characters in the ISO-8859-1 reserved range, that's probably Windows-1252.
    if (\Safe\preg_match('/[\x80-\x9F]/', $text) !== 0) {
        return 'Windows-1252';
    }

    // Give up and return ISO-8859-1.
    return 'ISO-8859-1';
}

/**
* Pad a number with leading zeros.
*
Expand Down
41 changes: 41 additions & 0 deletions tests/StringUtilTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,47 @@ public function testNormalizeLineEndings(): void
);
}

/** Text that already is UTF-8 must pass through toUTF8() unchanged. */
public function testUTF8(): void
{
    $fileContents = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-8.csv');

    // The fixture itself already holds the expected value.
    self::assertSame('test', $fileContents);
    self::assertSame('test', StringUtil::toUTF8($fileContents));
}

/** UTF-16LE text with a byte order mark must be converted to UTF-8. */
public function testUTF16LE(): void
{
    // The zero width no-break space (ZWNBSP) is a deprecated use of the Unicode character at code point U+FEFF.
    // Character U+FEFF is intended for use as a Byte Order Mark (BOM) at the start of a file
    // -> https://unicode-explorer.com/c/FEFF
    // The fixture presumably starts with the UTF-16LE BOM (it is binary, verify with a hex dump).
    // Converting from an explicit UTF-16LE encoding keeps U+FEFF in the result,
    // so the expectation spells it out through the constant instead of an invisible character in the literal.
    $expectedUTF8 = StringUtil::UTF_8_BOM . 'test';

    $string = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-16LE.csv');
    self::assertNotSame($expectedUTF8, $string);
    self::assertSame($expectedUTF8, StringUtil::toUTF8($string));
}

/** Windows-1252 encoded text containing a non-ASCII character must be converted to UTF-8. */
public function testWindows1252(): void
{
// The header row contains the non-ASCII character "µ", which is what makes the fixture differ from its UTF-8 form.
$expectedUTF8 = <<<CSV
FileName,WellId,Sample Description,From [bp],To [bp],Average Size [bp],Conc. [ng/µl],Region Molarity [nmol/l],% of Total,Region Comment
2023-05-16 - 13.01.27.D1000,A12,RNA_191_23-049780_A1,170,550,312,23.7,121,95.50,IDT
2023-05-16 - 13.01.27.D1000,B12,RNA_191_23-049782_B1,170,550,308,16.1,82.5,92.27,IDT
2023-05-16 - 13.01.27.D1000,C12,RNA_191_23-049776_C1,170,550,310,16.7,85.3,93.76,IDT
2023-05-16 - 13.01.27.D1000,D12,RNA_191_23-049778_D1,170,550,307,11.4,58.6,91.65,IDT
2023-05-16 - 13.01.27.D1000,E12,RNA_191_NTC_E1,170,550,304,9.63,50.0,90.88,IDT
CSV;

$string = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/windows-1252.csv');
// Guards against a fixture that was accidentally saved as UTF-8.
self::assertNotSame($expectedUTF8, $string);

$utf8String = StringUtil::toUTF8($string);
// Line endings are normalized on both sides, so only the character encoding is compared.
self::assertSame(StringUtil::normalizeLineEndings($expectedUTF8), StringUtil::normalizeLineEndings($utf8String));
}

public function testLeftPadNumber(): void
{
self::assertSame(
Expand Down
Binary file added tests/StringUtilTestData/UTF-16LE.csv
Binary file not shown.
1 change: 1 addition & 0 deletions tests/StringUtilTestData/UTF-8.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
test
6 changes: 6 additions & 0 deletions tests/StringUtilTestData/windows-1252.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
FileName,WellId,Sample Description,From [bp],To [bp],Average Size [bp],Conc. [ng/�l],Region Molarity [nmol/l],% of Total,Region Comment
2023-05-16 - 13.01.27.D1000,A12,RNA_191_23-049780_A1,170,550,312,23.7,121,95.50,IDT
2023-05-16 - 13.01.27.D1000,B12,RNA_191_23-049782_B1,170,550,308,16.1,82.5,92.27,IDT
2023-05-16 - 13.01.27.D1000,C12,RNA_191_23-049776_C1,170,550,310,16.7,85.3,93.76,IDT
2023-05-16 - 13.01.27.D1000,D12,RNA_191_23-049778_D1,170,550,307,11.4,58.6,91.65,IDT
2023-05-16 - 13.01.27.D1000,E12,RNA_191_NTC_E1,170,550,304,9.63,50.0,90.88,IDT

0 comments on commit 444753a

Please sign in to comment.