diff --git a/CHANGELOG.md b/CHANGELOG.md index dbbc30c..c714b35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ See [GitHub releases](https://github.com/mll-lab/php-utils/releases). ## Unreleased +### Added + +- Add `StringUtil::toUTF8()` + ## v1.11.0 ### Added diff --git a/src/QxManager/FilledRow.php b/src/QxManager/FilledRow.php index 49dadf0..5ba9b60 100644 --- a/src/QxManager/FilledRow.php +++ b/src/QxManager/FilledRow.php @@ -50,9 +50,9 @@ public function __construct( string $targetName, string $signalCh1, string $signalCh2, - int $referenceCopies = null, - string $wellNotes = null, - string $rdqConversionFactor = null + ?int $referenceCopies = null, + ?string $wellNotes = null, + ?string $rdqConversionFactor = null ) { $this->targetName = $targetName; $this->signalCh1 = $signalCh1; diff --git a/src/StringUtil.php b/src/StringUtil.php index c2abd75..324eb1d 100644 --- a/src/StringUtil.php +++ b/src/StringUtil.php @@ -6,6 +6,21 @@ final class StringUtil { + /** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-8 */ + public const UTF_8_BOM = "\xEF\xBB\xBF"; + + /** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16 */ + public const UTF_16_BIG_ENDIAN_BOM = "\xFE\xFF"; + + /** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16 */ + public const UTF_16_LITTLE_ENDIAN_BOM = "\xFF\xFE"; + + /** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32 */ + public const UTF_32_BIG_ENDIAN_BOM = "\x00\x00\xFE\xFF"; + + /** https://en.wikipedia.org/wiki/Byte_order_mark#UTF-32 */ + public const UTF_32_LITTLE_ENDIAN_BOM = "\xFF\xFE\x00\x00"; + /** @param iterable $parts */ public static function joinNonEmpty(string $glue, iterable $parts): string { @@ -82,6 +97,62 @@ public static function normalizeLineEndings(string $input, string $to = "\r\n"): return \Safe\preg_replace("/\r\n|\r|\n/", $to, $input); } + /** Convert string that could be in different UTF encodings (UTF-8, UTF-16BE, ...) to UTF-8. 
*/
    public static function toUTF8(string $string): string
    {
        // Strict detection first; mb_detect_encoding() yields false when no candidate encoding matches.
        $encoding = mb_detect_encoding($string, null, true);

        if ($encoding === false) {
            // Fall back to BOM sniffing and single-byte heuristics.
            $encoding = self::guessEncoding($string);
        }

        $converted = \Safe\mb_convert_encoding($string, 'UTF-8', $encoding);
        assert(is_string($converted), 'because a string was passed to mb_convert_encoding');

        return $converted;
    }

    /**
     * Guess the encoding of a text that mb_detect_encoding() could not identify.
     *
     * A leading byte order mark is checked first (UTF-8, UTF-32, UTF-16).
     * Without one, the text is assumed to be Windows-1252 or ISO-8859-1,
     * unless it contains bytes that are implausible for both.
     *
     * @throws \Exception when no byte order mark is present and the text contains bytes that rule out a reliable guess
     */
    private static function guessEncoding(string $text): string
    {
        // @see https://www.php.net/manual/en/function.mb-detect-encoding.php#91051
        $first3 = substr($text, 0, 3);
        if ($first3 === self::UTF_8_BOM) {
            return 'UTF-8';
        }

        // The UTF-32 byte order marks are four bytes long, so four bytes must be read to match them.
        // This check has to precede the UTF-16 check: the UTF-32LE mark starts with the UTF-16LE mark.
        $first4 = substr($text, 0, 4);
        if ($first4 === self::UTF_32_BIG_ENDIAN_BOM) {
            return 'UTF-32BE';
        }
        if ($first4 === self::UTF_32_LITTLE_ENDIAN_BOM) {
            return 'UTF-32LE';
        }

        $first2 = substr($text, 0, 2);
        if ($first2 === self::UTF_16_BIG_ENDIAN_BOM) {
            return 'UTF-16BE';
        }
        if ($first2 === self::UTF_16_LITTLE_ENDIAN_BOM) {
            return 'UTF-16LE';
        }

        // https://kence.org/2019/11/27/detecting-windows-1252-encoding
        // If the string contains characters in ranges that are either control characters
        // or invalid for ISO-8859-1 or CP-1252, we are unable to reliably guess.
        if (\Safe\preg_match('/[\x00-\x08\x0E-\x1F\x81\x8D\x8F\x90\x9D]/', $text) !== 0) {
            throw new \Exception("Can not determine UTF encoding of text: {$text}");
        }

        // If we get here, we're going to assume it's either Windows-1252 or ISO-8859-1.
        // If the string contains characters in the ISO-8859-1 reserved range, that's probably Windows-1252.
        if (\Safe\preg_match('/[\x80-\x9F]/', $text) !== 0) {
            return 'Windows-1252';
        }

        // Give up and return ISO-8859-1.
        return 'ISO-8859-1';
    }

    /**
     * Pad a number with leading zero's.
* diff --git a/tests/StringUtilTest.php b/tests/StringUtilTest.php index 7d56b98..fa88e78 100644 --- a/tests/StringUtilTest.php +++ b/tests/StringUtilTest.php @@ -85,6 +85,47 @@ public function testNormalizeLineEndings(): void ); } + public function testUTF8(): void + { + $expectedUTF8 = 'test'; + + $string = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-8.csv'); + + self::assertSame($expectedUTF8, $string); + self::assertSame($expectedUTF8, StringUtil::toUTF8($string)); + } + + public function testUTF16LE(): void + { + // The zero width no-break space (ZWNBSP) is a deprecated use of the Unicode character at code point U+FEFF. + // Character U+FEFF is intended for use as a Byte Order Mark (BOM) at the start of a file + // -> https://unicode-explorer.com/c/FEFF + $expectedUTF8 = 'test'; + + $string = \Safe\file_get_contents(__DIR__ . '/StringUtilTestData/UTF-16LE.csv'); + self::assertNotSame($expectedUTF8, $string); + self::assertSame($expectedUTF8, StringUtil::toUTF8($string)); + } + + public function testWindows1252(): void + { + $expectedUTF8 = <<