From 535418d9efb3e9aebd05daeba106873066183119 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Thu, 1 Feb 2024 19:02:54 -0600 Subject: [PATCH 1/5] =?UTF-8?q?feat(developer):=20hint/warn=20on=20range?= =?UTF-8?q?=20check=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - just the test files #10316 --- .../fixtures/sections/tran/tran-hint-range.xml | 14 ++++++++++++++ .../fixtures/sections/tran/tran-hint-range2.xml | 15 +++++++++++++++ .../fixtures/sections/tran/tran-warn-range.xml | 14 ++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml create mode 100644 developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml create mode 100644 developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml new file mode 100644 index 00000000000..a0c7b42a14b --- /dev/null +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml new file mode 100644 index 00000000000..9c9e3ae0554 --- /dev/null +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml new file mode 100644 index 00000000000..5341ab7ab0f --- /dev/null +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + From db1c4b8636685fc490a038b26261bde40efd3206 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 2 Feb 2024 00:05:10 -0600 Subject: [PATCH 2/5] =?UTF-8?q?feat(developer):=20hint/warn=20on=20range?= =?UTF-8?q?=20check=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - failing test #10316 --- .../src/kmc-ldml/src/compiler/messages.ts | 9 +++++++++ developer/src/kmc-ldml/test/test-tran.ts | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/developer/src/kmc-ldml/src/compiler/messages.ts b/developer/src/kmc-ldml/src/compiler/messages.ts index 750cc38c0f2..082ed88d011 100644 --- a/developer/src/kmc-ldml/src/compiler/messages.ts +++ b/developer/src/kmc-ldml/src/compiler/messages.ts @@ -157,6 +157,15 @@ export class CompilerMessages { static Error_IllegalCharacters = (o: { count: number, lowestCh: number }) => m(this.ERROR_IllegalCharacters, `File contains ${o.count} illegal character(s), including ${util.describeCodepoint(o.lowestCh)}`); static ERROR_IllegalCharacters = SevError | 0x0025; + + static Hint_CharClassImplicitDenorm = (o: { lowestCh: number }) => + m(this.HINT_CharClassImplicitDenorm, `File has character classes which span non-NFD character(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`); + static HINT_CharClassImplicitDenorm = SevHint | 0x0023; + + static Warn_CharClassExplicitDenorm = (o: { lowestCh: number }) => + m(this.WARN_CharClassExplicitDenorm, `File has character classes which include non-NFD characters(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`); + static WARN_CharClassExplicitDenorm = SevWarn | 0x0024; + } diff --git a/developer/src/kmc-ldml/test/test-tran.ts b/developer/src/kmc-ldml/test/test-tran.ts index b91f554486b..00c86dd7848 100644 --- a/developer/src/kmc-ldml/test/test-tran.ts +++ b/developer/src/kmc-ldml/test/test-tran.ts @@ -245,6 +245,24 @@ describe('tran', function () { assert.strictEqual(transforms[0].to.value, "x"); } }, + { + subpath: 'sections/tran/tran-warn-range.xml', + warnings: [ + CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0}), + ], + }, + { + subpath: 'sections/tran/tran-hint-range.xml', + warnings: [ + CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0}), + ], + }, + { + subpath: 'sections/tran/tran-hint-range2.xml', + warnings: [ + CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0}), + ], + }, ], tranDependencies); }); From 107e8e98f94d955984059440c27e1597ae241001 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 2 Feb 2024 12:09:23 -0600 Subject: [PATCH 3/5] =?UTF-8?q?feat(developer):=20hint/warn=20on=20range?= =?UTF-8?q?=20check=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - analyzer #10316 --- common/web/types/src/util/util.ts | 62 ++++++++++++---- common/web/types/test/util/test-unescape.ts | 80 +++++++++++++++------ 2 files changed, 108 insertions(+), 34 deletions(-) diff --git a/common/web/types/src/util/util.ts b/common/web/types/src/util/util.ts index ac84eb7e83d..1e088c588a2 100644 --- a/common/web/types/src/util/util.ts +++ b/common/web/types/src/util/util.ts @@ -197,7 +197,7 @@ toOneChar(value: string) : number { export function describeCodepoint(ch : number) : string { let s; - const p = getProblem(ch); + const p = BadStringAnalyzer.getProblem(ch); if (p != null) { // for example: 'PUA (U+E010)' s = p; @@ -208,10 +208,12 @@ export function describeCodepoint(ch : number) : string { return `${s} (U+${Number(ch).toString(16).toUpperCase()})`; } + export enum BadStringType { pua = 'PUA', unassigned = 'Unassigned', illegal = 'Illegal', + denormalized = "Denormalized" }; // Following from kmx_xstring.h / .cpp @@ -314,34 +316,35 @@ class BadStringMap extends Map> { } } -function getProblem(ch : number) : BadStringType { - if (!isValidUnicode(ch)) { - return BadStringType.illegal; - } else if(isPUA(ch)) { - return BadStringType.pua; - } else { // TODO-LDML: unassigned - return null; - } -} -export class BadStringAnalyzer { +/** abstract class for analyzing and categorizing strings */ +export abstract class StringAnalyzer { /** add a string for analysis */ public add(s : string) { - for (const c of s) { + for (const c of [...s]) { const ch = c.codePointAt(0); - const problem = getProblem(ch); + const problem = this.analyzeCodePoint(c, ch); if (problem) { this.addProblem(ch, problem); } } } - private addProblem(ch : number, type : BadStringType) { + /** + * subclass interface + * @param c single codepoint to analyze (string) + * @param ch single codepoint to analyze (scalar) + */ + protected abstract analyzeCodePoint(c: string, ch: number) : BadStringType; + + /** internal interface for the result of an analysis */ + protected addProblem(ch : number, type : BadStringType) { if (!this.m.has(type)) { this.m.set(type, new Set()); } this.m.get(type).add(ch); } + /** get the results of the analysis */ public analyze() : BadStringMap { if (this.m.size == 0) { return null; @@ -350,5 +353,36 @@ export class BadStringAnalyzer { } } + /** internal map */ private m = new BadStringMap(); } + +/** analyze a string looking for bad unicode */ +export class BadStringAnalyzer extends StringAnalyzer { + /** analyze one codepoint */ + protected analyzeCodePoint(c: string, ch: number): BadStringType { + return BadStringAnalyzer.getProblem(ch); + } + /** export analyzer function */ + public static getProblem(ch: number) { + if (!isValidUnicode(ch)) { + return BadStringType.illegal; + } else if(isPUA(ch)) { + return BadStringType.pua; + } else { // TODO-LDML: unassigned + return null; + } + } +} + +/** Analyzer that checks if something isn't NFD */ +export class NFDAnalyzer extends StringAnalyzer { + protected analyzeCodePoint(c: string, ch: number): BadStringType { + const nfd = c.normalize("NFD"); + if (c !== nfd) { + return BadStringType.denormalized; + } else { + return null; + } + } +} diff --git a/common/web/types/test/util/test-unescape.ts b/common/web/types/test/util/test-unescape.ts index 989951ae5cf..c0cd5069bbb 100644 --- a/common/web/types/test/util/test-unescape.ts +++ b/common/web/types/test/util/test-unescape.ts @@ -1,6 +1,6 @@ import 'mocha'; import {assert} from 'chai'; -import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString} from '../../src/util/util.js'; +import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer} from '../../src/util/util.js'; describe('test UTF32 functions()', function() { it('should properly categorize strings', () => { @@ -207,9 +207,8 @@ describe('test BadStringAnalyzer', () => { }); } }); - describe('should return nothing for all valid strings', () => { - it('should handle a case with some odd strs in it', () => { - const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." + + it('should handle a case with some odd strs in it', () => { + const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." + ([ 0xF800, 0x05FFFF, @@ -217,21 +216,62 @@ describe('test BadStringAnalyzer', () => { 0x04FFFE, ].map(ch => String.fromCodePoint(ch)).join('')); - const bsa = new BadStringAnalyzer(); - for (const s of strs) { - bsa.add(s); - } - const m = bsa.analyze(); - assert.isNotNull(m); - assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]); - assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [ - 0xE010,0xF800, 0x102222, - ], `pua analysis`); - assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [ - 0xFDD0,0xD800,0xFFFE, - 0x05FFFF, - 0x04FFFE, - ], `illegal analysis`); - }); + const bsa = new BadStringAnalyzer(); + for (const s of strs) { + bsa.add(s); + } + const m = bsa.analyze(); + assert.isNotNull(m); + assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]); + assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [ + 0xE010, 0xF800, 0x102222, + ], `pua analysis`); + assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [ + 0xFDD0, 0xD800, 0xFFFE, + 0x05FFFF, + 0x04FFFE, + ], `illegal analysis`); + }); +}); + +describe('test NFDAnalyzer', () => { + describe('should return nothing for NFD strings', () => { + const cases = [ + [], + ['a',], + ['a', 'b',] + ]; + for (const strs of cases) { + const title = titleize(strs); + it(`should analyze ${title}`, () => { + const bsa = new NFDAnalyzer(); + for (const s of strs) { + bsa.add(s); + } + const m = bsa.analyze(); + assert.isNull(m, `${title}`); + }); + } + }); + it('should handle a case with some odd strs in it', () => { + const strs = "This text is in NFD, but not all of it is." + + ([ + 0x00E8, + 0x0344, + 0x2FA1D, + ].map(ch => String.fromCodePoint(ch)).join('')); + + const bsa = new NFDAnalyzer(); + for (const s of strs) { + bsa.add(s); + } + const m = bsa.analyze(); + assert.isNotNull(m); + assert.containsAllKeys(m, [BadStringType.denormalized]); + assert.sameDeepMembers(Array.from(m.get(BadStringType.denormalized).values()), [ + 0x00E8, + 0x0344, + 0x1FA1D, + ], `denorm analysis`); }); }); From 10940be7e06b23b647e703f0298305943f3e7d41 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 2 Feb 2024 15:18:08 -0600 Subject: [PATCH 4/5] =?UTF-8?q?feat(developer):=20hint/warn=20on=20range?= =?UTF-8?q?=20check=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - analyzer #10316 --- common/web/types/test/util/test-unescape.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/web/types/test/util/test-unescape.ts b/common/web/types/test/util/test-unescape.ts index c0cd5069bbb..8c75b491824 100644 --- a/common/web/types/test/util/test-unescape.ts +++ b/common/web/types/test/util/test-unescape.ts @@ -271,7 +271,7 @@ describe('test NFDAnalyzer', () => { assert.sameDeepMembers(Array.from(m.get(BadStringType.denormalized).values()), [ 0x00E8, 0x0344, - 0x1FA1D, + 0x2FA1D, ], `denorm analysis`); }); }); From 80ad3a87fc7783787c37223cc039474049ae3a55 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 2 Feb 2024 16:59:58 -0600 Subject: [PATCH 5/5] =?UTF-8?q?feat(developer):=20hint/warn=20on=20range?= =?UTF-8?q?=20check=20=F0=9F=99=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add warning into tran.ts - will be one warning per element #10316 --- .../src/kmc-ldml/src/compiler/messages.ts | 4 +- developer/src/kmc-ldml/src/compiler/tran.ts | 60 +++++++++++++++++++ .../sections/tran/tran-hint-range.xml | 2 +- .../sections/tran/tran-warn-range.xml | 2 +- developer/src/kmc-ldml/test/test-tran.ts | 6 +- 5 files changed, 67 insertions(+), 7 deletions(-) diff --git a/developer/src/kmc-ldml/src/compiler/messages.ts b/developer/src/kmc-ldml/src/compiler/messages.ts index 082ed88d011..1a4692c3c87 100644 --- a/developer/src/kmc-ldml/src/compiler/messages.ts +++ b/developer/src/kmc-ldml/src/compiler/messages.ts @@ -160,11 +160,11 @@ export class CompilerMessages { static Hint_CharClassImplicitDenorm = (o: { lowestCh: number }) => m(this.HINT_CharClassImplicitDenorm, `File has character classes which span non-NFD character(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`); - static HINT_CharClassImplicitDenorm = SevHint | 0x0023; + static HINT_CharClassImplicitDenorm = SevHint | 0x0026; static Warn_CharClassExplicitDenorm = (o: { lowestCh: number }) => m(this.WARN_CharClassExplicitDenorm, `File has character classes which include non-NFD characters(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`); - static WARN_CharClassExplicitDenorm = SevWarn | 0x0024; + static WARN_CharClassExplicitDenorm = SevWarn | 0x0027; } diff --git a/developer/src/kmc-ldml/src/compiler/tran.ts b/developer/src/kmc-ldml/src/compiler/tran.ts index 38ee5e3f2c2..de42290ac1b 100644 --- a/developer/src/kmc-ldml/src/compiler/tran.ts +++ b/developer/src/kmc-ldml/src/compiler/tran.ts @@ -141,6 +141,8 @@ export abstract class TransformCompiler { diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml index a0c7b42a14b..4323a879376 100644 --- a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml @@ -8,7 +8,7 @@ - + diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml index 5341ab7ab0f..9ac22715818 100644 --- a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml @@ -7,7 +7,7 @@ - + diff --git a/developer/src/kmc-ldml/test/test-tran.ts b/developer/src/kmc-ldml/test/test-tran.ts index 00c86dd7848..bef11cd4356 100644 --- a/developer/src/kmc-ldml/test/test-tran.ts +++ b/developer/src/kmc-ldml/test/test-tran.ts @@ -248,19 +248,19 @@ describe('tran', function () { { subpath: 'sections/tran/tran-warn-range.xml', warnings: [ - CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0}), + CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0xE1}), ], }, { subpath: 'sections/tran/tran-hint-range.xml', warnings: [ - CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0}), + CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xc0}), ], }, { subpath: 'sections/tran/tran-hint-range2.xml', warnings: [ - CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0}), + CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xC0}), ], }, ], tranDependencies);