diff --git a/common/web/types/src/util/util.ts b/common/web/types/src/util/util.ts index ac84eb7e83d..1e088c588a2 100644 --- a/common/web/types/src/util/util.ts +++ b/common/web/types/src/util/util.ts @@ -197,7 +197,7 @@ toOneChar(value: string) : number { export function describeCodepoint(ch : number) : string { let s; - const p = getProblem(ch); + const p = BadStringAnalyzer.getProblem(ch); if (p != null) { // for example: 'PUA (U+E010)' s = p; @@ -208,10 +208,12 @@ export function describeCodepoint(ch : number) : string { return `${s} (U+${Number(ch).toString(16).toUpperCase()})`; } + export enum BadStringType { pua = 'PUA', unassigned = 'Unassigned', illegal = 'Illegal', + denormalized = "Denormalized" }; // Following from kmx_xstring.h / .cpp @@ -314,34 +316,35 @@ class BadStringMap extends Map> { } } -function getProblem(ch : number) : BadStringType { - if (!isValidUnicode(ch)) { - return BadStringType.illegal; - } else if(isPUA(ch)) { - return BadStringType.pua; - } else { // TODO-LDML: unassigned - return null; - } -} -export class BadStringAnalyzer { +/** abstract class for analyzing and categorizing strings */ +export abstract class StringAnalyzer { /** add a string for analysis */ public add(s : string) { - for (const c of s) { + for (const c of [...s]) { const ch = c.codePointAt(0); - const problem = getProblem(ch); + const problem = this.analyzeCodePoint(c, ch); if (problem) { this.addProblem(ch, problem); } } } - private addProblem(ch : number, type : BadStringType) { + /** + * subclass interface + * @param c single codepoint to analyze (string) + * @param ch single codepoint to analyze (scalar) + */ + protected abstract analyzeCodePoint(c: string, ch: number) : BadStringType; + + /** internal interface for the result of an analysis */ + protected addProblem(ch : number, type : BadStringType) { if (!this.m.has(type)) { this.m.set(type, new Set()); } this.m.get(type).add(ch); } + /** get the results of the analysis */ public analyze() : BadStringMap { if (this.m.size == 0) { return null; @@ -350,5 +353,36 @@ export class BadStringAnalyzer { } } + /** internal map */ private m = new BadStringMap(); } + +/** analyze a string looking for bad unicode */ +export class BadStringAnalyzer extends StringAnalyzer { + /** analyze one codepoint */ + protected analyzeCodePoint(c: string, ch: number): BadStringType { + return BadStringAnalyzer.getProblem(ch); + } + /** export analyzer function */ + public static getProblem(ch: number) { + if (!isValidUnicode(ch)) { + return BadStringType.illegal; + } else if(isPUA(ch)) { + return BadStringType.pua; + } else { // TODO-LDML: unassigned + return null; + } + } +} + +/** Analyzer that checks if something isn't NFD */ +export class NFDAnalyzer extends StringAnalyzer { + protected analyzeCodePoint(c: string, ch: number): BadStringType { + const nfd = c.normalize("NFD"); + if (c !== nfd) { + return BadStringType.denormalized; + } else { + return null; + } + } +} diff --git a/common/web/types/test/util/test-unescape.ts b/common/web/types/test/util/test-unescape.ts index 989951ae5cf..8c75b491824 100644 --- a/common/web/types/test/util/test-unescape.ts +++ b/common/web/types/test/util/test-unescape.ts @@ -1,6 +1,6 @@ import 'mocha'; import {assert} from 'chai'; -import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString} from '../../src/util/util.js'; +import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer} from '../../src/util/util.js'; describe('test UTF32 functions()', function() { it('should properly categorize strings', () => { @@ -207,9 +207,8 @@ describe('test BadStringAnalyzer', () => { }); } }); - describe('should return nothing for all valid strings', () => { - it('should handle a case with some odd strs in it', () => { - const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." + + it('should handle a case with some odd strs in it', () => { + const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." + ([ 0xF800, 0x05FFFF, @@ -217,21 +216,62 @@ describe('test BadStringAnalyzer', () => { 0x04FFFE, ].map(ch => String.fromCodePoint(ch)).join('')); - const bsa = new BadStringAnalyzer(); - for (const s of strs) { - bsa.add(s); - } - const m = bsa.analyze(); - assert.isNotNull(m); - assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]); - assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [ - 0xE010,0xF800, 0x102222, - ], `pua analysis`); - assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [ - 0xFDD0,0xD800,0xFFFE, - 0x05FFFF, - 0x04FFFE, - ], `illegal analysis`); - }); + const bsa = new BadStringAnalyzer(); + for (const s of strs) { + bsa.add(s); + } + const m = bsa.analyze(); + assert.isNotNull(m); + assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]); + assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [ + 0xE010, 0xF800, 0x102222, + ], `pua analysis`); + assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [ + 0xFDD0, 0xD800, 0xFFFE, + 0x05FFFF, + 0x04FFFE, + ], `illegal analysis`); + }); +}); + +describe('test NFDAnalyzer', () => { + describe('should return nothing for NFD strings', () => { + const cases = [ + [], + ['a',], + ['a', 'b',] + ]; + for (const strs of cases) { + const title = titleize(strs); + it(`should analyze ${title}`, () => { + const bsa = new NFDAnalyzer(); + for (const s of strs) { + bsa.add(s); + } + const m = bsa.analyze(); + assert.isNull(m, `${title}`); + }); + } + }); + it('should handle a case with some odd strs in it', () => { + const strs = "This text is in NFD, but not all of it is." + + ([ + 0x00E8, + 0x0344, + 0x2FA1D, + ].map(ch => String.fromCodePoint(ch)).join('')); + + const bsa = new NFDAnalyzer(); + for (const s of strs) { + bsa.add(s); + } + const m = bsa.analyze(); + assert.isNotNull(m); + assert.containsAllKeys(m, [BadStringType.denormalized]); + assert.sameDeepMembers(Array.from(m.get(BadStringType.denormalized).values()), [ + 0x00E8, + 0x0344, + 0x2FA1D, + ], `denorm analysis`); }); }); diff --git a/developer/src/kmc-ldml/src/compiler/messages.ts b/developer/src/kmc-ldml/src/compiler/messages.ts index 750cc38c0f2..1a4692c3c87 100644 --- a/developer/src/kmc-ldml/src/compiler/messages.ts +++ b/developer/src/kmc-ldml/src/compiler/messages.ts @@ -157,6 +157,15 @@ export class CompilerMessages { static Error_IllegalCharacters = (o: { count: number, lowestCh: number }) => m(this.ERROR_IllegalCharacters, `File contains ${o.count} illegal character(s), including ${util.describeCodepoint(o.lowestCh)}`); static ERROR_IllegalCharacters = SevError | 0x0025; + + static Hint_CharClassImplicitDenorm = (o: { lowestCh: number }) => + m(this.HINT_CharClassImplicitDenorm, `File has character classes which span non-NFD character(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`); + static HINT_CharClassImplicitDenorm = SevHint | 0x0026; + + static Warn_CharClassExplicitDenorm = (o: { lowestCh: number }) => + m(this.WARN_CharClassExplicitDenorm, `File has character classes which include non-NFD characters(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`); + static WARN_CharClassExplicitDenorm = SevWarn | 0x0027; + } diff --git a/developer/src/kmc-ldml/src/compiler/tran.ts b/developer/src/kmc-ldml/src/compiler/tran.ts index 6b9c11f4d9d..558d2370ff4 100644 --- a/developer/src/kmc-ldml/src/compiler/tran.ts +++ b/developer/src/kmc-ldml/src/compiler/tran.ts @@ -141,6 +141,8 @@ export abstract class TransformCompiler { diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml new file mode 100644 index 00000000000..4323a879376 --- /dev/null +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml new file mode 100644 index 00000000000..9c9e3ae0554 --- /dev/null +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-hint-range2.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml new file mode 100644 index 00000000000..9ac22715818 --- /dev/null +++ b/developer/src/kmc-ldml/test/fixtures/sections/tran/tran-warn-range.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/developer/src/kmc-ldml/test/test-tran.ts b/developer/src/kmc-ldml/test/test-tran.ts index b91f554486b..bef11cd4356 100644 --- a/developer/src/kmc-ldml/test/test-tran.ts +++ b/developer/src/kmc-ldml/test/test-tran.ts @@ -245,6 +245,24 @@ describe('tran', function () { assert.strictEqual(transforms[0].to.value, "x"); } }, + { + subpath: 'sections/tran/tran-warn-range.xml', + warnings: [ + CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0xE1}), + ], + }, + { + subpath: 'sections/tran/tran-hint-range.xml', + warnings: [ + CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xc0}), + ], + }, + { + subpath: 'sections/tran/tran-hint-range2.xml', + warnings: [ + CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xC0}), + ], + }, ], tranDependencies); });