Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(developer): range warning for non-NFD chars 🙀 #10614

Merged
merged 7 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 48 additions & 14 deletions common/web/types/src/util/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ toOneChar(value: string) : number {

export function describeCodepoint(ch : number) : string {
let s;
const p = getProblem(ch);
const p = BadStringAnalyzer.getProblem(ch);
if (p != null) {
// for example: 'PUA (U+E010)'
s = p;
Expand All @@ -208,10 +208,12 @@ export function describeCodepoint(ch : number) : string {
return `${s} (U+${Number(ch).toString(16).toUpperCase()})`;
}


export enum BadStringType {
pua = 'PUA',
unassigned = 'Unassigned',
illegal = 'Illegal',
denormalized = "Denormalized"
};

// Following from kmx_xstring.h / .cpp
Expand Down Expand Up @@ -314,34 +316,35 @@ class BadStringMap extends Map<BadStringType, Set<number>> {
}
}

function getProblem(ch : number) : BadStringType {
if (!isValidUnicode(ch)) {
return BadStringType.illegal;
} else if(isPUA(ch)) {
return BadStringType.pua;
} else { // TODO-LDML: unassigned
return null;
}
}
export class BadStringAnalyzer {
/** abstract class for analyzing and categorizing strings */
export abstract class StringAnalyzer {
/** add a string for analysis */
public add(s : string) {
for (const c of s) {
for (const c of [...s]) {
const ch = c.codePointAt(0);
const problem = getProblem(ch);
const problem = this.analyzeCodePoint(c, ch);
if (problem) {
this.addProblem(ch, problem);
}
}
}

private addProblem(ch : number, type : BadStringType) {
/**
* subclass interface
* @param c single codepoint to analyze (string)
* @param ch single codepoint to analyze (scalar)
*/
protected abstract analyzeCodePoint(c: string, ch: number) : BadStringType;

/** internal interface for the result of an analysis */
protected addProblem(ch : number, type : BadStringType) {
if (!this.m.has(type)) {
this.m.set(type, new Set<number>());
}
this.m.get(type).add(ch);
}

/** get the results of the analysis */
public analyze() : BadStringMap {
if (this.m.size == 0) {
return null;
Expand All @@ -350,5 +353,36 @@ export class BadStringAnalyzer {
}
}

/** internal map */
private m = new BadStringMap();
}

/** analyze a string looking for bad unicode */
export class BadStringAnalyzer extends StringAnalyzer {
/** analyze one codepoint */
protected analyzeCodePoint(c: string, ch: number): BadStringType {
return BadStringAnalyzer.getProblem(ch);
}
/** export analyzer function */
public static getProblem(ch: number) {
if (!isValidUnicode(ch)) {
return BadStringType.illegal;
} else if(isPUA(ch)) {
return BadStringType.pua;
} else { // TODO-LDML: unassigned
return null;
}
}
}

/** Analyzer that checks if something isn't NFD */
export class NFDAnalyzer extends StringAnalyzer {
protected analyzeCodePoint(c: string, ch: number): BadStringType {
const nfd = c.normalize("NFD");
if (c !== nfd) {
return BadStringType.denormalized;
} else {
return null;
}
}
Comment on lines +380 to +387
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder about the performance of this -- seems like it could be quite expensive?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right. maybe some kind of cache would help.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, could perhaps cache?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's only on actual ranges in transforms.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, limited use so we can probably ignore for now. If it's always a single codepoint we could build a map and ship it -- trivial. Danger with that is that if we use normalize() elsewhere then the map could be inconsistent wrt Unicode version.

}
80 changes: 60 additions & 20 deletions common/web/types/test/util/test-unescape.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import 'mocha';
import {assert} from 'chai';
import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString} from '../../src/util/util.js';
import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer} from '../../src/util/util.js';

describe('test UTF32 functions()', function() {
it('should properly categorize strings', () => {
Expand Down Expand Up @@ -207,31 +207,71 @@ describe('test BadStringAnalyzer', () => {
});
}
});
describe('should return nothing for all valid strings', () => {
it('should handle a case with some odd strs in it', () => {
const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." +
it('should handle a case with some odd strs in it', () => {
const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." +
([
0xF800,
0x05FFFF,
0x102222,
0x04FFFE,
].map(ch => String.fromCodePoint(ch)).join(''));

const bsa = new BadStringAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNotNull(m);
assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]);
assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [
0xE010,0xF800, 0x102222,
], `pua analysis`);
assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [
0xFDD0,0xD800,0xFFFE,
0x05FFFF,
0x04FFFE,
], `illegal analysis`);
});
const bsa = new BadStringAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNotNull(m);
assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]);
assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [
0xE010, 0xF800, 0x102222,
], `pua analysis`);
assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [
0xFDD0, 0xD800, 0xFFFE,
0x05FFFF,
0x04FFFE,
], `illegal analysis`);
});
});

describe('test NFDAnalyzer', () => {
describe('should return nothing for NFD strings', () => {
const cases = [
[],
['a',],
['a', 'b',]
];
for (const strs of cases) {
const title = titleize(strs);
it(`should analyze ${title}`, () => {
const bsa = new NFDAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNull(m, `${title}`);
});
}
});
it('should handle a case with some odd strs in it', () => {
const strs = "This text is in NFD, but not all of it is." +
([
0x00E8,
0x0344,
0x2FA1D,
].map(ch => String.fromCodePoint(ch)).join(''));

const bsa = new NFDAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNotNull(m);
assert.containsAllKeys(m, [BadStringType.denormalized]);
assert.sameDeepMembers(Array.from(m.get(BadStringType.denormalized).values()), [
0x00E8,
0x0344,
0x2FA1D,
], `denorm analysis`);
});
});
9 changes: 9 additions & 0 deletions developer/src/kmc-ldml/src/compiler/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ export class CompilerMessages {
static Error_IllegalCharacters = (o: { count: number, lowestCh: number }) =>
m(this.ERROR_IllegalCharacters, `File contains ${o.count} illegal character(s), including ${util.describeCodepoint(o.lowestCh)}`);
static ERROR_IllegalCharacters = SevError | 0x0025;

static Hint_CharClassImplicitDenorm = (o: { lowestCh: number }) =>
m(this.HINT_CharClassImplicitDenorm, `File has character classes which span non-NFD character(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`);
static HINT_CharClassImplicitDenorm = SevHint | 0x0026;

static Warn_CharClassExplicitDenorm = (o: { lowestCh: number }) =>
m(this.WARN_CharClassExplicitDenorm, `File has character classes which include non-NFD characters(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`);
static WARN_CharClassExplicitDenorm = SevWarn | 0x0027;

}


60 changes: 60 additions & 0 deletions developer/src/kmc-ldml/src/compiler/tran.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ export abstract class TransformCompiler<T extends TransformCompilerType, TranBas
// don't unescape literals here, because we are going to pass them through into the regex
cookedFrom = util.unescapeStringToRegex(cookedFrom);

this.checkRanges(cookedFrom); // check before normalizing

if (!sections?.meta?.normalizationDisabled) {
// nfd here.
cookedFrom = MarkerParser.nfd_markers(cookedFrom, true);
Expand Down Expand Up @@ -197,6 +199,64 @@ export abstract class TransformCompiler<T extends TransformCompilerType, TranBas
defaults.delete(this.id);
return defaults;
}

private checkRanges(cookedFrom: string) {
// extract all of the potential ranges - but don't match any-markers!
const anyRange = /(?<!\\uffff\\u0008)\[([^\]]+)\]/g;
const ranges = cookedFrom.matchAll(anyRange);

if (!ranges) return;

// extract inner members of a range (inside the [])
const rangeRegex = /(\\u[0-9a-fA-F]{4}|.)-(\\u[0-9a-fA-F]{4}|.)|./g;

const rangeExplicit = new util.NFDAnalyzer();
const rangeImplicit = new util.NFDAnalyzer();

/** process an explicit entry */
function processExplicit(s: string) {
if (s.startsWith('\\u{')) {
s = util.unescapeString(s);
} else if(s.startsWith('\\u')) {
s = util.unescapeOneQuadString(s);
}
rangeExplicit.add(s);
return s;
}

for (const [, sub] of ranges) {
const subRanges = sub.matchAll(rangeRegex);
for (const [all, start, end] of subRanges) {
if (!start && !end) {
// explicit single char
processExplicit(all); // matched one char
} else {
// start-end range - get explicit start and end chars
const s = processExplicit(start);
const sch = s.codePointAt(0);
const e = processExplicit(end);
const ech = e.codePointAt(0);
// now, process the inner chars, not including explicit
for (let n = sch; n < ech; n++) {
// add inner text
rangeImplicit.add(String.fromCodePoint(n));
}
}
}
}

// analyze ranges
const explicitSet = rangeExplicit.analyze()?.get(util.BadStringType.denormalized);
if (explicitSet) {
this.callbacks.reportMessage(CompilerMessages.Warn_CharClassExplicitDenorm({ lowestCh: explicitSet.values().next().value }));
} else {
// don't analyze the implicit set of THIS range, if explicit is already problematic
const implicitSet = rangeImplicit.analyze()?.get(util.BadStringType.denormalized);
if (implicitSet) {
this.callbacks.reportMessage(CompilerMessages.Hint_CharClassImplicitDenorm({ lowestCh: implicitSet.values().next().value }));
}
}
}
}

export class TranCompiler extends TransformCompiler<'simple', Tran /*, TranItem*/> {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
<keyboard3 locale="mt" conformsTo="techpreview">
<info name="tran-hint-range" />

<keys />

<transforms type="simple">
<transformGroup>
<!-- all NFD but crosses non-NFD-->
<transform from="[\u{0020}-\u{0250}]" to="x" />
</transformGroup>
</transforms>
</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>

<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
<keyboard3 locale="mt" conformsTo="techpreview">
<info name="tran-hint-range"/>

<keys />

<transforms type="simple">
<transformGroup>
<!-- all NFD but crosses non-NFD -->
<transform from="[a-ɐ]" to="x"/>
</transformGroup>
</transforms>
</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
<keyboard3 locale="mt" conformsTo="techpreview">
<info name="tran-hint-range" />

<keys />

<transforms type="simple">
<transformGroup>
<!-- warning, includes NFC start/end-->
<transform from="[á-é]" to="x" />
</transformGroup>
</transforms>
</keyboard3>
18 changes: 18 additions & 0 deletions developer/src/kmc-ldml/test/test-tran.ts
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,24 @@ describe('tran', function () {
assert.strictEqual(transforms[0].to.value, "x");
}
},
{
subpath: 'sections/tran/tran-warn-range.xml',
warnings: [
CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0xE1}),
],
},
{
subpath: 'sections/tran/tran-hint-range.xml',
warnings: [
CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xc0}),
],
},
{
subpath: 'sections/tran/tran-hint-range2.xml',
warnings: [
CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xC0}),
],
},
], tranDependencies);
});

Expand Down
Loading