Skip to content

Commit

Permalink
Merge pull request #10614 from keymanapp/feat/developer/10316-range-w…
Browse files Browse the repository at this point in the history
…arning
  • Loading branch information
srl295 authored Feb 7, 2024
2 parents 889e9c1 + 5360cff commit 72c8988
Show file tree
Hide file tree
Showing 8 changed files with 238 additions and 34 deletions.
62 changes: 48 additions & 14 deletions common/web/types/src/util/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ toOneChar(value: string) : number {

export function describeCodepoint(ch : number) : string {
let s;
const p = getProblem(ch);
const p = BadStringAnalyzer.getProblem(ch);
if (p != null) {
// for example: 'PUA (U+E010)'
s = p;
Expand All @@ -208,10 +208,12 @@ export function describeCodepoint(ch : number) : string {
return `${s} (U+${Number(ch).toString(16).toUpperCase()})`;
}


export enum BadStringType {
pua = 'PUA',
unassigned = 'Unassigned',
illegal = 'Illegal',
denormalized = "Denormalized"
};

// Following from kmx_xstring.h / .cpp
Expand Down Expand Up @@ -314,34 +316,35 @@ class BadStringMap extends Map<BadStringType, Set<number>> {
}
}

function getProblem(ch : number) : BadStringType {
if (!isValidUnicode(ch)) {
return BadStringType.illegal;
} else if(isPUA(ch)) {
return BadStringType.pua;
} else { // TODO-LDML: unassigned
return null;
}
}
export class BadStringAnalyzer {
/** abstract class for analyzing and categorizing strings */
export abstract class StringAnalyzer {
/** add a string for analysis */
public add(s : string) {
for (const c of s) {
for (const c of [...s]) {
const ch = c.codePointAt(0);
const problem = getProblem(ch);
const problem = this.analyzeCodePoint(c, ch);
if (problem) {
this.addProblem(ch, problem);
}
}
}

private addProblem(ch : number, type : BadStringType) {
/**
* subclass interface
* @param c single codepoint to analyze (string)
* @param ch single codepoint to analyze (scalar)
*/
protected abstract analyzeCodePoint(c: string, ch: number) : BadStringType;

/** internal interface for the result of an analysis */
protected addProblem(ch : number, type : BadStringType) {
if (!this.m.has(type)) {
this.m.set(type, new Set<number>());
}
this.m.get(type).add(ch);
}

/** get the results of the analysis */
public analyze() : BadStringMap {
if (this.m.size == 0) {
return null;
Expand All @@ -350,5 +353,36 @@ export class BadStringAnalyzer {
}
}

/** internal map */
private m = new BadStringMap();
}

/** analyze a string looking for bad unicode */
export class BadStringAnalyzer extends StringAnalyzer {
/** analyze one codepoint */
protected analyzeCodePoint(c: string, ch: number): BadStringType {
return BadStringAnalyzer.getProblem(ch);
}
/** export analyzer function */
public static getProblem(ch: number) {
if (!isValidUnicode(ch)) {
return BadStringType.illegal;
} else if(isPUA(ch)) {
return BadStringType.pua;
} else { // TODO-LDML: unassigned
return null;
}
}
}

/** Analyzer that checks if something isn't NFD */
export class NFDAnalyzer extends StringAnalyzer {
protected analyzeCodePoint(c: string, ch: number): BadStringType {
const nfd = c.normalize("NFD");
if (c !== nfd) {
return BadStringType.denormalized;
} else {
return null;
}
}
}
80 changes: 60 additions & 20 deletions common/web/types/test/util/test-unescape.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import 'mocha';
import {assert} from 'chai';
import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString} from '../../src/util/util.js';
import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer} from '../../src/util/util.js';

describe('test UTF32 functions()', function() {
it('should properly categorize strings', () => {
Expand Down Expand Up @@ -207,31 +207,71 @@ describe('test BadStringAnalyzer', () => {
});
}
});
describe('should return nothing for all valid strings', () => {
it('should handle a case with some odd strs in it', () => {
const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." +
it('should handle a case with some odd strs in it', () => {
const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." +
([
0xF800,
0x05FFFF,
0x102222,
0x04FFFE,
].map(ch => String.fromCodePoint(ch)).join(''));

const bsa = new BadStringAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNotNull(m);
assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]);
assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [
0xE010,0xF800, 0x102222,
], `pua analysis`);
assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [
0xFDD0,0xD800,0xFFFE,
0x05FFFF,
0x04FFFE,
], `illegal analysis`);
});
const bsa = new BadStringAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNotNull(m);
assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]);
assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [
0xE010, 0xF800, 0x102222,
], `pua analysis`);
assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [
0xFDD0, 0xD800, 0xFFFE,
0x05FFFF,
0x04FFFE,
], `illegal analysis`);
});
});

describe('test NFDAnalyzer', () => {
describe('should return nothing for NFD strings', () => {
const cases = [
[],
['a',],
['a', 'b',]
];
for (const strs of cases) {
const title = titleize(strs);
it(`should analyze ${title}`, () => {
const bsa = new NFDAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNull(m, `${title}`);
});
}
});
it('should handle a case with some odd strs in it', () => {
const strs = "This text is in NFD, but not all of it is." +
([
0x00E8,
0x0344,
0x2FA1D,
].map(ch => String.fromCodePoint(ch)).join(''));

const bsa = new NFDAnalyzer();
for (const s of strs) {
bsa.add(s);
}
const m = bsa.analyze();
assert.isNotNull(m);
assert.containsAllKeys(m, [BadStringType.denormalized]);
assert.sameDeepMembers(Array.from(m.get(BadStringType.denormalized).values()), [
0x00E8,
0x0344,
0x2FA1D,
], `denorm analysis`);
});
});
9 changes: 9 additions & 0 deletions developer/src/kmc-ldml/src/compiler/messages.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ export class CompilerMessages {
static Error_IllegalCharacters = (o: { count: number, lowestCh: number }) =>
m(this.ERROR_IllegalCharacters, `File contains ${o.count} illegal character(s), including ${util.describeCodepoint(o.lowestCh)}`);
static ERROR_IllegalCharacters = SevError | 0x0025;

static Hint_CharClassImplicitDenorm = (o: { lowestCh: number }) =>
m(this.HINT_CharClassImplicitDenorm, `File has character classes which span non-NFD character(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`);
static HINT_CharClassImplicitDenorm = SevHint | 0x0026;

static Warn_CharClassExplicitDenorm = (o: { lowestCh: number }) =>
m(this.WARN_CharClassExplicitDenorm, `File has character classes which include non-NFD characters(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`);
static WARN_CharClassExplicitDenorm = SevWarn | 0x0027;

}


60 changes: 60 additions & 0 deletions developer/src/kmc-ldml/src/compiler/tran.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ export abstract class TransformCompiler<T extends TransformCompilerType, TranBas
// don't unescape literals here, because we are going to pass them through into the regex
cookedFrom = util.unescapeStringToRegex(cookedFrom);

this.checkRanges(cookedFrom); // check before normalizing

if (!sections?.meta?.normalizationDisabled) {
// nfd here.
cookedFrom = MarkerParser.nfd_markers(cookedFrom, true);
Expand Down Expand Up @@ -197,6 +199,64 @@ export abstract class TransformCompiler<T extends TransformCompilerType, TranBas
defaults.delete(this.id);
return defaults;
}

private checkRanges(cookedFrom: string) {
// extract all of the potential ranges - but don't match any-markers!
const anyRange = /(?<!\\uffff\\u0008)\[([^\]]+)\]/g;
const ranges = cookedFrom.matchAll(anyRange);

if (!ranges) return;

// extract inner members of a range (inside the [])
const rangeRegex = /(\\u[0-9a-fA-F]{4}|.)-(\\u[0-9a-fA-F]{4}|.)|./g;

const rangeExplicit = new util.NFDAnalyzer();
const rangeImplicit = new util.NFDAnalyzer();

/** process an explicit entry */
function processExplicit(s: string) {
if (s.startsWith('\\u{')) {
s = util.unescapeString(s);
} else if(s.startsWith('\\u')) {
s = util.unescapeOneQuadString(s);
}
rangeExplicit.add(s);
return s;
}

for (const [, sub] of ranges) {
const subRanges = sub.matchAll(rangeRegex);
for (const [all, start, end] of subRanges) {
if (!start && !end) {
// explicit single char
processExplicit(all); // matched one char
} else {
// start-end range - get explicit start and end chars
const s = processExplicit(start);
const sch = s.codePointAt(0);
const e = processExplicit(end);
const ech = e.codePointAt(0);
// now, process the inner chars, not including explicit
for (let n = sch; n < ech; n++) {
// add inner text
rangeImplicit.add(String.fromCodePoint(n));
}
}
}
}

// analyze ranges
const explicitSet = rangeExplicit.analyze()?.get(util.BadStringType.denormalized);
if (explicitSet) {
this.callbacks.reportMessage(CompilerMessages.Warn_CharClassExplicitDenorm({ lowestCh: explicitSet.values().next().value }));
} else {
// don't analyze the implicit set of THIS range, if explicit is already problematic
const implicitSet = rangeImplicit.analyze()?.get(util.BadStringType.denormalized);
if (implicitSet) {
this.callbacks.reportMessage(CompilerMessages.Hint_CharClassImplicitDenorm({ lowestCh: implicitSet.values().next().value }));
}
}
}
}

export class TranCompiler extends TransformCompiler<'simple', Tran /*, TranItem*/> {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
<keyboard3 locale="mt" conformsTo="techpreview">
<info name="tran-hint-range" />

<keys />

<transforms type="simple">
<transformGroup>
<!-- all NFD but crosses non-NFD-->
<transform from="[\u{0020}-\u{0250}]" to="x" />
</transformGroup>
</transforms>
</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>

<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
<keyboard3 locale="mt" conformsTo="techpreview">
<info name="tran-hint-range"/>

<keys />

<transforms type="simple">
<transformGroup>
<!-- all NFD but crosses non-NFD -->
<transform from="[a-ɐ]" to="x"/>
</transformGroup>
</transforms>
</keyboard3>
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
<keyboard3 locale="mt" conformsTo="techpreview">
<info name="tran-hint-range" />

<keys />

<transforms type="simple">
<transformGroup>
<!-- warning, includes NFC start/end-->
<transform from="[á-é]" to="x" />
</transformGroup>
</transforms>
</keyboard3>
18 changes: 18 additions & 0 deletions developer/src/kmc-ldml/test/test-tran.ts
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,24 @@ describe('tran', function () {
assert.strictEqual(transforms[0].to.value, "x");
}
},
{
subpath: 'sections/tran/tran-warn-range.xml',
warnings: [
CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0xE1}),
],
},
{
subpath: 'sections/tran/tran-hint-range.xml',
warnings: [
CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xc0}),
],
},
{
subpath: 'sections/tran/tran-hint-range2.xml',
warnings: [
CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xC0}),
],
},
], tranDependencies);
});

Expand Down

0 comments on commit 72c8988

Please sign in to comment.