keymanapp · srl295 · Feb 7, 2024 · Feb 2, 2024 · Feb 2, 2024 · Feb 2, 2024
@@ -197,7 +197,7 @@ toOneChar(value: string) : number {
 
 export function describeCodepoint(ch : number) : string {
   let s;
-  const p = getProblem(ch);
+  const p = BadStringAnalyzer.getProblem(ch);
   if (p != null) {
     // for example: 'PUA (U+E010)'
     s = p;
@@ -208,10 +208,12 @@ export function describeCodepoint(ch : number) : string {
   return `${s} (U+${Number(ch).toString(16).toUpperCase()})`;
 }
 
+
 export enum BadStringType {
   pua = 'PUA',
   unassigned = 'Unassigned',
   illegal = 'Illegal',
+  denormalized = "Denormalized"
 };
 
 // Following from kmx_xstring.h / .cpp
@@ -314,34 +316,35 @@ class BadStringMap extends Map<BadStringType, Set<number>> {
   }
 }
 
-function getProblem(ch : number) : BadStringType {
-  if (!isValidUnicode(ch)) {
-    return BadStringType.illegal;
-  } else if(isPUA(ch)) {
-    return BadStringType.pua;
-  } else { // TODO-LDML: unassigned
-    return null;
-  }
-}
-export class BadStringAnalyzer {
+/** abstract class for analyzing and categorizing strings */
+export abstract class StringAnalyzer {
   /** add a string for analysis */
   public add(s : string) {
-    for (const c of s) {
+    for (const c of [...s]) {
       const ch = c.codePointAt(0);
-      const problem = getProblem(ch);
+      const problem = this.analyzeCodePoint(c, ch);
       if (problem) {
         this.addProblem(ch, problem);
       }
     }
   }
 
-  private addProblem(ch : number, type : BadStringType) {
+  /**
+   * subclass interface
+   * @param c single codepoint to analyze (string)
+   * @param ch single codepoint to analyze (scalar)
+   */
+  protected abstract analyzeCodePoint(c: string, ch: number) : BadStringType;
+
+  /** internal interface for the result of an analysis */
+  protected addProblem(ch : number, type : BadStringType) {
     if (!this.m.has(type)) {
       this.m.set(type, new Set<number>());
     }
     this.m.get(type).add(ch);
   }
 
+  /** get the results of the analysis */
   public analyze() : BadStringMap {
     if (this.m.size == 0) {
       return null;
@@ -350,5 +353,36 @@ export class BadStringAnalyzer {
     }
   }
 
+  /** internal map */
   private m = new BadStringMap();
 }
+
+/** analyze a string looking for bad unicode */
+export class BadStringAnalyzer extends StringAnalyzer {
+  /** analyze one codepoint */
+  protected analyzeCodePoint(c: string, ch: number): BadStringType {
+    return BadStringAnalyzer.getProblem(ch);
+  }
+  /** export analyzer function  */
+  public static getProblem(ch: number) {
+    if (!isValidUnicode(ch)) {
+      return BadStringType.illegal;
+    } else if(isPUA(ch)) {
+      return BadStringType.pua;
+    } else { // TODO-LDML: unassigned
+      return null;
+    }
+  }
+}
+
+/** Analyzer that checks if something isn't NFD */
+export class NFDAnalyzer extends StringAnalyzer {
+  protected analyzeCodePoint(c: string, ch: number): BadStringType {
+    const nfd = c.normalize("NFD");
+    if (c !== nfd) {
+      return BadStringType.denormalized;
+    } else {
+      return null;
+    }
+  }
+}
@@ -1,6 +1,6 @@
 import 'mocha';
 import {assert} from 'chai';
-import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString} from '../../src/util/util.js';
+import {unescapeString, UnescapeError, isOneChar, toOneChar, unescapeOneQuadString, BadStringAnalyzer, isValidUnicode, describeCodepoint, isPUA, BadStringType, unescapeStringToRegex, unescapeQuadString, NFDAnalyzer} from '../../src/util/util.js';
 
 describe('test UTF32 functions()', function() {
   it('should properly categorize strings', () => {
@@ -207,31 +207,71 @@ describe('test BadStringAnalyzer', () => {
       });
     }
   });
-  describe('should return nothing for all valid strings', () => {
-    it('should handle a case with some odd strs in it', () => {
-      const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." +
+  it('should handle a case with some odd strs in it', () => {
+    const strs = "But you can call me “\uE010\uFDD0\uFFFE\uD800”, for short." +
       ([
         0xF800,
         0x05FFFF,
         0x102222,
         0x04FFFE,
       ].map(ch => String.fromCodePoint(ch)).join(''));
 
-      const bsa = new BadStringAnalyzer();
-      for (const s of strs) {
-        bsa.add(s);
-      }
-      const m = bsa.analyze();
-      assert.isNotNull(m);
-      assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]);
-      assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [
-        0xE010,0xF800, 0x102222,
-      ], `pua analysis`);
-      assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [
-        0xFDD0,0xD800,0xFFFE,
-        0x05FFFF,
-        0x04FFFE,
-      ], `illegal analysis`);
-    });
+    const bsa = new BadStringAnalyzer();
+    for (const s of strs) {
+      bsa.add(s);
+    }
+    const m = bsa.analyze();
+    assert.isNotNull(m);
+    assert.containsAllKeys(m, [BadStringType.pua, BadStringType.illegal]);
+    assert.sameDeepMembers(Array.from(m.get(BadStringType.pua).values()), [
+      0xE010, 0xF800, 0x102222,
+    ], `pua analysis`);
+    assert.sameDeepMembers(Array.from(m.get(BadStringType.illegal).values()), [
+      0xFDD0, 0xD800, 0xFFFE,
+      0x05FFFF,
+      0x04FFFE,
+    ], `illegal analysis`);
+  });
+});
+
+describe('test NFDAnalyzer', () => {
+  describe('should return nothing for NFD strings', () => {
+    const cases = [
+      [],
+      ['a',],
+      ['a', 'b',]
+    ];
+    for (const strs of cases) {
+      const title = titleize(strs);
+      it(`should analyze ${title}`, () => {
+        const bsa = new NFDAnalyzer();
+        for (const s of strs) {
+          bsa.add(s);
+        }
+        const m = bsa.analyze();
+        assert.isNull(m, `${title}`);
+      });
+    }
+  });
+  it('should handle a case with some odd strs in it', () => {
+    const strs = "This text is in NFD, but not all of it is." +
+      ([
+        0x00E8,
+        0x0344,
+        0x2FA1D,
+      ].map(ch => String.fromCodePoint(ch)).join(''));
+
+    const bsa = new NFDAnalyzer();
+    for (const s of strs) {
+      bsa.add(s);
+    }
+    const m = bsa.analyze();
+    assert.isNotNull(m);
+    assert.containsAllKeys(m, [BadStringType.denormalized]);
+    assert.sameDeepMembers(Array.from(m.get(BadStringType.denormalized).values()), [
+      0x00E8,
+      0x0344,
+      0x2FA1D,
+  ], `denorm analysis`);
   });
 });
@@ -157,6 +157,15 @@ export class CompilerMessages {
   static Error_IllegalCharacters = (o: { count: number, lowestCh: number }) =>
   m(this.ERROR_IllegalCharacters, `File contains ${o.count} illegal character(s), including ${util.describeCodepoint(o.lowestCh)}`);
   static ERROR_IllegalCharacters = SevError | 0x0025;
+
+  static Hint_CharClassImplicitDenorm = (o: { lowestCh: number }) =>
+  m(this.HINT_CharClassImplicitDenorm, `File has character classes which span non-NFD character(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`);
+  static HINT_CharClassImplicitDenorm = SevHint | 0x0026;
+
+  static Warn_CharClassExplicitDenorm = (o: { lowestCh: number }) =>
+  m(this.WARN_CharClassExplicitDenorm, `File has character classes which include non-NFD characters(s), including ${util.describeCodepoint(o.lowestCh)}. These will not match any text.`);
+  static WARN_CharClassExplicitDenorm = SevWarn | 0x0027;
+
 }
 
 
@@ -141,6 +141,8 @@ export abstract class TransformCompiler<T extends TransformCompilerType, TranBas
     // don't unescape literals here, because we are going to pass them through into the regex
     cookedFrom = util.unescapeStringToRegex(cookedFrom);
 
+    this.checkRanges(cookedFrom); // check before normalizing
+
     if (!sections?.meta?.normalizationDisabled) {
       // nfd here.
       cookedFrom = MarkerParser.nfd_markers(cookedFrom, true);
@@ -197,6 +199,64 @@ export abstract class TransformCompiler<T extends TransformCompilerType, TranBas
     defaults.delete(this.id);
     return defaults;
   }
+
+  private checkRanges(cookedFrom: string) {
+    // extract all of the potential ranges - but don't match any-markers!
+    const anyRange = /(?<!\\uffff\\u0008)\[([^\]]+)\]/g;
+    const ranges = cookedFrom.matchAll(anyRange);
+
+    if (!ranges) return;
+
+    // extract inner members of a range (inside the [])
+    const rangeRegex = /(\\u[0-9a-fA-F]{4}|.)-(\\u[0-9a-fA-F]{4}|.)|./g;
+
+    const rangeExplicit = new util.NFDAnalyzer();
+    const rangeImplicit = new util.NFDAnalyzer();
+
+    /** process an explicit entry */
+    function processExplicit(s: string) {
+      if (s.startsWith('\\u{')) {
+        s = util.unescapeString(s);
+      } else if(s.startsWith('\\u')) {
+        s = util.unescapeOneQuadString(s);
+      }
+      rangeExplicit.add(s);
+      return s;
+    }
+
+    for (const [, sub] of ranges) {
+      const subRanges = sub.matchAll(rangeRegex);
+      for (const [all, start, end] of subRanges) {
+        if (!start && !end) {
+          // explicit single char
+          processExplicit(all); // matched one char
+        } else {
+          // start-end range - get explicit start and end chars
+          const s = processExplicit(start);
+          const sch = s.codePointAt(0);
+          const e = processExplicit(end);
+          const ech = e.codePointAt(0);
+          // now, process the inner chars, not including explicit
+          for (let n = sch; n < ech; n++) {
+            // add inner text
+            rangeImplicit.add(String.fromCodePoint(n));
+          }
+        }
+      }
+    }
+
+    // analyze ranges
+    const explicitSet = rangeExplicit.analyze()?.get(util.BadStringType.denormalized);
+    if (explicitSet) {
+      this.callbacks.reportMessage(CompilerMessages.Warn_CharClassExplicitDenorm({ lowestCh: explicitSet.values().next().value }));
+    } else {
+      // don't analyze the implicit set of THIS range, if explicit is already problematic
+      const implicitSet = rangeImplicit.analyze()?.get(util.BadStringType.denormalized);
+      if (implicitSet) {
+        this.callbacks.reportMessage(CompilerMessages.Hint_CharClassImplicitDenorm({ lowestCh: implicitSet.values().next().value }));
+      }
+    }
+  }
 }
 
 export class TranCompiler extends TransformCompiler<'simple', Tran /*, TranItem*/> {

@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
+<keyboard3 locale="mt" conformsTo="techpreview">
+    <info name="tran-hint-range" />
+
+    <keys />
+
+    <transforms type="simple">
+        <transformGroup>
+            <!-- all NFD but crosses non-NFD-->
+            <transform from="[\u{0020}-\u{0250}]" to="x" />
+        </transformGroup>
+    </transforms>
+</keyboard3>
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
+<keyboard3 locale="mt" conformsTo="techpreview">
+  <info name="tran-hint-range"/>
+
+  <keys />
+
+  <transforms type="simple">
+    <transformGroup>
+      <!-- all NFD but crosses non-NFD -->
+      <transform from="[a-ɐ]" to="x"/>
+    </transformGroup>
+  </transforms>
+</keyboard3>
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE keyboard3 SYSTEM "../../../../../../../resources/standards-data/ldml-keyboards/techpreview/dtd/ldmlKeyboard3.dtd">
+<keyboard3 locale="mt" conformsTo="techpreview">
+    <info name="tran-hint-range" />
+
+    <keys />
+
+    <transforms type="simple">
+        <transformGroup>
+            <!-- warning, includes NFC start/end-->
+            <transform from="[á-é]" to="x" />
+        </transformGroup>
+    </transforms>
+</keyboard3>
@@ -245,6 +245,24 @@ describe('tran', function () {
         assert.strictEqual(transforms[0].to.value, "x");
       }
     },
+    {
+      subpath: 'sections/tran/tran-warn-range.xml',
+      warnings: [
+        CompilerMessages.Warn_CharClassExplicitDenorm({lowestCh: 0xE1}),
+      ],
+    },
+    {
+      subpath: 'sections/tran/tran-hint-range.xml',
+      warnings: [
+        CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xc0}),
+      ],
+    },
+    {
+      subpath: 'sections/tran/tran-hint-range2.xml',
+      warnings: [
+        CompilerMessages.Hint_CharClassImplicitDenorm({lowestCh: 0xC0}),
+      ],
+    },
   ], tranDependencies);
 });