Skip to content

Commit

Permalink
Disambiguation
Browse files Browse the repository at this point in the history
  • Loading branch information
Porges committed Aug 5, 2023
1 parent f813b3a commit c161644
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 20 deletions.
20 changes: 10 additions & 10 deletions transliteration_testdata.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2112,7 +2112,7 @@ export const hindi: [string, string][] = [
["औपचारिकता", "aupacārikatā"],
["और", "aura"],
["औऱ", "auṟa"],
["औरंगउटान", "auraṅgauṭāna"],
["औरंगउटान", "auraṅga:uṭāna"],
["औरंगज़ेब", "auraṅgazēba"],
["औरंगाबाद", "auraṅgābāda"],
["औरत", "aurata"],
Expand Down Expand Up @@ -2152,7 +2152,7 @@ export const hindi: [string, string][] = [
["कंवलजीत", "kãvalajīta"],
["कंसर्ट", "kãsarṭa"],
["कंसल्टेंसी", "kãsalṭē̃sī"],
["कइयों", "kaiyō̃"],
["कइयों", "ka:iyō̃"],
["कई", "kaī"],
["कक्कड़", "kakkaṛa"],
["कक्ष", "kakṣa"],
Expand Down Expand Up @@ -3577,7 +3577,7 @@ export const hindi: [string, string][] = [
["गंधक", "gandhaka"],
["गंभीर", "gambhīra"],
["गंभीरता", "gambhīratā"],
["गइ", "gai"],
["गइ", "ga:i"],
["गई", "gaī"],
["गईं", "gaī̃"],
["गए", "gaē"],
Expand Down Expand Up @@ -9321,7 +9321,7 @@ export const hindi: [string, string][] = [
["बंधुओं", "bandhuō̃"],
["बंधुत्व", "bandhutva"],
["बंधे", "bandhē"],
["बंबइया", "bambaiyā"],
["बंबइया", "bamba:iyā"],
["बंबई", "bambaī"],
["बंसोडे", "bãsōḍē"],
["बंसोदे", "bãsōdē"],
Expand Down Expand Up @@ -9562,7 +9562,7 @@ export const hindi: [string, string][] = [
["बबली", "babalī"],
["बबलू", "babalū"],
["बम", "bama"],
["बम्बइया", "bambaiyā"],
["बम्बइया", "bamba:iyā"],
["बम्बई", "bambaī"],
["बमवर्षक", "bamavarṣaka"],
["बमों", "bamō̃"],
Expand Down Expand Up @@ -11291,7 +11291,7 @@ export const hindi: [string, string][] = [
["मुंगेर", "muṅgēra"],
["मुंजाल", "muñjāla"],
["मुंडा", "muṇḍā"],
["मुंबइया", "mumbaiyā"],
["मुंबइया", "mumba:iyā"],
["मुंबई", "mumbaī"],
["मुंशी", "mũśī"],
["मुंह", "mũha"],
Expand Down Expand Up @@ -13463,8 +13463,8 @@ export const hindi: [string, string][] = [
["वेश्यालय", "vēśyālaya"],
["वेषभूषा", "vēṣabhūṣā"],
["वेस्ट", "vēsṭa"],
["वेस्टइंडीज", "vēsṭaiṇḍīja"],
["वेस्टइंडीज़", "vēsṭaiṇḍīza"],
["वेस्टइंडीज", "vēsṭa:iṇḍīja"],
["वेस्टइंडीज़", "vēsṭa:iṇḍīza"],
["वैंगांकर", "vaiṅgāṅkara"],
["वैंडेनवर्ग", "vaiṇḍēnavarga"],
["वैकम", "vaikama"],
Expand Down Expand Up @@ -14068,7 +14068,7 @@ export const hindi: [string, string][] = [
["संहार", "sãhāra"],
["संहिता", "sãhitā"],
["सईद", "saīda"],
["सउदी", "saudī"],
["सउदी", "sa:udī"],
["सऊदी", "saūdī"],
["स्कडमोर", "skaḍamōra"],
["सकता", "sakatā"],
Expand Down Expand Up @@ -14322,7 +14322,7 @@ export const hindi: [string, string][] = [
["स्पाइबॉट", "spāibôṭa"],
["स्पाइवेयर", "spāivēyara"],
["सपाट", "sapāṭa"],
["स्पायवेयरइन्फ़ो", "spāyavēyarainfō"],
["स्पायवेयरइन्फ़ो", "spāyavēyara:infō"],
["स्पिटज़र", "spiṭazara"],
["स्पिति", "spiti"],
["स्पिन", "spina"],
Expand Down
16 changes: 13 additions & 3 deletions transliterator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ it("should be good", () => {
describe("unicode tests", () => {
const tests = [
["ऱ्", "ṟ"],
["र्\u200d", "r‍"], // should ZWJ be removed?
];

it.each(tests)("check %s", (input, output) => {
Expand Down Expand Up @@ -59,7 +58,20 @@ describe("Malayalam special forms", () => {
it.each(tests)("check %s", (input, output) => {
expect(translit(input)).toEqual(output);
});
});

// Devanagari-specific expectations. Each row is [input, expected transliteration].
describe("Devanagari special cases", () => {
  const cases = [
    // Independent vowel इ after a consonant with no vowel mark is written
    // 'a:i' so that it stays distinguishable from 'ai' ...
    ["बइ", "ba:i"],
    // ... which is what the same consonant with the dependent vowel sign yields.
    ["बै", "bai"],
    // RA + virama + YA is expected to come out without any separator.
    ["र्य", "rya"],
    // Disabled alternative expectation for the same input:
    //["र्य", "r:ya"],
    // "eyelash R" as used in Nepali/Marathi. NOTE(review): the input ends with
    // an invisible joiner after the virama (presumably U+200D) — keep it intact.
    ["र्‍", "r̆"],
  ];

  it.each(cases)("check %s", (source, expected) => {
    const actual = translit(source);
    expect(actual).toEqual(expected);
  });
});

describe("npm transliteration", () => {
Expand Down Expand Up @@ -91,10 +103,8 @@ describe("conjuncts", () => {
});
});

/*
// Runs every [input, expected] pair from the `hindi` table through translit
// and expects the result to equal the recorded transliteration.
describe('hindi tests', () => {
  it.each(hindi)("check %s", (source, expected) => {
    const actual = translit(source);
    expect(actual).toEqual(expected);
  });
});
*/
33 changes: 26 additions & 7 deletions transliterator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,20 +46,31 @@ const letters = expand(new Map<string | string[], Letter>([
[["ऽ", "ഽ", "ಽ"], "’"],
[["अ", "അ", "ಅ"], { value: "a", vowelType: "full" }],
[["आ", "ആ", "ಆ"], { value: "ā", vowelType: "full" }],
[["इ", /*,*/ "ಇ"], { value: "i", vowelType: "full" }],
["ഇ", {
[["इ", "ഇ", "ಇ"], {
value: "i",
vowelType: "full",
onPrev: [
// if previous is letter with no vowel mark
// (so it will be -a), this is 'a:i' to distinguish from 'ai'
// really this should test that previous is also a Malayalam character
// really this should test that previous is also a script character
// but can't do that without /v which is unsupported in Node currently.
[/(?<=\p{Letter})/uy, ":i"]
// this would be simpler if we checked upon the transliterated output...
[/(?<=\p{Letter})(?<![आആಆइഇಇईഈಈउഉಉ])/uy, ":i"]
]
}],
[["ई", "ഈ", "ಈ"], { value: "ī", vowelType: "full" }],
[["उ", "ഉ", "ಉ"], { value: "u", vowelType: "full" }],
[["उ", "ഉ", "ಉ"], {
value: "u",
vowelType: "full",
onPrev: [
// if previous is letter with no vowel mark
// (so it will be -a), this is 'a:u' to distinguish from 'au'
// really this should test that previous is also a script character
// but can't do that without /v which is unsupported in Node currently.
// this would be simpler if we checked upon the transliterated output...
[/(?<=\p{Letter})(?<![आആಆइഇಇईഈಈउഉಉ])/uy, ":u"]
]
}],
[["ऊ", "ഊ", "ಊ"], { value: "ū", vowelType: "full" }],
[["ऋ", "ഋ", "ಋ"], { value: "r̥", vowelType: "semi" }],
[["ॠ", "ൠ", "ಌೠ"], { value: "r̥̄", vowelType: "semi" }],
Expand Down Expand Up @@ -107,14 +118,21 @@ const letters = expand(new Map<string | string[], Letter>([
[/(?<=്)\p{Letter}/uy, "y:"] // special medial form for suppressed vowel in Malayalam
]
}],
[["र", /*,*/ "ರ"], { value: "r", implicitVowel: "a" }],
["र", {
value: "r",
implicitVowel: "a",
onNext: [
[/(?<=्\u200d)/uy, "r̆"] // special case for "eyelash-R" in Nepali/Marathi
]
}],
["ര", {
value: "r",
implicitVowel: "a",
onNext: [
[wordFinal, "ṟ"] // special final form in Malayalam
]
}],
[["ರ"], { value: "r", implicitVowel: "a" }],
[["ऱ", "റ", "ಱ"], { value: "ṟ", implicitVowel: "a" }], // TODO: Malayalam special final form
[["ल", "ല", "ಲ"], { value: "l", implicitVowel: "a" }],
[["ळ", "ള", "ಳ"], { value: "ḷ", implicitVowel: "a" }],
Expand Down Expand Up @@ -262,7 +280,8 @@ function expand<K>(input: Map<string | string[], K>): Map<string, K> {
}

// include Nuktas alongside character, for matching
const regex = /((?!\p{Script=Latin})\p{L}[಼\u093c]?)(\p{M}*)/gu;
// allow suffix ZWJ, for now
const regex = /((?!\p{Script=Latin})\p{L}[಼\u093c]?)(\p{M}*)\u200d?/gu;

export function translit(value: string): string {
return value.replaceAll(regex, replacement);
Expand Down

0 comments on commit c161644

Please sign in to comment.