From a9ca12bb37d7fd11a2d408afb83901e73a26852e Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 18:20:39 +0000 Subject: [PATCH 1/8] fix: update README to include information on correctness --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index d8ce0ce..52bc0b8 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,9 @@ applyCase("þgf", "Helga Fríða Smáradóttir"); - [Usage](#Usage) - [Cases](#Cases) - [Whitespace](#Whitespace) +- [Correctness](#Correctness) + - [Passing a name in the wrong case](#Passing_a_name_in_the_wrong_case) + - [What happens if beygla does not find a pattern?](#What_happens_if_beygla_does_not_find_a_pattern) --- @@ -168,3 +171,23 @@ If the name includes superfluous whitespace, `applyCase` removes it. applyCase("þgf", " \n Helga Dís\tSmáradóttir \n\n"); //=> "Helgu Dís Smáradóttur" ``` + +## Correctness + +Beygla will correctly apply the desired case to the input name in most cases. + +Most Icelandic names (81%), especially common ones, are present on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). Beygla is guaranteed to produce a correct result for those names. + +This does not mean that Beygla produces an incorrect result for the other 19% of names. Beygla finds patterns in name endings based on the data on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/) and applies those patterns any name. This means that beygla will produce a correct result for most names, even if the name is not in the dataset on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). + + +### Passing a name in the wrong case + +Beygla operates on the assumption that names provided to it are in the nominative case (nefnifall). If a name provided to beygla is in another case than nominative, an incorrect result is extremely likely. + + +### What happens if beygla does not find a pattern? 
+ +Given a name that has an ending that beygla does not recognize, it will not apply the case to the name. + +Do note that beygla attempts to apply the case to every name (first, last, and middle name) in a full name individually. This means that some names in a full name might have a case applied, and some not. From 8987fa66ca669dc8c2906867369ce3d58a69bdee Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 18:45:40 +0000 Subject: [PATCH 2/8] feat: add getDeclensionForName and test unknown names --- lib/beygla.spec.ts | 46 +++++++++++++++++++++++++++++++++++++++++++--- lib/beygla.ts | 6 ++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/lib/beygla.spec.ts b/lib/beygla.spec.ts index 99e7cdf..3702924 100644 --- a/lib/beygla.spec.ts +++ b/lib/beygla.spec.ts @@ -1,8 +1,8 @@ -import { applyCase as _applyCase } from "./beygla"; +import * as _beygla from "./beygla"; import serializedInput from "./read/serializedInput"; import groupedNames from "../out/grouped-names.json"; -let applyCase = _applyCase; +let beygla = _beygla; const testingBuild = process.env.TEST_BUILD === "true"; if (testingBuild) { @@ -11,9 +11,11 @@ if (testingBuild) { // on the build output. 
console.log("Testing built module."); - applyCase = require("../dist/beygla.esm.js").applyCase; + beygla = require("../dist/beygla.esm.js"); } +const { applyCase, getDeclensionForName } = beygla; + jest.mock("./read/serializedInput", () => { const fs = require("fs"); const path = require("path"); @@ -116,4 +118,42 @@ describe("applyCase", () => { expect(son).toEqual("syni"); expect(dottir).toEqual("dóttur"); }); + + it("finds correct(ish) declension for some unknown names", () => { + const tests: Array<[name: string, declension: string]> = [ + ["Sotti", "1;i,a,a,a"], + ["Sófía", "1;a,u,u,u"], + ["Kórekur", "2;ur,,i,s"], + ["Olivia", "1;a,u,u,u"], + ["Caritas", "0;,,,ar"], + ["Hávarr", "1;r,,i,s"], + ["Ermenga", "1;a,u,u,u"], + ["Fannþór", "0;,,i,s"], + ["Ísbrá", "0;,,,r"], + ["Sófús", "0;,,i,ar"], + ["Kristólín", "0;,,,ar"], + ["Jasper", "0;,,,s"], + ["Rúnel", "0;,,i,s"], + ["Agok", "0;,,i,s"], + ]; + + for (const [name, declension] of tests) { + expect(getDeclensionForName(name)).toEqual(declension); + } + }); + + it("does not find a declension for some unknown names", () => { + const tests: string[] = [ + "Emanuel", + "Frederik", + "Evan", + "Lennon", + "Artemis", + "Kaín", + ]; + + for (const name of tests) { + expect(getDeclensionForName(name)).toEqual(null); + } + }); }); diff --git a/lib/beygla.ts b/lib/beygla.ts index ad531b7..d0a129e 100644 --- a/lib/beygla.ts +++ b/lib/beygla.ts @@ -105,3 +105,9 @@ export function applyCase(caseStr: Case, name: string): string { const names = name.split(/\s+/).filter(Boolean); return names.map((name) => applyCaseToName(caseStr, name)).join(" "); } + +export function getDeclensionForName(name: string): string | null { + if (name.split(/\s+/).length > 1) + throw new Error("Name must not include whitespace"); + return extractDeclension(trie, name); +} From e47f57039f3761c5e8303efce2ea33fd57c80cc2 Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 18:53:44 +0000 Subject: [PATCH 3/8] document correctness on 20 
samples from unknown names --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 52bc0b8..09720cc 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,16 @@ Most Icelandic names (81%), especially common ones, are present on [bin.arnastof This does not mean that Beygla produces an incorrect result for the other 19% of names. Beygla finds patterns in name endings based on the data on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/) and applies those patterns any name. This means that beygla will produce a correct result for most names, even if the name is not in the dataset on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). +I tried randomly sampling 20 names from the list of legal Icelandic names not present in [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/): + + * 14 names matched a pattern with the correct result + * 6 names matched no pattern + * 0 names matched a pattern with an incorrect result + +Even though I happened to get no incorrect results, this is a very small sample. I'm absolutely certain that there are a handful of names that will produce incorrect results. + +See [beygla.spec.ts](https://github.com/alexharri/beygla/blob/master/lib/beygla.spec.ts). + ### Passing a name in the wrong case From 9e03538fc814e0fe06f2a6194546e42945a9ff00 Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 18:58:21 +0000 Subject: [PATCH 4/8] make titles in correctness linkable --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 09720cc..b0a2a35 100644 --- a/README.md +++ b/README.md @@ -191,12 +191,16 @@ Even though I happened to get no incorrect results, this is a very small sample. See [beygla.spec.ts](https://github.com/alexharri/beygla/blob/master/lib/beygla.spec.ts). -### Passing a name in the wrong case +

+Passing a name in the wrong case +

Beygla operates on the assumption that names provided to it are in the nominative case (nefnifall). If a name provided to beygla is in another case than nominative, an incorrect result is extremely likely. -### What happens if beygla does not find a pattern? +

+What happens if beygla does not find a pattern? +

Given a name that has an ending that beygla does not recognize, it will not apply the case to the name. From 9771327f6fde6f8c31c7a137af59d880aea1e34c Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 18:59:12 +0000 Subject: [PATCH 5/8] add id to correctness section title --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b0a2a35..8b36e5a 100644 --- a/README.md +++ b/README.md @@ -172,7 +172,9 @@ applyCase("þgf", " \n Helga Dís\tSmáradóttir \n\n"); //=> "Helgu Dís Smáradóttur" ``` -## Correctness +

+Correctness +

Beygla will correctly apply the desired case to the input name in most cases. From c8f422b1081ec23a12d2e4d846c6540ec1ac8f57 Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 18:59:53 +0000 Subject: [PATCH 6/8] phrasing --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8b36e5a..132daeb 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ Beygla will correctly apply the desired case to the input name in most cases. Most Icelandic names (81%), especially common ones, are present on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). Beygla is guaranteed to produce a correct result for those names. -This does not mean that Beygla produces an incorrect result for the other 19% of names. Beygla finds patterns in name endings based on the data on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/) and applies those patterns any name. This means that beygla will produce a correct result for most names, even if the name is not in the dataset on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). +This does not mean that Beygla produces an incorrect result for the other 19% of names. Beygla finds patterns in name endings based on the data on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/) and applies those patterns to any input name. This means that beygla will produce a correct result for most names, even if the name is not in the dataset on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). 
I tried randomly sampling 20 names from the list of legal Icelandic names not present in [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/): From 12410f11513eecbe7150687afc60a7e9775c3ba8 Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 19:00:04 +0000 Subject: [PATCH 7/8] change word --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 132daeb..c08b1a0 100644 --- a/README.md +++ b/README.md @@ -180,7 +180,7 @@ Beygla will correctly apply the desired case to the input name in most cases. Most Icelandic names (81%), especially common ones, are present on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). Beygla is guaranteed to produce a correct result for those names. -This does not mean that Beygla produces an incorrect result for the other 19% of names. Beygla finds patterns in name endings based on the data on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/) and applies those patterns to any input name. This means that beygla will produce a correct result for most names, even if the name is not in the dataset on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). +This does not mean that Beygla produces an incorrect result for the other 19% of names. Beygla finds patterns in name endings based on the data on [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/) and applies those patterns to any input name. This means that beygla will produce a correct result for most names, even if the name is not in the dataset from [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/). 
I tried randomly sampling 20 names from the list of legal Icelandic names not present in [bin.arnastofnun.is](https://bin.arnastofnun.is/gogn/): From 509ddc254234615798ff19c8657712789ba24c23 Mon Sep 17 00:00:00 2001 From: alexharri Date: Mon, 14 Nov 2022 19:01:24 +0000 Subject: [PATCH 8/8] remove word from test name --- lib/beygla.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/beygla.spec.ts b/lib/beygla.spec.ts index 3702924..a3a959c 100644 --- a/lib/beygla.spec.ts +++ b/lib/beygla.spec.ts @@ -119,7 +119,7 @@ describe("applyCase", () => { expect(dottir).toEqual("dóttur"); }); - it("finds correct(ish) declension for some unknown names", () => { + it("finds correct declension for some unknown names", () => { const tests: Array<[name: string, declension: string]> = [ ["Sotti", "1;i,a,a,a"], ["Sófía", "1;a,u,u,u"],