From a1a1c45b5a0784626c0c0a00b6b0ff55e4fd2447 Mon Sep 17 00:00:00 2001 From: Levi Gruspe Date: Fri, 21 Apr 2023 17:14:36 +0800 Subject: [PATCH] Label inventories using Glottocodes instead of ISO639-3 --- simphones/inventories.py | 24 +++++++++++++++++------- test/test_inventories.py | 26 +++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/simphones/inventories.py b/simphones/inventories.py index 2e9fcb1..40880b2 100644 --- a/simphones/inventories.py +++ b/simphones/inventories.py @@ -39,8 +39,12 @@ def substitute(phone: Phone) -> Phone: def get_phonological_inventories() -> InventoryDataset: """Get phonological inventories from the PHOIBLE dataset. - The keys of the returned dictionary are ISO 639-3 language codes or "*". - The "*" inventory combines the inventories of every language. + The result is a dictionary with Glottocodes as keys. + In addition, there are three special keys: + + - "*" (combination of the inventories of every language) + - "Djindewal" (doesn't have a Glottocode) + - "ModernAramaic" (doesn't have a Glottocode) """ phoible = Path(__file__).with_name("phoible.csv") @@ -50,7 +54,7 @@ def get_phonological_inventories() -> InventoryDataset: next(rows) # Drop the header. for row in rows: - code = row[2] + glottocode = row[1] raw_phoneme = substitute(row[6]) allophones = parse_allophones(row[7]) @@ -65,8 +69,12 @@ def get_phonological_inventories() -> InventoryDataset: update_inventory(combined_inventory, phoneme, allophones) # Update language inventory. - # NA inventories are not skipped, but they are mushed together into - # one. + # If the language has no Glottocode, use the language name as a key + # instead. + code = glottocode + if code == "NA": + language_name = row[3] + code = language_name.replace(" ", "") language_inventory = inventories.setdefault(code, {}) update_inventory(language_inventory, phoneme, allophones) return inventories @@ -117,8 +125,10 @@ def update_inventory( def get_sounds(language: LanguageCode = "*") -> set[Phone]: """Return set of sounds in the given language. - If no ISO639-3 language code is given, the return value is the combined - inventories of all languages in the PHOIBLE data. + If no Glottocode is given, the return value is the combined inventories of + all languages in the PHOIBLE data. + + Simply returns an empty set if the language code is not in PHOIBLE. """ inventories = get_phonological_inventories() inventory = inventories.get(language, {}) diff --git a/test/test_inventories.py b/test/test_inventories.py index 46e6cfa..506f473 100644 --- a/test/test_inventories.py +++ b/test/test_inventories.py @@ -3,6 +3,7 @@ # See https://www.gnu.org/licenses/gpl-3.0.en.html """Test simphones.inventories.""" from argparse import Namespace +import re import pytest @@ -32,9 +33,28 @@ def test_get_phonological_inventories() -> None: def test_get_phonological_inventories_na() -> None: - """`get_phonological_inventories` should include an NA inventory.""" + """`get_phonological_inventories` shouldn't contain an NA inventory. + + If the Glottocode of a language isn't available, it should use the name of + the language as key instead. + """ + inventories = get_phonological_inventories() + assert "NA" not in inventories + + assert "Djindewal" in inventories + assert "ModernAramaic" in inventories + + +def test_get_phonological_inventories_glottocode_keys() -> None: + """The keys of the return value of `get_phonological_inventories` should be + Glottocodes, except for special keys. + """ + special = {"*", "Djindewal", "ModernAramaic"} inventories = get_phonological_inventories() - assert "NA" in inventories + for glottocode in inventories: + is_glottocode = re.match("[a-z]{4}[0-9]{4}", glottocode) + if not is_glottocode: + assert glottocode in special def test_get_sounds() -> None: @@ -42,7 +62,7 @@ def test_get_sounds() -> None: inventory of the language. """ # Test on a few languages, because it's slow. - languages = ["eng", "tgl"] + languages = ["*", "stan1293", "taga1270"] inventories = get_phonological_inventories() for language in languages: sounds = set(inventories[language])