Skip to content

Commit

Permalink
Label inventories using Glottocodes instead of ISO639-3
Browse files (browse the repository at this point in the history)
  • Loading branch information
lggruspe committed Apr 21, 2023
1 parent efef468 commit a1a1c45
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 10 deletions.
24 changes: 17 additions & 7 deletions simphones/inventories.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,12 @@ def substitute(phone: Phone) -> Phone:
def get_phonological_inventories() -> InventoryDataset:
"""Get phonological inventories from the PHOIBLE dataset.
The keys of the returned dictionary are ISO 639-3 language codes or "*".
The "*" inventory combines the inventories of every language.
The result is a dictionary with Glottocodes as keys.
In addition, there are three special keys:
- "*" (combination of the inventories of every language)
- "Djindewal" (doesn't have a Glottocode)
- "ModernAramaic" (doesn't have a Glottocode)
"""
phoible = Path(__file__).with_name("phoible.csv")

Expand All @@ -50,7 +54,7 @@ def get_phonological_inventories() -> InventoryDataset:
next(rows) # Drop the header.

for row in rows:
code = row[2]
glottocode = row[1]
raw_phoneme = substitute(row[6])
allophones = parse_allophones(row[7])

Expand All @@ -65,8 +69,12 @@ def get_phonological_inventories() -> InventoryDataset:
update_inventory(combined_inventory, phoneme, allophones)

# Update language inventory.
# NA inventories are not skipped, but they are mushed together into
# one.
# If the language has no Glottocode, use the language name as a key
# instead.
code = glottocode
if code == "NA":
language_name = row[3]
code = language_name.replace(" ", "")
language_inventory = inventories.setdefault(code, {})
update_inventory(language_inventory, phoneme, allophones)
return inventories
Expand Down Expand Up @@ -117,8 +125,10 @@ def update_inventory(
def get_sounds(language: LanguageCode = "*") -> set[Phone]:
"""Return set of sounds in the given language.
If no ISO639-3 language code is given, the return value is the combined
inventories of all languages in the PHOIBLE data.
If no Glottocode is given, the return value is the combined inventories of
all languages in the PHOIBLE data.
Simply returns an empty set if the language code is not in PHOIBLE.
"""
inventories = get_phonological_inventories()
inventory = inventories.get(language, {})
Expand Down
26 changes: 23 additions & 3 deletions test/test_inventories.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# See https://www.gnu.org/licenses/gpl-3.0.en.html
"""Test simphones.inventories."""
from argparse import Namespace
import re

import pytest

Expand Down Expand Up @@ -32,17 +33,36 @@ def test_get_phonological_inventories() -> None:


def test_get_phonological_inventories_na() -> None:
"""`get_phonological_inventories` should include an NA inventory."""
"""`get_phonological_inventories` shouldn't contain an NA inventory.
If the Glottocode of a language isn't available, it should use the name of
the language as key instead.
"""
inventories = get_phonological_inventories()
assert "NA" not in inventories

assert "Djindewal" in inventories
assert "ModernAramaic" in inventories


def test_get_phonological_inventories_glottocode_keys() -> None:
"""The keys of the return value of `get_phonological_inventories` should be
Glottocodes, except for special keys.
"""
special = {"*", "Djindewal", "ModernAramaic"}
inventories = get_phonological_inventories()
assert "NA" in inventories
for glottocode in inventories:
is_glottocode = re.match("[a-z]{4}[0-9]{4}", glottocode)
if not is_glottocode:
assert glottocode in special


def test_get_sounds() -> None:
"""`get_sounds(language)` should be the same as the keys in the sound
inventory of the language.
"""
# Test on a few languages, because it's slow.
languages = ["eng", "tgl"]
languages = ["*", "stan1293", "taga1270"]
inventories = get_phonological_inventories()
for language in languages:
sounds = set(inventories[language])
Expand Down

0 comments on commit a1a1c45

Please sign in to comment.