From 78f95085d47cf747123ecb1de421cd40855a46e7 Mon Sep 17 00:00:00 2001 From: Peter Kraus Date: Thu, 10 Oct 2024 13:51:42 +0200 Subject: [PATCH 1/2] Simplify locale handling. --- .../yadg/dataschema_5_1/filetype.py | 16 +--- .../yadg/dataschema_5_1/stepdefaults.py | 19 ++-- tests/test_dataschema.py | 30 ++++-- tests/test_dataschema/err10_locale_sd.json | 22 +++++ tests/test_dataschema/err9_locale_step.json | 23 +++++ tests/test_dataschema/ts16_locales.json | 93 +++++++++++++++++++ 6 files changed, 175 insertions(+), 28 deletions(-) create mode 100644 tests/test_dataschema/err10_locale_sd.json create mode 100644 tests/test_dataschema/err9_locale_step.json create mode 100644 tests/test_dataschema/ts16_locales.json diff --git a/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py b/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py index e91da8d..796d597 100644 --- a/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py +++ b/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py @@ -4,8 +4,7 @@ from abc import ABC from typing import Optional, Literal, Mapping, Any, TypeVar import tzlocal -import locale -from babel import Locale, UnknownLocaleError +from babel import Locale import logging from .stepdefaults import StepDefaults @@ -32,16 +31,9 @@ def timezone_resolve_localtime(cls, v): @field_validator("locale") @classmethod - def locale_set_default(cls, v): - for loc in (v, locale.getlocale(locale.LC_NUMERIC)[0], locale.getlocale()[0]): - try: - v = str(Locale.parse(loc)) - break - except (TypeError, UnknownLocaleError, ValueError) as e: - logger.debug("Could not process locale '%s': %s", loc, e) - else: - logger.debug("No valid locale string provided. Defaulting to 'en_GB'.") - v = "en_GB" + def locale_validate_default(cls, v): + if v is not None: + v = str(Locale.parse(v)) return v diff --git a/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py b/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py index 3161cee..a71cce8 100644 --- a/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py +++ b/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py @@ -38,13 +38,16 @@ def timezone_resolve_localtime(cls, v): @field_validator("locale") @classmethod def locale_set_default(cls, v): - for loc in (v, locale.getlocale(locale.LC_NUMERIC)[0], locale.getlocale()[0]): - try: - v = str(Locale.parse(loc)) - break - except (TypeError, UnknownLocaleError, ValueError) as e: - logger.debug("Could not process locale '%s': %s", loc, e) + if v is not None: + v = str(Locale.parse(v)) else: - logger.debug("No valid locale string provided. Defaulting to 'en_GB'.") - v = "en_GB" + for loc in (locale.getlocale(locale.LC_NUMERIC)[0], locale.getlocale()[0]): + try: + v = str(Locale.parse(loc)) + break + except (TypeError, UnknownLocaleError, ValueError) as e: + logger.debug("Could not process locale '%s': %s", loc, e) + else: + logger.debug("No valid locale string provided. Defaulting to 'en_GB'.") + v = "en_GB" return v diff --git a/tests/test_dataschema.py b/tests/test_dataschema.py index 0c8f832..d1ec74f 100644 --- a/tests/test_dataschema.py +++ b/tests/test_dataschema.py @@ -5,6 +5,8 @@ from dgbowl_schemas.yadg.dataschema import ExtractorFactory import locale from pydantic import BaseModel +from babel import UnknownLocaleError +from pydantic import ValidationError @pytest.mark.parametrize( @@ -53,6 +55,7 @@ def test_dataschema_metadata_json(inpath, success, datadir): ("ts13_fusion_json.json"), # 5.1 ("ts14_basic_csv.json"), # 5.1 ("ts15_example.json"), # 5.1 + ("ts16_locales.json"), # 5.1 ], ) def test_dataschema_steps_json(inpath, datadir): @@ -80,13 +83,15 @@ def test_dataschema_steps_json(inpath, datadir): ("err6_chromtrace.json"), # 4.0 ("err7_typo.json"), # 4.0 ("err8_metadata.json"), # 5.0 + ("err9_locale_step.json"), # 5.1, check crash on wrong locale + ("err10_locale_sd.json"), # 5.1, check crash on wrong locale in stepdefaults ], ) def test_dataschema_err(inpath, datadir): os.chdir(datadir) with open(inpath, "r") as infile: jsdata = json.load(infile) - with pytest.raises(ValueError) as e: + with pytest.raises((ValueError, UnknownLocaleError)) as e: to_dataschema(**jsdata["input"]) assert jsdata["exception"] in str(e.value) @@ -179,15 +184,24 @@ def test_extractor_factory(input, output): ("en_US", "en_US"), ("en_US.UTF-8", "en_US"), # check parsing with .UTF-8 suffix ("de_DE.windows-1252", "de_DE"), # check parsing with .windows-1252 suffix - # Failures defaulting to en_GB below here - ("en-US", "en_GB"), # check that parsing with "-" fails - ("no_NO", "en_GB"), # no_NO is not a valid locale, nb_NO is - ("English_United States", "en_GB"), # English_United States is a language - ("English (United States)", "en_GB"), # English (United States) is a language - ("Norwegian (Bokmål)", "en_GB"), # Norwegian (Bokmål) is a language - (None, "en_GB"), # Full fallback. + (None, "en_GB"), ], ) def test_stepdefaults_locale(input, output): ret = ExtractorFactory(extractor=dict(filetype="example", locale=input)).extractor assert ret.locale == output + + +@pytest.mark.parametrize( + "input", + [ + "en-US", # check that parsing with "-" fails + "no_NO", # no_NO is not a valid locale, nb_NO is + "English_United States", # English_United States is a language + "English (United States)", # English (United States) is a language + "Norwegian (Bokmål)", # Norwegian (Bokmål) is a language + ], +) +def test_stepdefaults_locale_fail(input): + with pytest.raises((ValidationError, UnknownLocaleError)): + ExtractorFactory(extractor=dict(filetype="example", locale=input)) diff --git a/tests/test_dataschema/err10_locale_sd.json b/tests/test_dataschema/err10_locale_sd.json new file mode 100644 index 0000000..f123725 --- /dev/null +++ b/tests/test_dataschema/err10_locale_sd.json @@ -0,0 +1,22 @@ +{ + "input": { + "version": "5.1", + "metadata": { + "provenance": {"type": "manual"} + }, + "step_defaults": { + "timezone": "Europe/Berlin", + "locale": "wrong" + }, + "steps": [ + { + "tag": "locale-wrong", + "input": {"files": ["test"]}, + "extractor": { + "filetype": "example" + } + } + ] + }, + "exception": "unknown locale 'wrong'" +} \ No newline at end of file diff --git a/tests/test_dataschema/err9_locale_step.json b/tests/test_dataschema/err9_locale_step.json new file mode 100644 index 0000000..47583c2 --- /dev/null +++ b/tests/test_dataschema/err9_locale_step.json @@ -0,0 +1,23 @@ +{ + "input": { + "version": "5.1", + "metadata": { + "provenance": {"type": "manual"} + }, + "step_defaults": { + "timezone": "Europe/Berlin", + "locale": "de_DE" + }, + "steps": [ + { + "tag": "locale-wrong", + "input": {"files": ["test"]}, + "extractor": { + "filetype": "example", + "locale": "wrong" + } + } + ] + }, + "exception": "unknown locale 'wrong'" +} \ No newline at end of file diff --git a/tests/test_dataschema/ts16_locales.json b/tests/test_dataschema/ts16_locales.json new file mode 100644 index 0000000..cf47a69 --- /dev/null +++ b/tests/test_dataschema/ts16_locales.json @@ -0,0 +1,93 @@ +{ + "input": { + "version": "5.1", + "metadata": { + "provenance": {"type": "manual"} + }, + "step_defaults": { + "timezone": "Europe/Berlin", + "locale": "de_DE" + }, + "steps": [ + { + "tag": "defaults", + "input": {"files": ["test"]}, + "extractor": { + "filetype": "example" + } + }, + { + "tag": "locale-en_US", + "input": {"files": ["test"]}, + "extractor": { + "filetype": "example", + "locale": "en_US" + } + }, + { + "tag": "timezone-Zurich", + "input": {"files": ["test"]}, + "extractor": { + "filetype": "example", + "timezone": "Europe/Zurich" + } + } + ] + }, + "output": [ + { + "tag": "defaults", + "input": { + "files": ["test"], + "prefix": null, + "suffix": null, + "contains": null, + "exclude": null + }, + "extractor": { + "encoding": null, + "filetype": "example", + "locale": null, + "timezone": null, + "parameters": {} + }, + "externaldate": null + }, + { + "tag": "locale-en_US", + "input": { + "files": ["test"], + "prefix": null, + "suffix": null, + "contains": null, + "exclude": null + }, + "extractor": { + "encoding": null, + "filetype": "example", + "locale": "en_US", + "timezone": null, + "parameters": {} + }, + "externaldate": null + }, + { + "tag": "timezone-Zurich", + "input": { + "files": ["test"], + "prefix": null, + "suffix": null, + "contains": null, + "exclude": null + }, + "extractor": { + "encoding": null, + "filetype": "example", + "locale": null, + "timezone": "Europe/Zurich", + "parameters": {} + }, + "externaldate": null + } + ] +} \ No newline at end of file From 18b09815ea65fe3efcca83da3ac35f55fb42e80d Mon Sep 17 00:00:00 2001 From: Peter Kraus Date: Thu, 10 Oct 2024 14:22:42 +0200 Subject: [PATCH 2/2] Make sure we don't pass junk locales in stepdefaults. --- src/dgbowl_schemas/yadg/dataschema_5_0/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/dgbowl_schemas/yadg/dataschema_5_0/__init__.py b/src/dgbowl_schemas/yadg/dataschema_5_0/__init__.py index 19a7e82..d34eaa7 100644 --- a/src/dgbowl_schemas/yadg/dataschema_5_0/__init__.py +++ b/src/dgbowl_schemas/yadg/dataschema_5_0/__init__.py @@ -1,5 +1,6 @@ from pydantic import BaseModel, Field from typing import Sequence +from babel import Locale, UnknownLocaleError import logging from .metadata import Metadata from .step import Steps @@ -42,6 +43,13 @@ def update(self): exclude_none=True, exclude_defaults=True ) + # Make sure we only pass locales that are valid + if nsch["step_defaults"].get("locale") is not None: + try: + v = str(Locale.parse(nsch["step_defaults"]["locale"])) + except (TypeError, UnknownLocaleError, ValueError): + del nsch["step_defaults"]["locale"] + for step in self.steps: nstep = { "tag": step.tag,