From eeafdf4f782bbb04080d51f28f8aa8482587e3ab Mon Sep 17 00:00:00 2001
From: Peter Kraus
Date: Tue, 8 Oct 2024 19:02:46 +0200
Subject: [PATCH] More robust processing of Windows locales.

---
Review notes (this section is not part of the commit message):

* DataSchema.step_defaults: the `...` placeholder marks a field as required,
  which contradicts `default_factory`; only `default_factory` is passed now.
* test_stepdefaults_locale: after the supplied value fails to parse, the
  validator retries with the host's LC_NUMERIC locale and default locale
  before settling on 'en_GB'. The fallback cases therefore only passed on
  hosts without a parseable locale. The test now patches `locale.getlocale`
  so that no host locale is reported.
* The post-image blob ids on the `index` lines were not recomputed after the
  two changes above.

 .../yadg/dataschema_5_1/__init__.py           |  2 +-
 .../yadg/dataschema_5_1/filetype.py           | 13 +++++++-----
 .../yadg/dataschema_5_1/stepdefaults.py       |  8 +++++--
 tests/test_dataschema.py                      | 24 ++++++++++++++++++++++++
 4 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py b/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
index 95073a5..1591d3b 100644
--- a/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
+++ b/src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
@@ -20,7 +20,7 @@ class DataSchema(BaseModel, extra="forbid"):
     metadata: Optional[Mapping[str, Any]]
     """Input metadata for :mod:`yadg`."""
 
-    step_defaults: StepDefaults = Field(StepDefaults())
+    step_defaults: StepDefaults = Field(default_factory=StepDefaults)
     """Default values for configuration of each :class:`Step`."""
 
     steps: Sequence[Step]
diff --git a/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py b/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py
index 91cbe38..e91da8d 100644
--- a/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py
+++ b/src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py
@@ -33,11 +33,14 @@ def timezone_resolve_localtime(cls, v):
     @field_validator("locale")
     @classmethod
     def locale_set_default(cls, v):
-        if v is None:
-            v = locale.getlocale(locale.LC_NUMERIC)[0]
-        try:
-            v = str(Locale.parse(v))
-        except (TypeError, UnknownLocaleError):
+        for loc in (v, locale.getlocale(locale.LC_NUMERIC)[0], locale.getlocale()[0]):
+            try:
+                v = str(Locale.parse(loc))
+                break
+            except (TypeError, UnknownLocaleError, ValueError) as e:
+                logger.debug("Could not process locale '%s': %s", loc, e)
+        else:
+            logger.debug("No valid locale string provided. Defaulting to 'en_GB'.")
             v = "en_GB"
         return v
 
diff --git a/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py b/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py
index 66b9a79..3161cee 100644
--- a/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py
+++ b/src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py
@@ -3,6 +3,9 @@
 import locale
 from babel import Locale, UnknownLocaleError
 import tzlocal
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class StepDefaults(BaseModel, extra="forbid"):
@@ -39,8 +42,9 @@ def locale_set_default(cls, v):
             try:
                 v = str(Locale.parse(loc))
                 break
-            except (TypeError, UnknownLocaleError):
-                pass
+            except (TypeError, UnknownLocaleError, ValueError) as e:
+                logger.debug("Could not process locale '%s': %s", loc, e)
         else:
+            logger.debug("No valid locale string provided. Defaulting to 'en_GB'.")
             v = "en_GB"
         return v
diff --git a/tests/test_dataschema.py b/tests/test_dataschema.py
index 1b76e90..2f22209 100644
--- a/tests/test_dataschema.py
+++ b/tests/test_dataschema.py
@@ -170,3 +170,27 @@ def test_extractor_factory(input, output):
     assert ret.locale == output.get("locale")
     assert ret.encoding == output.get("encoding")
     assert ret.timezone is not None
+
+
+@pytest.mark.parametrize(
+    "input, output",
+    [
+        ("en_GB", "en_GB"),
+        ("en_US", "en_US"),
+        ("en_US.UTF-8", "en_US"),  # encoding suffix .UTF-8 is dropped
+        ("de_DE.windows-1252", "de_DE"),  # encoding suffix .windows-1252 is dropped
+        # Inputs below cannot be parsed and fall back to en_GB (host locale hidden)
+        ("en-US", "en_GB"),  # "-" is not accepted as the separator
+        ("no_NO", "en_GB"),  # no_NO is not a valid locale, nb_NO is
+        ("English_United States", "en_GB"),  # Windows-style name, not an identifier
+        ("English (United States)", "en_GB"),  # display name, not an identifier
+        ("Norwegian (Bokmål)", "en_GB"),  # display name, not an identifier
+        (None, "en_GB"),  # nothing supplied: full fallback
+    ],
+)
+def test_stepdefaults_locale(input, output, monkeypatch):
+    # The validator retries with the host's locales before defaulting to en_GB;
+    # report no host locale so the fallback cases do not depend on the machine.
+    monkeypatch.setattr("locale.getlocale", lambda *args: (None, None))
+    ret = ExtractorFactory(extractor=dict(filetype="example", locale=input)).extractor
+    assert ret.locale == output