Skip to content

Commit

Permalink
DataSchema-5.1: More robust processing of Windows locales. (#45)
Browse files Browse the repository at this point in the history
* More robust processing of Windows locales.

* ruff.
  • Loading branch information
PeterKraus authored Oct 8, 2024
1 parent 1638558 commit 6e6b5ee
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/dgbowl_schemas/yadg/dataschema_5_1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class DataSchema(BaseModel, extra="forbid"):
metadata: Optional[Mapping[str, Any]]
"""Input metadata for :mod:`yadg`."""

step_defaults: StepDefaults = Field(StepDefaults())
step_defaults: StepDefaults = Field(..., default_factory=StepDefaults)
"""Default values for configuration of each :class:`Step`."""

steps: Sequence[Step]
Expand Down
13 changes: 8 additions & 5 deletions src/dgbowl_schemas/yadg/dataschema_5_1/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,14 @@ def timezone_resolve_localtime(cls, v):
@field_validator("locale")
@classmethod
def locale_set_default(cls, v):
if v is None:
v = locale.getlocale(locale.LC_NUMERIC)[0]
try:
v = str(Locale.parse(v))
except (TypeError, UnknownLocaleError):
for loc in (v, locale.getlocale(locale.LC_NUMERIC)[0], locale.getlocale()[0]):
try:
v = str(Locale.parse(loc))
break
except (TypeError, UnknownLocaleError, ValueError) as e:
logger.debug("Could not process locale '%s': %s", loc, e)
else:
logger.debug("No valid locale string provided. Defaulting to 'en_GB'.")
v = "en_GB"
return v

Expand Down
8 changes: 6 additions & 2 deletions src/dgbowl_schemas/yadg/dataschema_5_1/stepdefaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
import locale
from babel import Locale, UnknownLocaleError
import tzlocal
import logging

logger = logging.getLogger(__name__)


class StepDefaults(BaseModel, extra="forbid"):
Expand Down Expand Up @@ -39,8 +42,9 @@ def locale_set_default(cls, v):
try:
v = str(Locale.parse(loc))
break
except (TypeError, UnknownLocaleError):
pass
except (TypeError, UnknownLocaleError, ValueError) as e:
logger.debug("Could not process locale '%s': %s", loc, e)
else:
logger.debug("No valid locale string provided. Defaulting to 'en_GB'.")
v = "en_GB"
return v
21 changes: 21 additions & 0 deletions tests/test_dataschema.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,24 @@ def test_extractor_factory(input, output):
assert ret.locale == output.get("locale")
assert ret.encoding == output.get("encoding")
assert ret.timezone is not None


@pytest.mark.parametrize(
    "input, output",
    [
        ("en_GB", "en_GB"),
        ("en_US", "en_US"),
        ("en_US.UTF-8", "en_US"),  # check parsing with .UTF-8 suffix
        ("de_DE.windows-1252", "de_DE"),  # check parsing with .windows-1252 suffix
        # Failures defaulting to en_GB below here
        ("en-US", "en_GB"),  # check that parsing with "-" fails
        ("no_NO", "en_GB"),  # no_NO is not a valid locale, nb_NO is
        # The next three are display names of languages, not locale identifiers.
        ("English_United States", "en_GB"),
        ("English (United States)", "en_GB"),
        ("Norwegian (Bokmål)", "en_GB"),
        (None, "en_GB"),  # Full fallback.
    ],
)
def test_stepdefaults_locale(input, output, monkeypatch):
    """Check that the extractor ``locale`` is normalised or falls back to ``en_GB``.

    Valid locale strings (optionally carrying an encoding suffix) must be
    reduced to their bare ``language_TERRITORY`` form. Strings that cannot be
    parsed, as well as ``None``, must end up as ``"en_GB"``.

    The locale validator tries the system locale reported by
    :func:`locale.getlocale` before resorting to ``"en_GB"``. Without pinning
    that function, the fallback cases would return whatever locale the test
    host happens to be configured with (e.g. ``"en_US"``), so it is patched
    here to report that no locale is set.
    """
    # Imported locally so this test does not rely on a module-level import.
    import locale

    # Make both ``getlocale(LC_NUMERIC)`` and ``getlocale()`` report "unset",
    # which forces the validator through to its hard-coded default.
    monkeypatch.setattr(locale, "getlocale", lambda *args, **kwargs: (None, None))

    ret = ExtractorFactory(extractor=dict(filetype="example", locale=input)).extractor
    assert ret.locale == output

0 comments on commit 6e6b5ee

Please sign in to comment.