diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34cbabc..4645d13 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,8 +1,17 @@ name: CI on: - workflow_dispatch: pull_request: + push: + workflow_dispatch: + +permissions: + checks: write + contents: write + # deployments permission to deploy GitHub pages website + deployments: write + pull-requests: write + jobs: python-unit: @@ -18,7 +27,7 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: 'pip' @@ -28,7 +37,90 @@ jobs: run: | python -m pip install --upgrade pip pip install .[test] - + + - name: Check Python linting (Ruff) + run: ruff check --output-format=github + + - name: Check Python formatting (Ruff) + run: ruff format --check + - name: Run unit tests run: | - pytest + pytest --junitxml=junit_pytest_main.xml --cov-report=term-missing:skip-covered + mv .coverage .coverage_main + + - name: Run Django integration tests + working-directory: ./edtf_django_tests + run: | + pytest edtf_integration/tests.py --ds=edtf_django_tests.settings --junitxml=../junit_pytest_django.xml --cov-report=term-missing:skip-covered + mv .coverage ../.coverage_django + + - name: Combine coverage reports + run: | + coverage combine .coverage_main .coverage_django + coverage report --omit="edtf_django_tests/*" + coverage xml -o coverage_combined.xml --omit="edtf_django_tests/*" + + - name: Combine JUnit XML reports + run: | + python combine_junit.py combined_junit_pytest.xml junit_pytest_main.xml junit_pytest_django.xml + + - name: Pytest coverage comment + id: coverageComment + uses: MishaKav/pytest-coverage-comment@main + with: + pytest-xml-coverage-path: ./coverage_combined.xml + junitxml-path: ./combined_junit_pytest.xml + unique-id-for-comment: ${{ matrix.python-version }} + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: 
Check the output coverage + run: | + echo "Coverage Percentage - ${{ steps.coverageComment.outputs.coverage }}" + echo "Coverage Color - ${{ steps.coverageComment.outputs.color }}" + echo "Coverage Html - ${{ steps.coverageComment.outputs.coverageHtml }}" + echo "Summary Report -" ${{ steps.coverageComment.outputs.summaryReport }} + echo "Coverage Warnings - ${{ steps.coverageComment.outputs.warnings }}" + echo "Coverage Errors - ${{ steps.coverageComment.outputs.errors }}" + echo "Coverage Failures - ${{ steps.coverageComment.outputs.failures }}" + echo "Coverage Skipped - ${{ steps.coverageComment.outputs.skipped }}" + echo "Coverage Tests - ${{ steps.coverageComment.outputs.tests }}" + echo "Coverage Time - ${{ steps.coverageComment.outputs.time }}" + echo "Not Success Test Info - ${{ steps.coverageComment.outputs.notSuccessTestInfo }}" + + - name: Run benchmarks + run: | + pytest -m benchmark --benchmark-json=./output.json + + - name: Download previous benchmark data + uses: actions/cache@v4 + with: + path: ./cache + key: ${{ runner.os }}-benchmark + + - name: Publish benchmark results + uses: benchmark-action/github-action-benchmark@v1 + if: github.event_name == 'pull_request' && github.repository == 'ixc/python-edtf' + with: + tool: 'pytest' + auto-push: true + comment-always: true + output-file-path: output.json + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-on-alert: true + save-data-file: true + summary-always: true + + - name: Comment on benchmark results without publishing + if: github.event_name != 'pull_request' || github.repository != 'ixc/python-edtf' + uses: benchmark-action/github-action-benchmark@v1 + with: + tool: 'pytest' + auto-push: false + github-token: ${{ secrets.GITHUB_TOKEN }} + comment-always: true + output-file-path: output.json + comment-on-alert: false + save-data-file: true + summary-always: true + external-data-json-path: ./cache/benchmark-data.json diff --git a/.github/workflows/coverage_readme.yml 
b/.github/workflows/coverage_readme.yml new file mode 100644 index 0000000..86309de --- /dev/null +++ b/.github/workflows/coverage_readme.yml @@ -0,0 +1,68 @@ +name: Update Coverage on Readme +on: + push: + branches: + - main + +# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs +# `contents` is for permission to the contents of the repository. +# `pull-requests` is for permission to pull request +permissions: + contents: write + checks: write + pull-requests: write + +# see: https://github.com/MishaKav/pytest-coverage-comment +jobs: + update-coverage-on-readme: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 0 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: 3.12 + cache: 'pip' + cache-dependency-path: '**/pyproject.toml' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[test] + + - name: Run tests and generate coverage + run: | + pytest + mv .coverage .coverage_main + cd edtf_django_tests + coverage run manage.py test edtf_integration + mv .coverage ../.coverage_django + cd .. 
+ coverage combine .coverage_main .coverage_django + coverage report --omit="edtf_django_tests/*" + coverage xml -o coverage_combined.xml --omit="edtf_django_tests/*" + + - name: Pytest coverage comment + if: ${{ github.ref == 'refs/heads/main' }} + id: coverageComment + uses: MishaKav/pytest-coverage-comment@main + with: + pytest-xml-coverage-path: ./coverage_combined.xml + hide-comment: true + + - name: Update Readme with Coverage Html + if: ${{ github.ref == 'refs/heads/main' }} + run: | + sed -i '/<!-- Pytest Coverage Comment:Begin -->/,/<!-- Pytest Coverage Comment:End -->/c\<!-- Pytest Coverage Comment:Begin -->\n\${{ steps.coverageComment.outputs.coverageHtml }}\n<!-- Pytest Coverage Comment:End -->' ./README.md + + - name: Commit & Push changes to README + run: | + git config --global user.name 'github-actions[bot]' + git config --global user.email 'github-actions[bot]@users.noreply.github.com' + git add README.md + git diff --cached --quiet || git commit -m 'Update coverage badge in README' + git push "https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git" "HEAD:${{ github.ref }}" diff --git a/.gitignore b/.gitignore index ba74660..36df893 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ __pycache__/ # Distribution / packaging .Python env/ +venv/ build/ develop-eggs/ dist/ @@ -41,7 +42,14 @@ htmlcov/ .cache nosetests.xml coverage.xml +coverage_combined.xml +.coverage_main +.coverage_django *,cover +combined_junit_pytest.xml +pytest.xml +junit_pytest_main.xml +junit_pytest_django.xml # Translations *.mo @@ -49,6 +57,7 @@ coverage.xml # Django stuff: *.log +db.sqlite3 # Sphinx documentation docs/_build/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ff6df15 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,22 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: end-of-file-fixer + exclude: "business-facing/layer" + - id: trailing-whitespace + exclude: "business-facing/layer" + - id: check-yaml + exclude: "business-facing/layer" + - id: check-json + exclude: "business-facing/layer" + + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. 
+ rev: v0.4.4 + hooks: + # Run the linter, and enable lint fixes + - id: ruff + args: [ --fix ] + # Run the formatter. + - id: ruff-format diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e3377a3..0000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -language: python -sudo: false -cache: pip -python: -- '3.6' -- '2.7' -before_install: -- pip install nose coverage 'django<2' -script: -- nosetests --verbose --with-coverage --cover-package=edtf -after_success: -- coverage report diff --git a/LICENSE b/LICENSE index 756b6a4..f697a39 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,7 @@ The MIT License (MIT) Copyright (c) 2015 The Interaction Consortium +Copyright (c) 2023 SAW Leipzig Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 8f5c13c..7542c36 100644 --- a/README.md +++ b/README.md @@ -1,76 +1,99 @@ -python-edtf -=========== +# python-edtf -An implementation of EDTF format in Python, together with utility functions for parsing natural language date texts, and converting EDTF dates to related Python `date` objects. +<!-- Pytest Coverage Comment:Begin --> +<!-- Pytest Coverage Comment:End --> -See http://www.loc.gov/standards/datetime/ for the current draft specification. +An implementation of EDTF format in Python, together with utility functions for parsing natural language date texts, and converting EDTF dates to related Python `date` or `struct_time` objects. + +See <https://www.loc.gov/standards/datetime/> for the final draft specification. 
+ +This project is based on python-edtf and was developed to include the newest specification ## To install - pip install edtf +```shell +pip install edtf +``` ## To use - >>> from edtf import parse_edtf - # Parse an EDTF string to an EDTFObject - >>> e = parse_edtf("1979-08~") # approx August 1979 - >>> e - UncertainOrApproximate: '1979-08~' - # normalised string representation (some different EDTF strings have identical meanings) - >>> unicode(e) - u'1979-08~' - - # Derive Python date objects - # lower and upper bounds that strictly adhere to the given range - >>> e.lower_strict()[:3], e.upper_strict()[:3] - ((1979, 8, 1), (1979, 8, 31)) - # lower and upper bounds that are padded if there's indicated uncertainty - >>> e.lower_fuzzy()[:3], e.upper_fuzzy()[:3] - ((1979, 7, 1), (1979, 9, 30)) - - # Date intervals - >>> interval = parse_edtf("1979-08~/open") - >>> interval - Level1Interval: '1979-08~/open' - # Intervals have lower and upper EDTF objects. - >>> interval.lower, interval.upper - (UncertainOrApproximate: '1979-08~', UncertainOrApproximate: 'open') - >>> interval.lower.upper_strict()[:3] - (1979, 8, 31) - >>> interval.upper.lower_strict() # 'open' is interpreted to mean 'still happening'. 
- [Today's date] - - # Date collections - >>> coll = parse_edtf('{1667,1668, 1670..1672}') - >>> coll - MultipleDates: '{1667, 1668, 1670..1672}' - >>> coll.objects - (Date: '1667', Date: '1668', Consecutives: '1670..1672') +```python +>>> from edtf import parse_edtf + +# Parse an EDTF string to an EDTFObject +>>> +>>> e = parse_edtf("1979-08~") # approx August 1979 +>>> e +UncertainOrApproximate: '1979-08~' + +# normalised string representation (some different EDTF strings have identical meanings) +>>> +>>> unicode(e) +u'1979-08~' + +# Derive Python date objects + +# lower and upper bounds that strictly adhere to the given range +>>> +>>> e.lower_strict()[:3], e.upper_strict()[:3] +((1979, 8, 1), (1979, 8, 31)) + +# lower and upper bounds that are padded if there's indicated uncertainty +>>> +>>> e.lower_fuzzy()[:3], e.upper_fuzzy()[:3] +((1979, 7, 1), (1979, 9, 30)) + +# Date intervals +>>> +>>> interval = parse_edtf("1979-08~/..") +>>> interval +Level1Interval: '1979-08~/..' + +# Intervals have lower and upper EDTF objects +>>> +>>> interval.lower, interval.upper +(UncertainOrApproximate: '1979-08~', UnspecifiedIntervalSection: '..') +>>> interval.lower.lower_strict()[:3], interval.lower.upper_strict()[:3] +((1979, 8, 1), (1979, 8, 31)) +>>> interval.upper.upper_strict() # '..' is interpreted to mean open interval and is returning -/+ math.inf +math.inf + +# Date collections +>>> +>>> coll = parse_edtf('{1667,1668, 1670..1672}') +>>> coll +MultipleDates: '{1667, 1668, 1670..1672}' +>>> coll.objects +(Date: '1667', Date: '1668', Consecutives: '1670..1672') +``` The object returned by `parse_edtf()` is an instance of an `edtf.parser.parser_classes.EDTFObject` subclass, depending on the type of date that was parsed. 
These classes are: - # Level 0 - Date - DateAndTime - Interval - - # Level 1 - UncertainOrApproximate - Unspecified - Level1Interval - LongYear - Season - - # Level 2 - PartialUncertainOrApproximate - PartialUnspecified - OneOfASet - MultipleDates - MaskedPrecision - Level2Interval - ExponentialYear - -All of these implement `upper/lower_strict/fuzzy()` methods to derive Python `date` objects. +```text +# Level 0 +Date +DateAndTime +Interval + +# Level 1 +UncertainOrApproximate +Unspecified +Level1Interval +UnspecifiedIntervalSection +LongYear +Season + +# Level 2 +PartialUncertainOrApproximate +PartialUnspecified +OneOfASet +MultipleDates +Level2Interval +Level2Season +ExponentialYear +``` + +All of these implement `upper/lower_strict/fuzzy()` methods to derive `struct_time` objects, except of UnspecifiedIntervalSection, that can also return math.inf value The `*Interval` instances have `upper` and `lower` properties that are themselves `EDTFObject` instances. @@ -86,163 +109,209 @@ Test coverage includes every example given in the spec table of features. 
* Date: - >>> parse_edtf('1979-08') # August 1979 - Date: '1979-08' +```python +>>> parse_edtf('1979-08') # August 1979 +Date: '1979-08' +``` * Date and Time: - >>> parse_edtf('2004-01-01T10:10:10+05:00') - DateAndTime: '2004-01-01T10:10:10+05:00' +```python +>>> parse_edtf('2004-01-01T10:10:10+05:00') +DateAndTime: '2004-01-01T10:10:10+05:00' +``` * Interval (start/end): - >>> parse_edtf('1979-08-28/1979-09-25') # From August 28 to September 25 1979 - Interval: '1979-08-28/1979-09-25' +```python +>>> parse_edtf('1979-08-28/1979-09-25') # From August 28 to September 25 1979 +Interval: '1979-08-28/1979-09-25' +``` ### Level 1 Extensions * Uncertain/Approximate dates: - >>> parse_edtf('1979-08-28~') # Approximately August 28th 1979 - UncertainOrApproximate: '1979-08-28~' +```python +>>> parse_edtf('1979-08-28~') # Approximately August 28th 1979 +UncertainOrApproximate: '1979-08-28~' +``` * Unspecified dates: - >>> parse_edtf('1979-08-uu') # An unknown day in August 1979 - Unspecified: '1979-08-uu' - >>> parse_edtf('1979-uu') # Some month in 1979 - Unspecified: '1979-uu' +```python +>>> parse_edtf('1979-08-XX') # An unknown day in August 1979 +Unspecified: '1979-08-XX' +>>> parse_edtf('1979-XX') # Some month in 1979 +Unspecified: '1979-XX' +``` * Extended intervals: - >>> parse_edtf('1984-06-02?/2004-08-08~') - Level1Interval: '1984-06-02?/2004-08-08~' +```python +>>> parse_edtf('1984-06-02?/2004-08-08~') +Level1Interval: '1984-06-02?/2004-08-08~' +``` * Years exceeding four digits: - >>> parse_edtf('y-12000') # 12000 years BCE - LongYear: 'y-12000' +```python +>>> parse_edtf('Y-12000') # 12000 years BCE +LongYear: 'Y-12000' +``` * Season: - >>> parse_edtf('1979-22') # Summer 1979 - Season: '1979-22' +```python +>>> parse_edtf('1979-22') # Summer 1979 +Season: '1979-22' +``` ### Level 2 Extensions * Partial uncertain/approximate: - >>> parse_edtf('(2011)-06-04~') # year certain, month/day approximate. 
- # Note that the result text is normalized - PartialUncertainOrApproximate: '2011-(06-04)~' +```python +>>> parse_edtf('2004-06~-11') # year certain, month/day approximate. +PartialUncertainOrApproximate: '2004-06~-11' +``` * Partial unspecified: - >>> parse_edtf('1979-uu-28') # The 28th day of an uncertain month in 1979 - PartialUnspecified: '1979-uu-28' +```python +>>> parse_edtf('1979-XX-28') # The 28th day of an uncertain month in 1979 +PartialUnspecified: '1979-XX-28' +``` * One of a set: - >>> parse_edtf("[..1760-12-03,1762]") - OneOfASet: '[..1760-12-03, 1762]' +```python +>>> parse_edtf("[..1760-12-03,1762]") +OneOfASet: '[..1760-12-03, 1762]' +``` * Multiple dates: - >>> parse_edtf('{1667,1668, 1670..1672}') - MultipleDates: '{1667, 1668, 1670..1672}' - -* Masked precision: - - >>> parse_edtf('197x') # A date in the 1970s. - MaskedPrecision: '197x' +```python +>>> parse_edtf('{1667,1668, 1670..1672}') +MultipleDates: '{1667, 1668, 1670..1672}' +``` * Level 2 Extended intervals: - >>> parse_edtf('2004-06-(01)~/2004-06-(20)~') - Level2Interval: '2004-06-(01)~/2004-06-(20)~' +```python +>>> parse_edtf('2004-06-~01/2004-06-~20') +Level2Interval: '2004-06-~01/2004-06-~20' +``` * Year requiring more than 4 digits - exponential form: - >>> parse_edtf('y-17e7') - ExponentialYear: 'y-17e7' +```python +>>> e = parse_edtf('Y-17E7') +ExponentialYear: 'Y-17E7' +>>> e.estimated() +-170000000 +``` + +* Significant digits: + +```python +# '1950S2': some year between 1900 and 1999, estimated to be 1950 +>>> d = parse_edtf('1950S2') +Date: '1950S2' +>>> d.lower_fuzzy()[:3] +(1900, 1, 1) +>>> d.upper_fuzzy()[:3] +(1999, 12, 31) +# 'Y171010000S3': some year between 171000000 and 171999999 estimated to be 171010000, with 3 significant digits. 
+>>> l = parse_edtf('Y171010000S3') +LongYear: 'Y171010000S3' +>>> l.estimated() +171010000 +>>> l.lower_fuzzy()[:3] +(171000000, 1, 1) +>>> l.upper_fuzzy()[:3] +(171999999, 12, 31) +# 'Y3388E2S3': some year in exponential notation between 338000 and 338999, estimated to be 338800 +>>> e = parse_edtf('Y3388E2S3') +ExponentialYear: 'Y3388E2S3S3' +>>> e.estimated() +338800 +>>> e.lower_fuzzy()[:3] +(338000, 1, 1) +>>> e.upper_fuzzy()[:3] +(338999, 12, 31) +``` ### Natural language representation - The library includes a basic English natural language parser (it's not yet smart enough to work with occasions such as 'Easter', or in other languages): - >>> from edtf import text_to_edtf - >>> text_to_edtf("circa August 1979") - '1979-08~' +```python +>>> from edtf import text_to_edtf +>>> text_to_edtf("circa August 1979") +'1979-08~' +``` Note that the result is a string, not an `ETDFObject`. The parser can parse strings such as: - 'January 12, 1940' => '1940-01-12' - '90' => '1990' #implied century - 'January 2008' => '2008-01' - 'the year 1800' => '1800' - '10/7/2008' => '2008-10-07' # in a full-specced date, assume US ordering - - # uncertain/approximate - '1860?' => '1860?' - '1862 (uncertain)' => '1862?' - 'circa Feb 1812' => '1812-02~' - 'c.1860' => '1860~' #with or without . - 'ca1860' => '1860~' - 'approx 1860' => '1860~' - - # masked precision - '1860s' => '186x' #186x has decade precision, 186u has year precision. - '1800s' => '18xx' # without uncertainty indicators, assume century - - # masked precision + uncertainty - 'ca. 1860s' => '186x~' - 'circa 1840s' => '184x~' - 'ca. 1860s?' => '186x?~' - 'c1800s?' 
=> '180x?~' # with uncertainty indicators, use the decade - - # unspecified parts - 'January 12' => 'uuuu-01-12' - 'January' => 'uuuu-01' - '7/2008' => '2008-07' - - #seasons - 'Autumn 1872' => '1872-23' - 'Fall 1872' => '1872-23' - - # before/after - 'earlier than 1928' => 'unknown/1928' - 'later than 1928' => '1928/unknown' - 'before January 1928' => 'unknown/1928-01' - 'after about the 1920s' => '192x~/unknown' - - # unspecified - 'year in the 1860s' => '186u' #186x has decade precision, 186u has year precision. - ('year in the 1800s', '18xu') - 'month in 1872' => '1872-uu' - 'day in January 1872' => '1872-01-uu' - 'day in 1872' => '1872-uu-uu' - - #centuries - '1st century' => '00xx' - '10c' => '09xx' - '19th century?' => '18xx?' - - # just showing off now... - 'a day in about Spring 1849?' => '1849-21-uu?~' - - # simple ranges, which aren't as accurate as they could be. The parser is - limited to only picking the first year range it finds. - '1851-1852' => '1851/1852' - '1851-1852; printed 1853-1854' => '1851/1852' - '1851-52' => '1851/1852' - '1856-ca. 1865' => '1856/1865~' - '1860s-1870s' => '186x/187x' - '1920s -early 1930s' => '192x/193x' - '1938, printed 1940s-1950s' => '1938' - +```text +'January 12, 1940' => '1940-01-12' +'90' => '1990' #implied century +'January 2008' => '2008-01' +'the year 1800' => '1800' +'10/7/2008' => '2008-10-07' # in a full-specced date, assume US ordering + +# uncertain/approximate +'1860?' => '1860?' +'1862 (uncertain)' => '1862?' +'circa Feb 1812' => '1812-02~' +'c.1860' => '1860~' #with or without . +'ca1860' => '1860~' +'approx 1860' => '1860~' +'ca. 1860s' => '186X~' +'circa 1840s' => '184X~' +'ca. 1860s?' => '186X?~' +'c1800s?' 
=> '180X?~' # with uncertainty indicators, use the decade + +# unspecified parts +'January 12' => 'XXXX-01-12' +'January' => 'XXXX-01' +'7/2008' => '2008-07' +'month in 1872' => '1872-XX' +'day in January 1872' => '1872-01-XX' +'day in 1872' => '1872-XX-XX' + +#seasons +'Autumn 1872' => '1872-23' +'Fall 1872' => '1872-23' + +# before/after +'earlier than 1928' => '/1928' +'later than 1928' => '1928/' +'before January 1928' => '/1928-01' +'after about the 1920s' => '192X~/' + +#centuries +'1st century' => '00XX' +'10c' => '09XX' +'19th century?' => '18XX?' + +# just showing off now... +'a day in about Spring 1849?' => '1849-21-XX?~' + +# simple ranges, which aren't as accurate as they could be. The parser is +limited to only picking the first year range it finds. +'1851-1852' => '1851/1852' +'1851-1852; printed 1853-1854' => '1851/1852' +'1851-52' => '1851/1852' +'1856-ca. 1865' => '1856/1865~' +'1860s-1870s' => '186X/187X' +'1920s - early 1930s' => '192X/193X' +'1938, printed 1940s-1950s' => '1938' +``` Generating natural text from an EDTF representation is a future goal. @@ -256,13 +325,10 @@ Generating natural text from an EDTF representation is a future goal. * If a natural language groups dates with a '/', it's interpreted as "or" rather than "and". The resulting EDTF text is a list bracketed by `[]` ("one of these dates") rather than `{}` (all of these dates). - ## Converting to and from Python dates - Since EDTF dates are often regions, and often imprecise, we need to use a few different Python dates, depending on the circumstance. Generally, Python dates are used for sorting and filtering, and are not displayed directly to users. - ### `struct_time` date representation Because Python's `datetime` module does not support dates out side the range 1 AD to 9999 AD we return dates as `time.struct_time` objects by default instead of the `datetime.date` or `datetime.datetime` objects you might expect. 
@@ -271,7 +337,8 @@ The `struct_time` representation is more difficult to work with, but can be sort If you are sure you are working with dates within the range supported by Python's `datetime` module, you can get these more convenient objects using the `edtf.struct_time_to_date` and `edtf.struct_time_to_datetime` functions. -NOTE: This library previously did return `date` and `datetime` objects from methods by default before we switched to `struct_time`. See ticket https://github.com/ixc/python-edtf/issues/26. +> [!NOTE] +> This library previously did return `date` and `datetime` objects from methods by default before we switched to `struct_time`. See ticket <https://github.com/ixc/python-edtf/issues/26>. ### `lower_strict` and `upper_strict` @@ -279,26 +346,27 @@ These dates indicate the earliest and latest dates that are __strictly__ in the In an ascending sort (most recent last), sort by `lower_strict` to get a natural sort order. In a descending sort (most recent first), sort by `upper_strict`: - >>> e = parse_edtf('1912-04~') +```python +>>> e = parse_edtf('1912-04~') - >>> e.lower_strict() # Returns struct_time - >>> time.struct_time(tm_year=1912, tm_mon=4, tm_mday=1, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=0, tm_isdst=-1) +>>> e.lower_strict() # Returns struct_time +time.struct_time(tm_year=1912, tm_mon=4, tm_mday=1, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=0, tm_isdst=-1) - >>> e.lower_strict()[:3] # Show only interesting parts of struct_time - (1912, 4, 01) +>>> e.lower_strict()[:3] # Show only interesting parts of struct_time +(1912, 4, 1) - >>> from edtf import struct_time_to_date - >>> struct_time_to_date(e.lower_strict()) # Convert to date - datetime.date(1912, 4, 01) +>>> from edtf import struct_time_to_date +>>> struct_time_to_date(e.lower_strict()) # Convert to date +datetime.date(1912, 4, 1) - >>> e.upper_strict()[:3] - (1912, 4, 30) +>>> e.upper_strict()[:3] +(1912, 4, 30) - >>> struct_time_to_date(e.upper_strict()) - datetime.date(1912, 4, 30) +>>> 
struct_time_to_date(e.upper_strict()) +datetime.date(1912, 4, 30) +``` ### `lower_fuzzy` and `upper_fuzzy` ------------------------------------ These dates indicate the earliest and latest dates that are __possible__ in the date range, for a fairly arbitrary definition of 'possibly'. @@ -306,29 +374,81 @@ These values are useful for filtering results - i.e. testing which EDTF dates mi The fuzzy dates are derived from the strict dates, plus or minus a level of padding that depends on how precise the date specfication is. For the case of approximate or uncertain dates, we (arbitrarily) pad the ostensible range by 100% of the uncertain timescale, or by a 12 weeks in the case of seasons. That is, if a date is approximate at the month scale, it is padded by a month. If it is approximate at the year scale, it is padded by a year: - >>> e = parse_edtf('1912-04~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a month - (1912, 3, 1) - >>> e.upper_fuzzy()[:3] - (1912, 5, 30) +```python +>>> e = parse_edtf('1912-04~') +>>> e.lower_fuzzy()[:3] # padding is 100% of a month +(1912, 3, 1) +>>> e.upper_fuzzy()[:3] +(1912, 5, 30) - >>> e = parse_edtf('1912~') - >>> e.lower_fuzzy()[:3] # padding is 100% of a year - (1911, 1, 1) - >>> e.upper_fuzzy()[:3] - (1913, 12, 31) +>>> e = parse_edtf('1912~') +>>> e.lower_fuzzy()[:3] # padding is 100% of a year +(1911, 1, 1) +>>> e.upper_fuzzy()[:3] +(1913, 12, 31) +``` One can interpret uncertain or approximate dates as 'plus or minus a [level of precision]'. If a date is both uncertain __and__ approximate, the padding is applied twice, i.e. it gets 100% * 2 padding, or 'plus or minus two [levels of precision]'. 
+### Qualification properties + +EDTF objects support properties that provide an overview of how the object is qualified: + +* `.is_uncertain (?)` +* `.is_approximate (~)` +* `.is_uncertain_and_approximate (%)` + +These properties represent whether any part of the date object is uncertain, approximate, or uncertain and approximate. For ranges, the properties are true if any part of the range (lower or upper section) is qualified as such. A date is not necessarily uncertain and approximate if it is separately both uncertain and approximate - it must have the "%" qualifier to be considered uncertain and approximate. + +```python +>>> parse_edtf("2006-06-11") +Date: '2006-06-11' +>>> parse_edtf("2006-06-11").is_uncertain +False +>>> parse_edtf("2006-06-11").is_approximate +False + +>>> parse_edtf("1984?") +UncertainOrApproximate: '1984?' +>>> parse_edtf("1984?").is_approximate +False +>>> parse_edtf("1984?").is_uncertain +True +>>> parse_edtf("1984?").is_uncertain_and_approximate +False + +>>> parse_edtf("1984%").is_uncertain +False +>>> parse_edtf("1984%").is_uncertain_and_approximate +True + +>>> parse_edtf("1984~/2004-06") +Level1Interval: '1984~/2004-06' +>>> parse_edtf("1984~/2004-06").is_approximate +True +>>> parse_edtf("1984~/2004-06").is_uncertain +False + +>>> parse_edtf("2004?-~06-~04") +PartialUncertainOrApproximate: '2004?-~06-~04' +>>> parse_edtf("2004?-~06-~04").is_approximate +True +>>> parse_edtf("2004?-~06-~04").is_uncertain +True +>>> parse_edtf("2004?-~06-~04").is_uncertain_and_approximate +False +``` + ### Seasons -Seasons are interpreted as Northern Hemisphere by default. To change this, override the month mapping in `appsettings.py`. +> [!IMPORTANT] +> Seasons are interpreted as Northern Hemisphere by default. To change this, override the month mapping in [`appsettings.py`](edtf/appsettings.py). ### Comparisons -Two EDTF dates are considered equal if their unicode() representations are the same. 
An EDTF date is considered greater than another if its `lower_strict` value is later. +Two EDTF dates are considered equal if their `unicode()` representations are the same. An EDTF date is considered greater than another if its `lower_strict` value is later. ## Django ORM field @@ -338,36 +458,64 @@ To store a natural language value on your model, define another field, and set t When your model is saved, the `natural_text_field` value will be parsed to set the `date_edtf` value, and the underlying EDTF object will set the `_earliest` and `_latest` fields on the model to a float value representing the Julian Date. - -**WARNING**: The conversion to and from Julian Date numerical values can be inaccurate, especially for ancient dates back to thousands of years BC. Ideally Julian Date values should be used for range and ordering operations only where complete accuracy is not required. They should **not** be used for definitive storage or for display after roundtrip conversions. +> [!WARNING] +> The conversion to and from Julian Date numerical values can be inaccurate, especially for ancient dates back to thousands of years BC. Ideally Julian Date values should be used for range and ordering operations only where complete accuracy is not required. They should __not__ be used for definitive storage or for display after roundtrip conversions. 
Example usage: - from django.db import models - from edtf.fields import EDTFField - - class MyModel(models.Model): - date_display = models.CharField( - "Date of creation (display)", - blank=True, - max_length=255, - ) - date_edtf = EDTFField( - "Date of creation (EDTF)", - natural_text_field='date_display', - lower_fuzzy_field='date_earliest', - upper_fuzzy_field='date_latest', - lower_strict_field='date_sort_ascending', - upper_strict_field='date_sort_descending', - blank=True, - null=True, - ) - # use for filtering - date_earliest = models.FloatField(blank=True, null=True) - date_latest = models.FloatField(blank=True, null=True) - # use for sorting - date_sort_ascending = models.FloatField(blank=True, null=True) - date_sort_descending = models.FloatField(blank=True, null=True) - +```python +from django.db import models +from edtf.fields import EDTFField + +class MyModel(models.Model): + date_display = models.CharField( + "Date of creation (display)", + blank=True, + max_length=255, + ) + date_edtf = EDTFField( + "Date of creation (EDTF)", + natural_text_field='date_display', + lower_fuzzy_field='date_earliest', + upper_fuzzy_field='date_latest', + lower_strict_field='date_sort_ascending', + upper_strict_field='date_sort_descending', + blank=True, + null=True, + ) + # use for filtering + date_earliest = models.FloatField(blank=True, null=True) + date_latest = models.FloatField(blank=True, null=True) + # use for sorting + date_sort_ascending = models.FloatField(blank=True, null=True) + date_sort_descending = models.FloatField(blank=True, null=True) +``` Since the `EDTFField` and the `_earliest` and `_latest` field values are set automatically, you may want to make them readonly, or not visible in your model admin. 
+ +## To develop + +### Setup + +* Clone the repository: `git clone https://github.com/ixc/python-edtf.git` +* Set up a virtual environment: `python3 -m venv venv` +* Install the dependencies: `pip install -r dev-requirements.txt` +* Install pre-commit hooks: `pre-commit install` + +### Running tests + +* From `python-edtf`, run the unit tests: `pytest` +* From `python-edtf`, run `pytest -m benchmark` to run the benchmarks (published [here]( https://ixc.github.io/python-edtf/dev/bench/)) +* From `python-edtf/edtf_django_tests`, run the integration tests: `python manage.py test edtf_integration` +* To run CI locally, use `act`, e.g. `act pull_request` or `act --pull=false --container-architecture linux/amd64`. Some steps may require a GitHub PAT: `act pull_request --container-architecture linux/amd64 --pull=false -s GITHUB_TOKEN=<your-PAT>` + +### Linting and formatting + +* Check linting: `ruff check --output-format=github --config pyproject.toml` +* Check formatting: `ruff format --check --config pyproject.toml` +* Fix formatting: `ruff format --config pyproject.toml` +* Linting and formatting checks and attempted fixes are also run as pre-commit hooks if you installed them. + +### Coverage and benchmarks + +Coverage reports are generated and added as comments to commits, and also visible in the actions log. Benchmarks are run on pull requests and are published [here]( https://ixc.github.io/python-edtf/dev/bench/) and also visible in the actions log. diff --git a/changelog.rst b/changelog.rst index ea5b6fa..6a302ae 100644 --- a/changelog.rst +++ b/changelog.rst @@ -4,6 +4,33 @@ Changelog In development -------------- +5.0.0.develop0 (2024-05-05) +-------------------------- + +* Breaking Changes: Rename project back to edtf from edtf2, after the merge of work from https://github.com/saw-leipzig/python-edtf/ +* Breaking Changes: Drop support for Python 2 and Python 3 versions below 3.8. `v5` will support Python 3.8 to 3.12 at release. 
+* Switch from `tox` and `nose` to `pytest` for testing. +* Consolidate config and packaging from `setup.py` and `setup.cfg` to `pyproject.toml`. + +5.0.0 (2023-10-04) +------------------ + +* Breaking Changes: Implementation of the newer specifications from `https://www.loc.gov/standards/datetime/`:: + + Differences + This specification differs from the earlier draft as follows: + + - the unspecified date character (formerly lower case ‘u’) is superseded by the character (upper case) 'X'; + - Masked precision is eliminated; + - the uncertain and approximate qualifiers, '?' and '~', when applied together, are combined into a single qualifier character '%'; + - “qualification from the left” is introduced and replaces the grouping mechanism using parentheses; + - the extended interval syntax keywords 'unknown' and 'open' have been replaced with null and the double-dot notation ['..'] respectively; + - the year prefix 'y' and the exponential indicator 'e', both previously lowercase, are now 'Y' and 'E' (uppercase); and + - the significant digit indicator 'p' is now 'S' (uppercase). + +* Renaming of the project to edtf2: As this project no longer seems to be supported by its creator, `The Interaction Consortium`, we decided to fork it and release it under a new name on our own +* Author: https://github.com/muellersSAW + 4.0 (2018-05-31) ---------------- diff --git a/combine_junit.py b/combine_junit.py new file mode 100644 index 0000000..5e3a05b --- /dev/null +++ b/combine_junit.py @@ -0,0 +1,23 @@ +import sys + +from junitparser import JUnitXml + + +def combine_junit_xml(output_file, *input_files): + combined_xml = JUnitXml() + for input_file in input_files: + xml = JUnitXml.fromfile(input_file) + combined_xml.extend(xml) + combined_xml.write(output_file) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print( + "Usage: python combine_junit_xml.py ... 
" + ) + sys.exit(1) + + output_file = sys.argv[1] + input_files = sys.argv[2:] + combine_junit_xml(output_file, *input_files) diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..19242af --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,7 @@ +-r requirements.txt # Include all main requirements +django>=4.2,<5.0 +pytest +pytest-benchmark +pytest-django +ruff +pre-commit diff --git a/edtf/__init__.py b/edtf/__init__.py index a86232f..7bb2885 100644 --- a/edtf/__init__.py +++ b/edtf/__init__.py @@ -1,6 +1,73 @@ -from edtf.parser.grammar import parse_edtf from edtf.natlang import text_to_edtf -from edtf.parser.parser_classes import * -from edtf.convert import dt_to_struct_time, struct_time_to_date, \ - struct_time_to_datetime, trim_struct_time, struct_time_to_jd, \ - jd_to_struct_time +from edtf.parser import ( + UA, + Consecutives, + Date, + DateAndTime, + EarlierConsecutives, + EDTFObject, + EDTFParseException, + ExponentialYear, + Interval, + LaterConsecutives, + Level1Interval, + Level2Interval, + Level2Season, + LongYear, + MultipleDates, + OneOfASet, + PartialUncertainOrApproximate, + PartialUnspecified, + Season, + UncertainOrApproximate, + Unspecified, + UnspecifiedIntervalSection, + parse_edtf, +) + +from .convert import ( + dt_to_struct_time, + jd_to_struct_time, + old_specs_to_new_specs_expression, + struct_time_to_date, + struct_time_to_datetime, + struct_time_to_jd, + trim_struct_time, +) + +# public +__all__ = [ + "dt_to_struct_time", + "jd_to_struct_time", + "old_specs_to_new_specs_expression", + "struct_time_to_date", + "struct_time_to_datetime", + "struct_time_to_jd", + "trim_struct_time", + "text_to_edtf", + "parse_edtf", + # parser_exceptions + "EDTFParseException", + # parser_classes + "EDTFObject", + "Date", + "DateAndTime", + "Interval", + "UA", + "UncertainOrApproximate", + "UnspecifiedIntervalSection", + "Unspecified", + "Level1Interval", + "LongYear", + "Season", + "PartialUncertainOrApproximate", + 
"PartialUnspecified", + "Consecutives", + "EarlierConsecutives", + "LaterConsecutives", + "OneOfASet", + "MultipleDates", + "Level2Interval", + "Level2Season", + "ExponentialYear", +] diff --git a/edtf/appsettings.py b/edtf/appsettings.py index b23d0aa..8e15846 100644 --- a/edtf/appsettings.py +++ b/edtf/appsettings.py @@ -2,15 +2,34 @@ try: from django.core.exceptions import ImproperlyConfigured + try: from django.conf import settings - EDTF = getattr(settings, 'EDTF', {}) + + EDTF = getattr(settings, "EDTF", {}) except ImproperlyConfigured: EDTF = {} except ImportError: EDTF = {} -SEASON_MONTHS_RANGE = EDTF.get('SEASON_MONTHS_RANGE', { +SEASON_MONTHS_RANGE = EDTF.get( + "SEASON_MONTHS_RANGE", + { + # season id: [earliest_month, last_month] + 21: [3, 5], + 22: [6, 8], + 23: [9, 11], + # winter in the northern hemisphere wraps the end of the year, so + # Winter 2010 could wrap into 2011. + # For simplicity, we assume it falls at the end of the year, esp since the + # spec says that sort order goes spring > summer > autumn > winter + 24: [12, 12], + }, +) + +SEASON_L2_MONTHS_RANGE = EDTF.get( + "SEASON_L2_MONTHS_RANGE", + { # season id: [earliest_month, last_month] 21: [3, 5], 22: [6, 8], @@ -20,28 +39,64 @@ # For simplicity, we assume it falls at the end of the year, esp since the # spec says that sort order goes spring > summer > autumn > winter 24: [12, 12], - } + # spring in the northern hemisphere + 25: [3, 5], + # summer in the northern hemisphere + 26: [6, 8], + # fall/autumn in the northern hemisphere + 27: [9, 11], + # winter in the northern hemisphere wraps the end of the year + 28: [12, 12], + # spring in the southern hemisphere + 29: [9, 11], + # summer in the southern hemisphere + 30: [12, 12], + # fall/autumn in the southern hemisphere + 31: [3, 5], + # winter in the southern hemisphere + 32: [6, 8], + 33: [1, 3], + 34: [4, 6], + 35: [7, 9], + 36: [10, 12], + 37: [1, 4], + 38: [5, 8], + 39: [9, 12], + 40: [1, 6], + 41: [7, 12], + }, ) -DAY_FIRST = 
EDTF.get('DAY_FIRST', False) # Americans! +DAY_FIRST = EDTF.get("DAY_FIRST", False) # Americans! -SEASONS = EDTF.get('SEASONS', { - 21: "spring", - 22: "summer", - 23: "autumn", - 24: "winter", -}) -INVERSE_SEASONS = EDTF.get('INVERSE_SEASONS', {v: k for k, v in SEASONS.items()}) +SEASONS = EDTF.get( + "SEASONS", + { + 21: "spring", + 22: "summer", + 23: "autumn", + 24: "winter", + }, +) +INVERSE_SEASONS = EDTF.get("INVERSE_SEASONS", {v: k for k, v in SEASONS.items()}) # also need to interpret `fall` -INVERSE_SEASONS['fall'] = 23 +INVERSE_SEASONS["fall"] = 23 # changing these will break tests -PADDING_DAY_PRECISION = EDTF.get('PADDING_DAY_PRECISION', relativedelta(days=1)) -PADDING_MONTH_PRECISION = EDTF.get('PADDING_MONTH_PRECISION', relativedelta(months=1)) -PADDING_YEAR_PRECISION = EDTF.get('PADDING_YEAR_PRECISION', relativedelta(years=1)) -PADDING_SEASON_PRECISION = EDTF.get('PADDING_SEASON_PRECISION', relativedelta(weeks=12)) -MULTIPLIER_IF_UNCERTAIN = EDTF.get('MULTIPLIER_IF_UNCERTAIN', 1.0) -MULTIPLIER_IF_APPROXIMATE = EDTF.get('MULTIPLIER_IF_APPROXIMATE', 1.0) -MULTIPLIER_IF_BOTH = EDTF.get('MULTIPLIER_IF_BOTH', 2.0) - +PADDING_DAY_PRECISION = EDTF.get("PADDING_DAY_PRECISION", relativedelta(days=1)) +PADDING_MONTH_PRECISION = EDTF.get("PADDING_MONTH_PRECISION", relativedelta(months=1)) +PADDING_YEAR_PRECISION = EDTF.get("PADDING_YEAR_PRECISION", relativedelta(years=1)) +PADDING_SEASON_PRECISION = EDTF.get("PADDING_SEASON_PRECISION", relativedelta(weeks=12)) +PADDING_DECADE_PRECISION = EDTF.get("PADDING_DECADE_PRECISION", relativedelta(years=10)) +PADDING_CENTURY_PRECISION = EDTF.get( + "PADDING_CENTURY_PRECISION", relativedelta(years=100) +) +PADDING_MILLENNIUM_PRECISION = EDTF.get( + "PADDING_MILLENNIUM_PRECISION", relativedelta(years=1000) +) +MULTIPLIER_IF_UNCERTAIN = EDTF.get("MULTIPLIER_IF_UNCERTAIN", 1.0) +MULTIPLIER_IF_APPROXIMATE = EDTF.get("MULTIPLIER_IF_APPROXIMATE", 1.0) +MULTIPLIER_IF_BOTH = EDTF.get("MULTIPLIER_IF_BOTH", 2.0) DELTA_IF_UNKNOWN = 
EDTF.get("DELTA_IF_UNKNOWN", relativedelta(years=10)) + +DEBUG_PYPARSING = False diff --git a/edtf/convert.py b/edtf/convert.py index c1bfd3a..a294462 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -1,13 +1,26 @@ -from time import struct_time from datetime import date, datetime +from time import struct_time from edtf import jdutil - TIME_EMPTY_TIME = [0, 0, 0] # tm_hour, tm_min, tm_sec TIME_EMPTY_EXTRAS = [0, 0, -1] # tm_wday, tm_yday, tm_isdst +def old_specs_to_new_specs_expression(expression): + expression = expression.replace("unknown", "") + expression = expression.replace("open", "..") + expression = expression.replace("u", "X") + expression = expression.replace("x", "X") + expression = expression.replace("?~", "%") + expression = expression.replace("~?", "%") + expression = expression.replace("e", "E") + expression = expression.replace("y", "Y") + expression = expression.replace("p", "S") + + return expression + + def dt_to_struct_time(dt): """ Convert a `datetime.date` or `datetime.datetime` to a `struct_time` @@ -19,16 +32,15 @@ def dt_to_struct_time(dt): """ if isinstance(dt, datetime): return struct_time( - [dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second] + - TIME_EMPTY_EXTRAS + [dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second] + + TIME_EMPTY_EXTRAS ) elif isinstance(dt, date): return struct_time( [dt.year, dt.month, dt.day] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS ) else: - raise NotImplementedError( - "Cannot convert %s to `struct_time`" % type(dt)) + raise NotImplementedError(f"Cannot convert {type(dt)} to `struct_time`") def struct_time_to_date(st): @@ -99,12 +111,11 @@ def jd_to_struct_time(jd): # This conversion can return negative values for items we do not want to be # negative: month, day, hour, minute, second. 
year, month, day, hour, minute, second = _roll_negative_time_fields( - year, month, day, hour, minute, second) - - return struct_time( - [year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS + year, month, day, hour, minute, second ) + return struct_time([year, month, day, hour, minute, second] + TIME_EMPTY_EXTRAS) + def _roll_negative_time_fields(year, month, day, hour, minute, second): """ diff --git a/edtf/fields.py b/edtf/fields.py index 52b9171..07a9744 100644 --- a/edtf/fields.py +++ b/edtf/fields.py @@ -1,51 +1,76 @@ -try: - import cPickle as pickle -except: - import pickle +import pickle -from django.db import models +from django.core import checks from django.core.exceptions import FieldDoesNotExist +from django.db import models +from django.db.models import signals +from django.db.models.query_utils import DeferredAttribute +from pyparsing import ParseException -from edtf import parse_edtf, EDTFObject -from edtf.natlang import text_to_edtf +from edtf import EDTFObject, parse_edtf from edtf.convert import struct_time_to_date, struct_time_to_jd +from edtf.natlang import text_to_edtf +from edtf.parser.edtf_exceptions import EDTFParseException DATE_ATTRS = ( - 'lower_strict', - 'upper_strict', - 'lower_fuzzy', - 'upper_fuzzy', + "lower_strict", + "upper_strict", + "lower_fuzzy", + "upper_fuzzy", ) -class EDTFField(models.CharField): +class EDTFFieldDescriptor(DeferredAttribute): + """ + Descriptor for the EDTFField's attribute on the model instance. + This updates the dependent fields each time this value is set. 
+ """ + def __set__(self, instance, value): + # First set the value we are given + instance.__dict__[self.field.attname] = value + # `update_values` may provide us with a new value to set + edtf = self.field.update_values(instance, value) + if edtf != value: + instance.__dict__[self.field.attname] = edtf + + +class EDTFField(models.CharField): def __init__( self, - verbose_name=None, name=None, + verbose_name=None, + name=None, natural_text_field=None, + direct_input_field=None, lower_strict_field=None, upper_strict_field=None, lower_fuzzy_field=None, upper_fuzzy_field=None, - **kwargs + **kwargs, ): - kwargs['max_length'] = 2000 - self.natural_text_field, self.lower_strict_field, \ - self.upper_strict_field, self.lower_fuzzy_field, \ - self.upper_fuzzy_field = natural_text_field, lower_strict_field, \ - upper_strict_field, lower_fuzzy_field, upper_fuzzy_field - super(EDTFField, self).__init__(verbose_name, name, **kwargs) - - description = "An field for storing complex/fuzzy date specifications in EDTF format." + kwargs["max_length"] = 2000 + self.natural_text_field = natural_text_field + self.direct_input_field = direct_input_field + self.lower_strict_field = lower_strict_field + self.upper_strict_field = upper_strict_field + self.lower_fuzzy_field = lower_fuzzy_field + self.upper_fuzzy_field = upper_fuzzy_field + super().__init__(verbose_name, name, **kwargs) + + description = ( + "A field for storing complex/fuzzy date specifications in EDTF format." 
+ ) + descriptor_class = EDTFFieldDescriptor def deconstruct(self): - name, path, args, kwargs = super(EDTFField, self).deconstruct() + name, path, args, kwargs = super().deconstruct() if self.natural_text_field: - kwargs['natural_text_field'] = self.natural_text_field + kwargs["natural_text_field"] = self.natural_text_field + if self.direct_input_field: + kwargs["direct_input_field"] = self.direct_input_field for attr in DATE_ATTRS: - field = "%s_field" % attr + field = f"{attr}_field" f = getattr(self, field, None) if f: kwargs[field] = f @@ -53,15 +78,17 @@ def deconstruct(self): del kwargs["max_length"] return name, path, args, kwargs - def from_db_value(self, value, expression, connection, context=None): - # Converting values to Python objects - if not value: - return None + def from_db_value(self, value, expression, connection): + # Converting values from the database to Python objects + if value is None: + return value + try: - return pickle.loads(str(value)) - except: - pass - return parse_edtf(value, fail_silently=True) + # Try to unpickle if the value was pickled + return pickle.loads(value) # noqa S301 + except (pickle.PickleError, TypeError): + # If it fails because it's not pickled data, try parsing as EDTF + return parse_edtf(value, fail_silently=True) def to_python(self, value): if isinstance(value, EDTFObject): @@ -75,46 +102,63 @@ def to_python(self, value): def get_db_prep_save(self, value, connection): if value: return pickle.dumps(value) - return super(EDTFField, self).get_db_prep_save(value, connection) + return super().get_db_prep_save(value, connection) def get_prep_value(self, value): # convert python objects to query values - value = super(EDTFField, self).get_prep_value(value) + value = super().get_prep_value(value) if isinstance(value, EDTFObject): return pickle.dumps(value) return value - def pre_save(self, instance, add): + def update_values(self, instance, *args, **kwargs): """ - Updates the edtf value from the value of the 
display_field. - If there's a valid edtf, then set the date values. + Updates the EDTF value from either the natural_text_field, which is parsed + with text_to_edtf() and is used for display, or falling back to the direct_input_field, + which allows directly providing an EDTF string. If one of these provides a valid EDTF object, + then set the date values accordingly. """ - if not self.natural_text_field or self.attname not in instance.__dict__: - return - - edtf = getattr(instance, self.attname) - # Update EDTF field based on latest natural text value, if any - natural_text = getattr(instance, self.natural_text_field) - if natural_text: - edtf = text_to_edtf(natural_text) + # Get existing value to determine if update is needed + existing_value = getattr(instance, self.attname, None) + direct_input = getattr(instance, self.direct_input_field, "") + natural_text = getattr(instance, self.natural_text_field, "") + + # if direct_input is provided and is different from the existing value, update the EDTF field + if direct_input and ( + existing_value is None or str(existing_value) != direct_input + ): + try: + edtf = parse_edtf( + direct_input, fail_silently=True + ) # ParseException if invalid; should this be raised? + except ParseException as err: + raise EDTFParseException(direct_input, err) from None + + # set the natural_text (display) field to the direct_input if it is not provided + if not natural_text: + setattr(instance, self.natural_text_field, direct_input) + + elif natural_text: + edtf_string = text_to_edtf(natural_text) + if edtf_string and ( + existing_value is None or str(existing_value) != edtf_string + ): + edtf = parse_edtf( + edtf_string, fail_silently=True + ) # potential ParseException if invalid; should this be raised? + else: + edtf = existing_value else: - edtf = None + if not existing_value: + # No inputs provided and no existing value; TODO log this? + return + # TODO: if both direct_input and natural_text are cleared, should we throw an error? 
+ edtf = existing_value - # TODO If `natural_text_field` becomes cleared the derived EDTF field - # value should also be cleared, rather than left at original value? - - # TODO Handle case where EDTF field is set to a string directly, not - # via `natural_text_field` (this is a slightly unexpected use-case, but - # is a very efficient way to set EDTF values in situations like for API - # imports so we probably want to continue to support it?) - if edtf and not isinstance(edtf, EDTFObject): - edtf = parse_edtf(edtf, fail_silently=True) - - setattr(instance, self.attname, edtf) - # set or clear related date fields on the instance + # Process and update related date fields based on the EDTF object for attr in DATE_ATTRS: - field_attr = "%s_field" % attr + field_attr = f"{attr}_field" g = getattr(self, field_attr, None) if g: if edtf: @@ -129,10 +173,62 @@ def pre_save(self, instance, add): value = struct_time_to_date(value) else: raise NotImplementedError( - u"EDTFField does not support %s as a derived data" - u" field, only FloatField or DateField" - % type(target_field)) + f"EDTFField does not support {type(target_field)} as a derived data" + " field, only FloatField or DateField" + ) setattr(instance, g, value) else: setattr(instance, g, None) return edtf + + def contribute_to_class(self, cls, name, **kwargs): + super().contribute_to_class(cls, name, **kwargs) + # Attach update_values so that dependent fields declared + # after their corresponding edtf field don't stay cleared by + # Model.__init__, see Django bug #11196. 
+ # Only run post-initialization values update on non-abstract models + if not cls._meta.abstract: + signals.post_init.connect(self.update_values, sender=cls) + + def check(self, **kwargs): + errors = super().check(**kwargs) + + for field_alias in [ + "direct_input_field", + "lower_fuzzy_field", + "lower_strict_field", + "natural_text_field", + "upper_fuzzy_field", + "upper_strict_field", + ]: + errors.extend(self._check_field(field_alias)) + + return errors + + def _check_field(self, field_alias): + field_name = getattr(self, field_alias, None) + + # Check if the alias value has been provided in the field definition + if not field_name: + return [ + checks.Error( + f"You must specify a '{field_alias}' for EDTFField", + hint=None, + obj=self, + id="python-edtf.EDTF01", + ) + ] + + # Check if the field that is referenced actually exists + try: + self.model._meta.get_field(field_name) + except FieldDoesNotExist: + return [ + checks.Error( + f"'{self.name}' refers to a non-existent '{field_alias}' field: '{field_name}'", + hint=None, + obj=self, + id="python-edtf.EDTF02", + ) + ] + return [] diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 9fabdd1..16cd312 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -9,14 +9,15 @@ """ -import math import datetime as dt +import math # Note: The Python datetime module assumes an infinitely valid Gregorian calendar. # The Gregorian calendar took effect after 10-15-1582 and the dates 10-05 through # 10-14-1582 never occurred. Python datetime objects will produce incorrect # time deltas if one date is from before 10-15-1582. + def mjd_to_jd(mjd): """ Convert Modified Julian Day to Julian Day. @@ -54,11 +55,11 @@ def jd_to_mjd(jd): return jd - 2400000.5 -def date_to_jd(year,month,day): +def date_to_jd(year, month, day): """ Convert a date to Julian Day. 
- Algorithm from 'Practical Astronomy with your Calculator or Spreadsheet', + Algorithm from 'Practical Astronomy with your Calculator or Spreadsheet', 4th ed., Duffet-Smith and Zwart, 2011. Parameters @@ -95,20 +96,19 @@ def date_to_jd(year,month,day): # this checks where we are in relation to October 15, 1582, the beginning # of the Gregorian calendar. - if ((year < 1582) or - (year == 1582 and month < 10) or - (year == 1582 and month == 10 and day < 15)): + if ( + (year < 1582) + or (year == 1582 and month < 10) + or (year == 1582 and month == 10 and day < 15) + ): # before start of Gregorian calendar B = 0 else: # after start of Gregorian calendar - A = math.trunc(yearp / 100.) - B = 2 - A + math.trunc(A / 4.) + A = math.trunc(yearp / 100.0) + B = 2 - A + math.trunc(A / 4.0) - if yearp < 0: - C = math.trunc((365.25 * yearp) - 0.75) - else: - C = math.trunc(365.25 * yearp) + C = math.trunc(365.25 * yearp - 0.75) if yearp < 0 else math.trunc(365.25 * yearp) D = math.trunc(30.6001 * (monthp + 1)) @@ -121,7 +121,7 @@ def jd_to_date(jd): """ Convert Julian Day to date. - Algorithm from 'Practical Astronomy with your Calculator or Spreadsheet', + Algorithm from 'Practical Astronomy with your Calculator or Spreadsheet', 4th ed., Duffet-Smith and Zwart, 2011. Parameters @@ -151,15 +151,12 @@ def jd_to_date(jd): """ jd = jd + 0.5 - F, I = math.modf(jd) - I = int(I) + F, I = math.modf(jd) # noqa: E741 + I = int(I) # noqa: E741 - A = math.trunc((I - 1867216.25)/36524.25) + A = math.trunc((I - 1867216.25) / 36524.25) - if I > 2299160: - B = I + 1 + A - math.trunc(A / 4.) 
- else: - B = I + B = I + 1 + A - math.trunc(A / 4.0) if I > 2299160 else I C = B + 1524 @@ -171,20 +168,14 @@ def jd_to_date(jd): day = C - E + F - math.trunc(30.6001 * G) - if G < 13.5: - month = G - 1 - else: - month = G - 13 + month = G - 1 if G < 13.5 else G - 13 - if month > 2.5: - year = D - 4716 - else: - year = D - 4715 + year = D - 4716 if month > 2.5 else D - 4715 return year, month, day -def hmsm_to_days(hour=0,min=0,sec=0,micro=0): +def hmsm_to_days(hour=0, min=0, sec=0, micro=0): """ Convert hours, minutes, seconds, and microseconds to fractional days. @@ -213,13 +204,13 @@ def hmsm_to_days(hour=0,min=0,sec=0,micro=0): 0.25 """ - days = sec + (micro / 1.e6) + days = sec + (micro / 1.0e6) - days = min + (days / 60.) + days = min + (days / 60.0) - days = hour + (days / 60.) + days = hour + (days / 60.0) - return days / 24. + return days / 24.0 def days_to_hmsm(days): @@ -257,16 +248,16 @@ def days_to_hmsm(days): (2, 24, 0, 0) """ - hours = days * 24. + hours = days * 24.0 hours, hour = math.modf(hours) - mins = hours * 60. + mins = hours * 60.0 mins, min = math.modf(mins) - secs = mins * 60. 
+ secs = mins * 60.0 secs, sec = math.modf(secs) - micro = round(secs * 1.e6) + micro = round(secs * 1.0e6) return int(hour), int(min), int(sec), int(micro) @@ -286,16 +277,18 @@ def datetime_to_jd(date): Examples -------- - >>> d = datetime.datetime(1985,2,17,6) + >>> d = datetime.datetime(1985,2,17,6) >>> d datetime.datetime(1985, 2, 17, 6, 0) >>> jdutil.datetime_to_jd(d) 2446113.75 """ - days = date.day + hmsm_to_days(date.hour,date.minute,date.second,date.microsecond) + days = date.day + hmsm_to_days( + date.hour, date.minute, date.second, date.microsecond + ) - return date_to_jd(date.year,date.month,days) + return date_to_jd(date.year, date.month, days) def jd_to_datetime(jd): @@ -320,12 +313,12 @@ def jd_to_datetime(jd): """ year, month, day = jd_to_date(jd) - frac_days,day = math.modf(day) + frac_days, day = math.modf(day) day = int(day) - hour,min,sec,micro = days_to_hmsm(frac_days) + hour, min, sec, micro = days_to_hmsm(frac_days) - return datetime(year,month,day,hour,min,sec,micro) + return datetime(year, month, day, hour, min, sec, micro) def timedelta_to_days(td): @@ -350,9 +343,9 @@ def timedelta_to_days(td): 4.5 """ - seconds_in_day = 24. * 3600. + seconds_in_day = 24.0 * 3600.0 - days = td.days + (td.seconds + (td.microseconds * 10.e6)) / seconds_in_day + days = td.days + (td.seconds + (td.microseconds * 10.0e6)) / seconds_in_day return days @@ -372,8 +365,9 @@ class datetime(dt.datetime): datetime.datetime : Parent class. 
""" - def __add__(self,other): - if not isinstance(other,dt.timedelta): + + def __add__(self, other): + if not isinstance(other, dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -383,8 +377,8 @@ def __add__(self,other): return jd_to_datetime(combined) - def __radd__(self,other): - if not isinstance(other,dt.timedelta): + def __radd__(self, other): + if not isinstance(other, dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -394,15 +388,15 @@ def __radd__(self,other): return jd_to_datetime(combined) - def __sub__(self,other): - if isinstance(other,dt.timedelta): + def __sub__(self, other): + if isinstance(other, dt.timedelta): days = timedelta_to_days(other) combined = datetime_to_jd(self) - days return jd_to_datetime(combined) - elif isinstance(other, (datetime,dt.datetime)): + elif isinstance(other, (datetime, dt.datetime)): diff = datetime_to_jd(self) - datetime_to_jd(other) return dt.timedelta(diff) @@ -412,8 +406,8 @@ def __sub__(self,other): s += "datetime.timedelta, jdutil.datetime and datetime.datetime" raise TypeError(s) - def __rsub__(self,other): - if not isinstance(other, (datetime,dt.datetime)): + def __rsub__(self, other): + if not isinstance(other, (datetime, dt.datetime)): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" raise TypeError(s) diff --git a/edtf/natlang/__init__.py b/edtf/natlang/__init__.py index 325672f..463863c 100644 --- a/edtf/natlang/__init__.py +++ b/edtf/natlang/__init__.py @@ -1 +1,3 @@ from .en import text_to_edtf + +__all__ = ["text_to_edtf"] diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ec7842b..f28e685 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,10 +1,11 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" -from datetime import datetime -from dateutil.parser import parse + import re -from edtf import appsettings -from 
six.moves import xrange +from datetime import datetime +from dateutil.parser import ParserError, parse + +from edtf import appsettings # two dates where every digit of an ISO date representation is different, # and one is in the past and one is in the future. @@ -12,15 +13,15 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r'(-?)([\du])([\dxu])([\dxu])([\dxu])' -LONG_YEAR_RE = r'y(-?)([1-9]\d\d\d\d+)' -CENTURY_RE = r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?' -CE_RE = r'(\d{1,4}) (ad|ce|bc|bce)' +SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])" +LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)" +CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)" # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. REJECT_RULES = ( - r'.*dynasty.*', # Don't parse '23rd Dynasty' to 'uuuu-uu-23' + r".*dynasty.*", # Don't parse '23rd Dynasty' to 'uuuu-uu-23' ) @@ -29,7 +30,7 @@ def text_to_edtf(text): Generate EDTF string equivalent of a given natural language date string. """ if not text: - return + return None t = text.lower() @@ -41,7 +42,6 @@ def text_to_edtf(text): # TODO: assemble multiple dates into a {} or [] structure. for split in [",", ";", "or"]: for list_item in t.split(split): - # try parsing as an interval - split by '-' toks = list_item.split("-") if len(toks) == 2: @@ -51,18 +51,23 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r'\d\D\b', d2): # 1-digit year partial e.g. 1868-9 - if re.search(r'\b\d\d\d\d$', d1): # TODO: evaluate it and see if it's a year + if re.match(r"\d\D\b", d2): # 1-digit year partial e.g. 1868-9 + if re.search( + r"\b\d\d\d\d$", d1 + ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r'\d\d\b', d2): # 2-digit year partial e.g. 
1809-10 - if re.search(r'\b\d\d\d\d$', d1): + elif re.match(r"\d\d\b", d2): # 2-digit year partial e.g. 1809-10 + if re.search(r"\b\d\d\d\d$", d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]', "%s-%s" % (d1,d2)) + century_range_match = re.search( + r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]", + f"{d1}-{d2}", + ) if century_range_match: g = century_range_match.groups() - d1 = "%sC" % g[0] - d2 = "%sC" % g[2] + d1 = f"{g[0]}C" + d2 = f"{g[2]}C" r1 = text_to_edtf_date(d1) r2 = text_to_edtf_date(d2) @@ -79,7 +84,7 @@ def text_to_edtf(text): else: int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) if int_match: - return "[%s, %s]" % (int_match.group(1), int_match.group(2)) + return f"[{int_match.group(1)}, {int_match.group(2)}]" result = text_to_edtf_date(list_item) if result: @@ -87,18 +92,17 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r'\bbefore\b', t) - is_before = is_before or re.findall(r'\bearlier\b', t) + is_before = re.findall(r"\bbefore\b", t) + is_before = is_before or re.findall(r"\bearlier\b", t) - is_after = re.findall(r'\bafter\b', t) - is_after = is_after or re.findall(r'\bsince\b', t) - is_after = is_after or re.findall(r'\blater\b', t) + is_after = re.findall(r"\bafter\b", t) + is_after = is_after or re.findall(r"\bsince\b", t) + is_after = is_after or re.findall(r"\blater\b", t) if is_before: - result = u"unknown/%s" % result + result = f"/{result}" # unknown is replaced with null for intervals elif is_after: - result = u"%s/unknown" % result - + result = f"{result}/" # unknown is replaced with null for intervals return result @@ -114,36 +118,34 @@ def text_to_edtf_date(text): return t = text.lower() - result = '' + result = "" for reject_re in REJECT_RULES: if re.match(reject_re, t): return # matches on '1800s'. Needs to happen before is_decade. 
- could_be_century = re.findall(r'(\d{2}00)s', t) + could_be_century = re.findall(r"(\d{2}00)s", t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r'(\d{3}0)s', r'\1', t) + t, is_decade = re.subn(r"(\d{3}0)s", r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r'\b(ca?\.?) ?\d{4}', t) + is_approximate = re.findall(r"\b(ca?\.?) ?\d{4}", t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r'\bcirca\b', t) + is_approximate = is_approximate or re.findall(r"\bcirca\b", t) # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or \ - re.findall(r'\b(approx|around|about)', t) + is_approximate = is_approximate or re.findall(r"\b(approx|around|about)", t) # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) + is_approximate = is_approximate or re.findall(r"\b~\d{4}", t) # a ~ at the beginning - is_approximate = is_approximate or re.findall(r'^~', t) + is_approximate = is_approximate or re.findall(r"^~", t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r'(\d{4})\?', r'\1', t) + t, is_uncertain = re.subn(r"(\d{4})\?", r"\1", t) # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall( - r'\b(uncertain|possibly|maybe|guess)', t) + is_uncertain = is_uncertain or re.findall(r"\b(uncertain|possibly|maybe|guess)", t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -151,28 +153,26 @@ def text_to_edtf_date(text): # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dxx" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' 
+ CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r'\?', t) + result = "%02dXX" % (int(is_century[0][0]) - 1,) + is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t) try: is_bc = is_century[0][-1] in ("bc", "bce") if is_bc: - result = "-%s" % result + result = f"-{result}" except IndexError: pass elif is_ce: result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r'\?', t) + is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CE_RE, t) + is_uncertain = is_uncertain or re.findall(CE_RE + r"\?", t) try: is_bc = is_ce[0][-1] in ("bc", "bce") if is_bc: - result = "-%s" % result + result = f"-{result}" except IndexError: pass @@ -187,7 +187,7 @@ def text_to_edtf_date(text): dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_1 + default=DEFAULT_DATE_1, ) dt2 = parse( @@ -195,14 +195,15 @@ def text_to_edtf_date(text): dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_2 + default=DEFAULT_DATE_2, ) - except ValueError: + except ParserError: + return + except Exception: return - if dt1.date() == DEFAULT_DATE_1.date() and \ - dt2.date() == DEFAULT_DATE_2.date(): + if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. 
return @@ -210,35 +211,34 @@ def text_to_edtf_date(text): date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r'\byear\b.+(in|during)\b', t) - mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) - mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) + mentions_year = re.findall(r"\byear\b.+(in|during)\b", t) + mentions_month = re.findall(r"\bmonth\b.+(in|during)\b", t) + mentions_day = re.findall(r"\bday\b.+(in|during)\b", t) - for i in xrange(len(date1)): + for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. - if i == 2 and could_be_century and \ - not (is_approximate or is_uncertain): - result += 'x' + if i == 2 and could_be_century and not (is_approximate or is_uncertain): + result += "X" elif i == 3 and is_decade > 0: if mentions_year: - result += 'u' # year precision + result += "X" # previously year precision - now just X else: - result += 'x' # decade precision + result += "X" # previously decade precision - now just X elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default result += date1[i] else: # different values were produced, meaning that it's likely - # a default. Use 'unspecified' - result += "u" + # a default. Use 'X' + result += "X" # strip off unknown chars from end of string - except the first 4 - for i in reversed(xrange(len(result))): - if result[i] not in ('u', 'x', '-'): + for i in reversed(range(len(result))): + if result[i] not in ("X", "-"): smallest_length = 4 if mentions_month: @@ -262,14 +262,16 @@ def text_to_edtf_date(text): # end dateutil post-parsing - if is_uncertain: - result += "?" - - if is_approximate: - result += "~" + if is_uncertain and is_approximate: + result += "%" + else: + if is_uncertain: + result += "?" 
+ if is_approximate: + result += "~" # weed out bad parses - if result.startswith("uu-uu"): + if result.startswith("XX-XX"): return None return result diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index ea137d2..d2c43a5 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -1,215 +1,211 @@ -import unittest -from edtf.natlang.en import text_to_edtf - -# where examples are tuples, the second item is the normalised output -EXAMPLES = ( - ('active late 17th-19th centuries', '16xx/18xx'), # ignoring 'late' for now - ('active 17-19th Centuries', '16xx/18xx'), # ignoring 'late' for now - - # Unrecognised values - ('', None), - ('this isn\'t a date', None), - - # Explicity rejected values that would otherwise be badly converted - ('23rd Dynasty', None), - - ('90', '1990'), # implied century - ('1860', '1860'), - ('the year 1800', '1800'), - ('the year 1897', '1897'), - ('January 2008', '2008-01'), - ('January 12, 1940', '1940-01-12'), - - # uncertain/approximate - ('1860?', '1860?'), - ('1862 (uncertain)', '1862?'), - ('maybe 1862', '1862?'), - ('1862 maybe', '1862?'), - ('1862 guess', '1862?'), - ('uncertain: 1862', '1862?'), - ('uncertain: Jan 18 1862', '1862-01-18?'), - ('~ Feb 1812', '1812-02~'), - ('circa Feb 1812', '1812-02~'), - ('Feb 1812 approx', '1812-02~'), - ('c1860', '1860~'), # different abbreviations - ('c.1860', '1860~'), # with or without . - ('ca1860', '1860~'), - ('ca.1860', '1860~'), - ('c 1860', '1860~'), # with or without space - ('c. 1860', '1860~'), - ('ca. 1860', '1860~'), - ('approx 1860', '1860~'), - ('1860 approx', '1860~'), - ('1860 approximately', '1860~'), - ('approximately 1860', '1860~'), - ('about 1860', '1860~'), - ('about Spring 1849', '1849-21~'), - ('notcirca 1860', '1860'), # avoid words containing circa - ('attica 1802', '1802'), - # avoid false positive circa at the end of preceding word - ('attic. 
1802', '1802'), # avoid false positive circa - - # masked precision - ('1860s', '186x'), # 186x has decade precision, 186u has year precision. - - # masked precision + uncertainty - ('ca. 1860s', '186x~'), - ('c. 1860s', '186x~'), - ('Circa 1840s', '184x~'), - ('circa 1840s', '184x~'), - ('ca. 1860s?', '186x?~'), - ('uncertain: approx 1862', '1862?~'), - - # masked precision with first decade (ambiguous) - ('1800s', '18xx'), # without additional uncertainty, use the century - ('2000s', '20xx'), # without additional uncertainty, use the century - ('c1900s', '190x~'), # if there's additional uncertainty, use the decade - ('c1800s?', '180x?~'), # if there's additional uncertainty, use the decade - - # unspecified - ('January 12', 'uuuu-01-12'), - ('January', 'uuuu-01'), - ('10/7/2008', '2008-10-07'), - ('7/2008', '2008-07'), - - # seasons - ('Spring 1872', '1872-21'), - ('Summer 1872', '1872-22'), - ('Autumn 1872', '1872-23'), - ('Fall 1872', '1872-23'), - ('Winter 1872', '1872-24'), - - # before/after - ('earlier than 1928', 'unknown/1928'), - ('before 1928', 'unknown/1928'), - ('after 1928', '1928/unknown'), - ('later than 1928', '1928/unknown'), - ('before January 1928', 'unknown/1928-01'), - ('before 18 January 1928', 'unknown/1928-01-18'), - - # before/after approx - ('before approx January 18 1928', 'unknown/1928-01-18~'), - ('before approx January 1928', 'unknown/1928-01~'), - ('after approx January 1928', '1928-01~/unknown'), - ('after approx Summer 1928', '1928-22~/unknown'), +# ruff: noqa: S101 # Asserts are ok in tests - # before/after and uncertain/unspecificed - ('after about the 1920s', '192x~/unknown'), - ('before about the 1900s', 'unknown/190x~'), - ('before the 1900s', 'unknown/19xx'), - - # unspecified - # ('decade in 1800s', '18ux'), #too esoteric - # ('decade somewhere during the 1800s', '18ux'), #lengthier. Keywords are 'in' or 'during' - ('year in the 1860s', '186u'), - # 186x has decade precision, 186u has year precision. 
- ('year in the 1800s', '18xu'), - ('year in about the 1800s', '180u~'), - ('month in 1872', '1872-uu'), - ('day in Spring 1849', '1849-21-uu'), - ('day in January 1872', '1872-01-uu'), - ('day in 1872', '1872-uu-uu'), - ('birthday in 1872', '1872'), - # avoid false positive at end of preceding word - - # centuries - ('1st century', '00xx'), - ('10c', '09xx'), - ('19th century', '18xx'), - ('19th century?', '18xx?'), - ('before 19th century', 'unknown/18xx'), - ('19c', '18xx'), - ('15c.', '14xx'), - ('ca. 19c', '18xx~'), - ('~19c', '18xx~'), - ('about 19c', '18xx~'), - ('19c?', '18xx?'), - ('c.19c?', '18xx?~'), - - # BC/AD - ('1 AD', '0001'), - ('17 CE', '0017'), - ('127 CE', '0127'), - ('1270 CE', '1270'), - ('c1 AD', '0001~'), - ('c17 CE', '0017~'), - ('c127 CE', '0127~'), - ('c1270 CE', '1270~'), - ('c64 BCE', '-0064~'), - ('2nd century bc', '-01xx'), # -200 to -101 - ('2nd century bce', '-01xx'), - ('2nd century ad', '01xx'), - ('2nd century ce', '01xx'), - - # c-c-c-combo - # just showing off now... - ('a day in about Spring 1849?', '1849-21-uu?~'), - - # simple ranges. Not all of these results are correct EDTF, but - # this is as good as the EDTF implementation and simple natural - # language parser we have. - ('1851-1852', '1851/1852'), - ('1851-1852; printed 1853-1854', '1851/1852'), - ('1851-52', '1851/1852'), - ('1852 - 1860', '1852/1860'), - ('1856-ca. 1865', '1856/1865~'), - ('1857-mid 1860s', '1857/186x'), - ('1858/1860', '[1858, 1860]'), - ('1860s-1870s', '186x/187x'), - ('1861, printed 1869', '1861'), - ('1910-30', '1910/1930'), - ('active 1910-30', '1910/1930'), - ('1861-67', '1861/1867'), - ('1861-67 (later print)', '1861/1867'), - ('1863 or 1864', '1863'), - ('1863, printed 1870', '1863'), - ('1863, printed ca. 1866', '1863'), - ('1864 or 1866', '1864'), - ('1864, printed ca. 1864', '1864'), - ('1864-1872, printed 1870s', '1864/1872'), - ('1868-1871?', '1868/1871?'), - ('1869-70', '1869/1870'), - ('1870s, printed ca. 
1880s', '187x'), - ('1900-1903, cast before 1929', '1900/1903'), - ('1900; 1973', '1900'), - ('1900; printed 1912', '1900'), - ('1915 late - autumn 1916', '1915/1916-23'), - - ('1915, from Camerawork, October 1916', '1915'), # should be {1915, 1916-10} - ('1920s -early 1930s', '192x/193x'), - ('1930s, printed early 1960s', '193x'), # should be something like {193x, 196x}, - # though those forms aren't explicitly supported in the spec. - ('1932, printed 1976 by Gunther Sander', '1932'), # should be {1932, 1976} - ('1938, printed 1940s-1950s', '1938'), # should be something like {1938, 194x-195x} +import pytest +from edtf.natlang.en import text_to_edtf - # for these to work we need to recast is_uncertain and is_approximate - # such that they work on different parts. Probably worth rolling our own - # dateparser at this point. - # ('July in about 1849', '1849~-07'), - # ('a day in July in about 1849', '1849~-07-uu'), - # ('a day in Spring in about 1849', '1849~-21-uu'), - # ('a day in about July? in about 1849', '1849~-07?~-uu'), - # ('a day in about Spring in about 1849', '1849~-21~-uu'), - # ('maybe January in some year in about the 1830s', '183u~-01?'), - # ('about July? 
in about 1849', '1849~-07?~'), +# TODO update the tests and code to test and output the new spec +# where examples are tuples, the second item is the normalised output +@pytest.mark.parametrize( + "input_text,expected_output", + [ + # Ignoring 'late' for simplicity in these examples + ("active late 17th-19th centuries", "16XX/18XX"), + ("active 17-19th Centuries", "16XX/18XX"), + # Unrecognised values + ("", None), + ("this isn't a date", None), + # Explicitly rejected values that would otherwise be badly converted + ("23rd Dynasty", None), + # Implied century and specific years + ("90", "1990"), # Implied century + ("1860", "1860"), + ("the year 1800", "1800"), + ("the year 1897", "1897"), + ("January 2008", "2008-01"), + ("January 12, 1940", "1940-01-12"), + # Uncertain or approximate dates + ("1860?", "1860?"), + ("1862 (uncertain)", "1862?"), + ("maybe 1862", "1862?"), + ("1862 maybe", "1862?"), + ("1862 guess", "1862?"), + ("uncertain: 1862", "1862?"), + ("uncertain: Jan 18 1862", "1862-01-18?"), + ("~ Feb 1812", "1812-02~"), + ("circa Feb 1812", "1812-02~"), + ("Feb 1812 approx", "1812-02~"), + ("c1860", "1860~"), # Different abbreviations + ("c.1860", "1860~"), # With or without . + ("ca1860", "1860~"), + ("ca.1860", "1860~"), + ("c 1860", "1860~"), # With or without space + ("c. 1860", "1860~"), + ("ca. 1860", "1860~"), + ("approx 1860", "1860~"), + ("1860 approx", "1860~"), + ("1860 approximately", "1860~"), + ("approximately 1860", "1860~"), + ("about 1860", "1860~"), + ("about Spring 1849", "1849-21~"), + ("notcirca 1860", "1860"), # Avoid words containing 'circa' + ( + "attica 1802", + "1802", + ), # Avoid false positive 'circa' at the end of preceding word + ("attic. 1802", "1802"), # Avoid false positive 'circa' + # Previously tested masked precision, uncertain or ambiguous masked precision + ("1860s", "186X"), + ("ca. 1860s", "186X~"), + ("c. 1860s", "186X~"), + ("Circa 1840s", "184X~"), + ("circa 1840s", "184X~"), + ("ca. 
1860s?", "186X%"), + ("uncertain: approx 1862", "1862%"), + ("1800s", "18XX"), + ("2000s", "20XX"), + ("c1900s", "190X~"), + ("c1800s?", "180X%"), + # Unspecified dates + ("January 12", "XXXX-01-12"), + ("January", "XXXX-01"), + ("10/7/2008", "2008-10-07"), + ("7/2008", "2008-07"), + # Seasons mapped to specific codes + ("Spring 1872", "1872-21"), + ("Summer 1872", "1872-22"), + ("Autumn 1872", "1872-23"), + ("Fall 1872", "1872-23"), + ("Winter 1872", "1872-24"), + # Dates relative to known events (before/after) + ("earlier than 1928", "/1928"), + ("before 1928", "/1928"), + ("after 1928", "1928/"), + ("later than 1928", "1928/"), + ("before January 1928", "/1928-01"), + ("before 18 January 1928", "/1928-01-18"), + # Approximations combined with before/after + ("before approx January 18 1928", "/1928-01-18~"), + ("before approx January 1928", "/1928-01~"), + ("after approx January 1928", "1928-01~/"), + ("after approx Summer 1928", "1928-22~/"), + # Before and after with uncertain / unspecified components + ("after about the 1920s", "192X~/"), + ("before about the 1900s", "/190X~"), + ("before the 1900s", "/19XX"), + # previous examples for masked precision, now removed from the EDTF spec + # use `X` for unknown regardless of precision or why the data is unknown + ("decade in 1800s", "18XX"), + ("decade somewhere during the 1800s", "18XX"), + ("year in the 1860s", "186X"), + ("year in the 1800s", "18XX"), + ("year in about the 1800s", "180X~"), + ("month in 1872", "1872-XX"), + ("day in Spring 1849", "1849-21-XX"), + ("day in January 1872", "1872-01-XX"), + ("day in 1872", "1872-XX-XX"), + ("birthday in 1872", "1872"), + # Handling centuries with approximation and uncertainty + ("1st century", "00XX"), + ("10c", "09XX"), + ("19th century", "18XX"), + ("19th century?", "18XX?"), + ("before 19th century", "/18XX"), + ("19c", "18XX"), + ("15c.", "14XX"), + ("ca. 
19c", "18XX~"), + ("~19c", "18XX~"), + ("about 19c", "18XX~"), + ("19c?", "18XX?"), + ("c.19c?", "18XX%"), + # BC/AD dating + ("1 AD", "0001"), + ("17 CE", "0017"), + ("127 CE", "0127"), + ("1270 CE", "1270"), + ("c1 AD", "0001~"), + ("c17 CE", "0017~"), + ("c127 CE", "0127~"), + ("c1270 CE", "1270~"), + ("c64 BCE", "-0064~"), + ("2nd century bc", "-01XX"), # -200 to -101 + ("2nd century bce", "-01XX"), + ("2nd century ad", "01XX"), + ("2nd century ce", "01XX"), + # Combining uncertainties and approximations in creative ways + ("a day in about Spring 1849?", "1849-21-XX%"), + # Simple date ranges, showcasing both the limitations and capabilities of the parser + # Not all of these results are correct EDTF, but this is as good as the EDTF implementation + # and simple natural language parser we have. + ("1851-1852", "1851/1852"), + ("1851-1852; printed 1853-1854", "1851/1852"), + ("1851-52", "1851/1852"), + ("1852 - 1860", "1852/1860"), + ("1856-ca. 1865", "1856/1865~"), + ("1857-mid 1860s", "1857/186X"), + ("1858/1860", "[1858, 1860]"), + ("1860s-1870s", "186X/187X"), + ("1910-30", "1910/1930"), + ("active 1910-30", "1910/1930"), + ("1861-67", "1861/1867"), + ("1861-67 (later print)", "1861/1867"), + ("1863 or 1864", "1863"), + ("1863, printed 1870", "1863"), + ("1863, printed ca. 1866", "1863"), + ("1864 or 1866", "1864"), + ("1864, printed ca. 1864", "1864"), + ("1864-1872, printed 1870s", "1864/1872"), + ("1868-1871?", "1868/1871?"), + ("1869-70", "1869/1870"), + ("1870s, printed ca. 
1880s", "187X"), + ("1900-1903, cast before 1929", "1900/1903"), + ("1900; 1973", "1900"), + ("1900; printed 1912", "1900"), + ("1915 late - autumn 1916", "1915/1916-23"), + ("1915, from Camerawork, October 1916", "1915"), # should be {1915, 1916-10} + ("1920s -early 1930s", "192X/193X"), + ( + "1930s, printed early 1960s", + "193X", + ), # should be something like {193x, 196x}, + ("1932, printed 1976 by Gunther Sander", "1932"), # should be {1932, 1976} + ( + "1938, printed 1940s-1950s", + "1938", + ), # should be something like {1938, 194x-195x} + ], ) - - -class TestLevel0(unittest.TestCase): - def test_natlang(self): - """ - For each of the examples, establish that: - - the unicode of the parsed object is acceptably equal to the EDTF string - - the parsed object is a subclass of EDTFObject - :return: - """ - for i, o in EXAMPLES: - e = text_to_edtf(i) - print("%s => %s" % (i, e)) - self.assertEqual(e, o) - - -if __name__ == '__main__': - unittest.main() +def test_natlang(input_text, expected_output): + """ + Test natural language conversion to EDTF format: + Verify that the conversion from text to EDTF format matches the expected output. 
+ """ + result = text_to_edtf(input_text) + assert ( + result == expected_output + ), f"Failed for input: {input_text} - expected {expected_output}, got {result}" + + +@pytest.mark.benchmark +@pytest.mark.parametrize( + "input_text,expected_output", + [ + ("23rd Dynasty", None), + ("January 2008", "2008-01"), + ("ca1860", "1860~"), + ("uncertain: approx 1862", "1862%"), + ("January", "XXXX-01"), + ("Winter 1872", "1872-24"), + ("before approx January 18 1928", "/1928-01-18~"), + ("birthday in 1872", "1872"), + ("1270 CE", "1270"), + ("2nd century bce", "-01XX"), + ("1858/1860", "[1858, 1860]"), + ], +) +def test_benchmark_natlang(benchmark, input_text, expected_output): + """ + Benchmark selected natural language conversions + """ + benchmark(text_to_edtf, input_text) diff --git a/edtf/parser/__init__.py b/edtf/parser/__init__.py index e5a0e5f..43197d5 100644 --- a/edtf/parser/__init__.py +++ b/edtf/parser/__init__.py @@ -1,2 +1,51 @@ -from edtf.parser.grammar import parse_edtf -from edtf.parser.parser_classes import * +from .edtf_exceptions import EDTFParseException +from .grammar import parse_edtf +from .parser_classes import ( + UA, + Consecutives, + Date, + DateAndTime, + EarlierConsecutives, + EDTFObject, + ExponentialYear, + Interval, + LaterConsecutives, + Level1Interval, + Level2Interval, + Level2Season, + LongYear, + MultipleDates, + OneOfASet, + PartialUncertainOrApproximate, + PartialUnspecified, + Season, + UncertainOrApproximate, + Unspecified, + UnspecifiedIntervalSection, +) + +__all__ = [ + "parse_edtf", + "EDTFParseException", + "EDTFObject", + "Date", + "DateAndTime", + "Interval", + "UA", + "UncertainOrApproximate", + "Unspecified", + "UnspecifiedIntervalSection", + "Level1Interval", + "LongYear", + "Season", + "PartialUncertainOrApproximate", + "PartialUnspecified", + "Consecutives", + "EarlierConsecutives", + "LaterConsecutives", + "OneOfASet", + "MultipleDates", + "Level2Interval", + "Level2Season", + "ExponentialYear", +] diff --git 
a/edtf/parser/edtf_exceptions.py b/edtf/parser/edtf_exceptions.py index 9530602..d906d58 100644 --- a/edtf/parser/edtf_exceptions.py +++ b/edtf/parser/edtf_exceptions.py @@ -2,4 +2,28 @@ class EDTFParseException(ParseException): - pass + """Raised when an input cannot be parsed as an EDTF string. + + Attributes: + input_string - the input string that could not be parsed + err -- the original ParseException that caused this one + """ + + def __init__(self, input_string, err=None): + if input_string is None: + input_string = "" + self.input_string = input_string + if err is None: + err = ParseException(input_string, 0, "Invalid input or format.") + self.err = err + super().__init__(str(err), err.loc if err.loc else 0, self.input_string) + + def __str__(self): + if not self.input_string: + return "You must supply some input text" + near_text = ( + self.input_string[max(self.err.loc - 10, 0) : self.err.loc + 10] + if hasattr(self.err, "loc") + else "" + ) + return f"Error at position {self.err.loc}: Invalid input or format near '{near_text}'. Please provide a valid EDTF string." 
diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index c028c6e..beabf52 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -1,27 +1,67 @@ -from pyparsing import Literal as L, ParseException, Optional, OneOrMore, \ - ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums +# ruff: noqa: E402 I001 + +# It's recommended to `enablePackrat()` immediately after importing pyparsing +# https://github.com/pyparsing/pyparsing/wiki/Performance-Tips + +import pyparsing +from edtf.appsettings import DEBUG_PYPARSING + +pyparsing.ParserElement.enablePackrat() + +from pyparsing import ( + Combine, + NotAny, + OneOrMore, + Opt, + Optional, + ParseException, + Regex, + Word, + ZeroOrMore, + nums, + oneOf, +) +from pyparsing import Literal as L -# (* ************************** Level 0 *************************** *) -from edtf.parser.parser_classes import Date, DateAndTime, Interval, Unspecified, \ - UncertainOrApproximate, Level1Interval, LongYear, Season, \ - PartialUncertainOrApproximate, UA, PartialUnspecified, OneOfASet, \ - Consecutives, EarlierConsecutives, LaterConsecutives, MultipleDates, \ - MaskedPrecision, Level2Interval, ExponentialYear from edtf.parser.edtf_exceptions import EDTFParseException -oneThru12 = oneOf(['%.2d' % i for i in range(1, 13)]) -oneThru13 = oneOf(['%.2d' % i for i in range(1, 14)]) -oneThru23 = oneOf(['%.2d' % i for i in range(1, 24)]) -zeroThru23 = oneOf(['%.2d' % i for i in range(0, 23)]) -oneThru29 = oneOf(['%.2d' % i for i in range(1, 30)]) -oneThru30 = oneOf(['%.2d' % i for i in range(1, 31)]) -oneThru31 = oneOf(['%.2d' % i for i in range(1, 32)]) -oneThru59 = oneOf(['%.2d' % i for i in range(1, 60)]) -zeroThru59 = oneOf(['%.2d' % i for i in range(0, 60)]) - -positiveDigit = Word(nums, exact=1, excludeChars='0') +# (* ************************** Level 0 *************************** *) +from edtf.parser.parser_classes import ( + UA, + Consecutives, + Date, + DateAndTime, + EarlierConsecutives, + ExponentialYear, + 
Interval, + LaterConsecutives, + Level1Interval, + Level2Interval, + Level2Season, + LongYear, + MultipleDates, + OneOfASet, + PartialUncertainOrApproximate, + PartialUnspecified, + Season, + UncertainOrApproximate, + Unspecified, +) + +oneThru12 = oneOf(["%.2d" % i for i in range(1, 13)]) +oneThru13 = oneOf(["%.2d" % i for i in range(1, 14)]) +oneThru23 = oneOf(["%.2d" % i for i in range(1, 24)]) +zeroThru23 = oneOf(["%.2d" % i for i in range(0, 24)]) +oneThru29 = oneOf(["%.2d" % i for i in range(1, 30)]) +oneThru30 = oneOf(["%.2d" % i for i in range(1, 31)]) +oneThru31 = oneOf(["%.2d" % i for i in range(1, 32)]) +oneThru59 = oneOf(["%.2d" % i for i in range(1, 60)]) +zeroThru59 = oneOf(["%.2d" % i for i in range(0, 60)]) + digit = Word(nums, exact=1) +positiveDigit = Word(nums, exact=1, excludeChars="0") +positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) second = zeroThru59 minute = zeroThru59 @@ -30,18 +70,23 @@ month = oneThru12("month") monthDay = ( - (oneOf("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) ^ - (oneOf("04 06 09 11")("month") + "-" + oneThru30("day")) ^ - (L("02")("month") + "-" + oneThru29("day")) + (oneOf("01 03 05 07 08 10 12")("month") + "-" + oneThru31("day")) + ^ (oneOf("04 06 09 11")("month") + "-" + oneThru30("day")) + ^ (L("02")("month") + "-" + oneThru29("day")) ) +# Significant digits suffix +significantDigits = "S" + Word(nums)("significant_digits") + # 4 digits, 0 to 9 positiveYear = Word(nums, exact=4) # Negative version of positive year, but "-0000" is illegal negativeYear = NotAny(L("-0000")) + ("-" + positiveYear) -year = Combine(positiveYear ^ negativeYear)("year") +year = Combine(positiveYear ^ negativeYear)("year") + Optional(significantDigits) +# simple version for Consecutives +year_basic = Combine(positiveYear ^ negativeYear)("year") yearMonth = year + "-" + month yearMonthDay = year + "-" + monthDay # o hai iso date @@ -51,11 +96,8 @@ zoneOffsetHour = oneThru13 zoneOffset = L("Z") ^ ( - Regex("[+-]") 
+ ( - zoneOffsetHour + Optional(":" + minute) ^ - L("14:00") ^ - ("00:" + oneThru59) - ) + Regex("[+-]") + + (zoneOffsetHour + Optional(":" + minute) ^ L("14:00") ^ ("00:" + oneThru59)) ) baseTime = Combine(hour + ":" + minute + ":" + second ^ "24:00:00") @@ -74,7 +116,7 @@ # (* ************************** Level 1 *************************** *) # (* ** Auxiliary Assignments for Level 1 ** *) -UASymbol = Combine(oneOf("? ~ ?~")) +UASymbol = Combine(oneOf("? ~ %")) UA.set_parser(UASymbol) seasonNumber = oneOf("21 22 23 24") @@ -87,177 +129,190 @@ # (* *** Long Year - Simple Form *** *) -longYearSimple = "y" + Combine( - Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit) -)("year") +longYearSimple = ( + "Y" + + Combine(Optional("-") + positiveDigit + digit + digit + digit + OneOrMore(digit))( + "year" + ) + + Optional(significantDigits) +) LongYear.set_parser(longYearSimple) # (* *** L1Interval *** *) uaDateOrSeason = dateOrSeason + Optional(UASymbol) -l1Start = uaDateOrSeason ^ "unknown" # bit of a kludge here to get the all the relevant tokens into the parse action # cleanly otherwise the parameter names are overlapped. def f(toks): try: - return {'date': toks[0], 'ua': toks[1]} + return {"date": toks[0], "ua": toks[1]} except IndexError: - return {'date': toks[0], 'ua': None} + return {"date": toks[0], "ua": None} +l1Start = ".." ^ uaDateOrSeason l1Start.addParseAction(f) -l1End = uaDateOrSeason ^ "unknown" ^ "open" +l1End = uaDateOrSeason ^ ".." 
l1End.addParseAction(f) -level1Interval = l1Start("lower") + "/" + l1End("upper") +level1Interval = Optional(l1Start)("lower") + "/" + l1End("upper") ^ l1Start( + "lower" +) + "/" + Optional(l1End("upper")) Level1Interval.set_parser(level1Interval) # (* *** unspecified *** *) -yearWithOneOrTwoUnspecifedDigits = Combine( - digit + digit + (digit ^ 'u') + 'u' +yearWithOneOrTwoOrThreeUnspecifedDigits = Combine( + Optional("-") + digit + (digit ^ "X") + (digit ^ "X") + "X" )("year") -monthUnspecified = year + "-" + L("uu")("month") -dayUnspecified = yearMonth + "-" + L("uu")("day") -dayAndMonthUnspecified = year + "-" + L("uu")("month") + "-" + L("uu")("day") - -unspecified = yearWithOneOrTwoUnspecifedDigits \ - ^ monthUnspecified \ - ^ dayUnspecified \ +monthUnspecified = year + "-" + L("XX")("month") +dayUnspecified = yearMonth + "-" + L("XX")("day") +dayAndMonthUnspecified = year + "-" + L("XX")("month") + "-" + L("XX")("day") + +unspecified = ( + yearWithOneOrTwoOrThreeUnspecifedDigits + ^ monthUnspecified + ^ dayUnspecified ^ dayAndMonthUnspecified +) + Optional(UASymbol)("ua") Unspecified.set_parser(unspecified) # (* *** uncertainOrApproxDate *** *) -uncertainOrApproxDate = date('date') + UASymbol("ua") +uncertainOrApproxDate = date("date") + UASymbol("ua") UncertainOrApproximate.set_parser(uncertainOrApproxDate) -level1Expression = uncertainOrApproxDate \ - ^ unspecified \ - ^ level1Interval \ - ^ longYearSimple \ - ^ season +level1Expression = ( + uncertainOrApproxDate ^ unspecified ^ level1Interval ^ longYearSimple ^ season +) # (* ************************** Level 2 *************************** *) # (* ** Internal Unspecified** *) -digitOrU = Word(nums + 'u', exact=1) - -# 2-digit day with at least one 'u' present -dayWithU = Combine( - ("u" + digitOrU) ^ - (digitOrU + 'u') -)("day") - -# 2-digit month with at least one 'u' present -monthWithU = Combine( - oneOf("0u 1u") ^ - ("u" + digitOrU) -)("month") - -# 4-digit year with at least one 'u' present -yearWithU 
= Combine( - ('u' + digitOrU + digitOrU + digitOrU) ^ - (digitOrU + 'u' + digitOrU + digitOrU) ^ - (digitOrU + digitOrU + 'u' + digitOrU) ^ - (digitOrU + digitOrU + digitOrU + 'u') +digitOrX = Word(nums + "X", exact=1) + +# 2-digit day with at least one 'X' present +dayWithX = Combine(("X" + digitOrX) ^ (digitOrX + "X"))("day") + +# 2-digit month with at least one 'X' present +monthWithX = Combine(oneOf("0X 1X") ^ ("X" + digitOrX))("month") + +# 4-digit year with at least one 'X' present +yearWithX = Combine( + ("X" + digitOrX + digitOrX + digitOrX) + ^ (digitOrX + "X" + digitOrX + digitOrX) + ^ (digitOrX + digitOrX + "X" + digitOrX) + ^ (digitOrX + digitOrX + digitOrX + "X") )("year") -yearMonthWithU = ( - (Combine(year("") ^ yearWithU(""))("year") + "-" + monthWithU) ^ - (yearWithU + "-" + month) +yearMonthWithX = (Combine(year("") ^ yearWithX(""))("year") + "-" + monthWithX) ^ ( + yearWithX + "-" + month ) -monthDayWithU = ( - (Combine(month("") ^ monthWithU(""))("month") + "-" + dayWithU) ^ - (monthWithU + "-" + day) +monthDayWithX = (Combine(month("") ^ monthWithX(""))("month") + "-" + dayWithX) ^ ( + monthWithX + "-" + day ) -yearMonthDayWithU = ( - (yearWithU + "-" + Combine(month("") ^ monthWithU(""))("month") + "-" + Combine(day("") ^ dayWithU(""))("day")) ^ - (year + "-" + monthWithU + "-" + Combine(day("") ^ dayWithU(""))("day")) ^ - (year + "-" + month + "-" + dayWithU) +yearMonthDayWithX = ( + ( + yearWithX + + "-" + + Combine(month("") ^ monthWithX(""))("month") + + "-" + + Combine(day("") ^ dayWithX(""))("day") + ) + ^ (year + "-" + monthWithX + "-" + Combine(day("") ^ dayWithX(""))("day")) + ^ (year + "-" + month + "-" + dayWithX) ) -partialUnspecified = yearWithU ^ yearMonthWithU ^ yearMonthDayWithU +partialUnspecified = yearWithX ^ yearMonthWithX ^ yearMonthDayWithX PartialUnspecified.set_parser(partialUnspecified) # (* ** Internal Uncertain or Approximate** *) -# this line is out of spec, but the given examples (e.g. 
'(2004)?-06-04~') -# appear to require it. -year_with_brackets = year ^ ("(" + year + ")") - -# second clause below needed Optional() around the "year_ua" UASymbol, for dates -# like '(2011)-06-04~' to work. +# group qualification +# qualifier right of a component(date, month, day) applies to all components to the left +group_qual = yearMonth + UASymbol("year_month_ua") + "-" + day ^ year + UASymbol( + "year_ua" +) + "-" + month + Opt("-" + day) + +# component qualification +# qualifier immediate left of a component (date, month, day) applies to that component only +qual_year = year ^ UASymbol("year_ua_b") + year ^ year + UASymbol("year_ua") +qual_month = month ^ UASymbol("month_ua") + month +qual_day = day ^ UASymbol("day_ua") + day + +indi_qual = ( + UASymbol("year_ua_b") + year + Opt("-" + qual_month + Opt("-" + qual_day)) + ^ qual_year + "-" + UASymbol("month_ua") + month + Opt("-" + qual_day) + ^ qual_year + "-" + qual_month + "-" + UASymbol("day_ua") + day +) -IUABase = \ - (year_with_brackets + UASymbol("year_ua") + "-" + month + Optional("-(" + day + ")" + UASymbol("day_ua"))) \ - ^ (year_with_brackets + Optional(UASymbol)("year_ua") + "-" + monthDay + Optional(UASymbol)("month_day_ua")) \ - ^ ( - year_with_brackets + Optional(UASymbol)("year_ua") + "-(" + month + ")" + UASymbol("month_ua") + - Optional("-(" + day + ")" + UASymbol("day_ua")) - ) \ - ^ ( - year_with_brackets + Optional(UASymbol)("year_ua") + "-(" + month + ")" + UASymbol("month_ua") + - Optional("-" + day) - ) \ - ^ (yearMonth + UASymbol("year_month_ua") + "-(" + day + ")" + UASymbol("day_ua")) \ - ^ (yearMonth + UASymbol("year_month_ua") + "-" + day) \ - ^ (yearMonth + "-(" + day + ")" + UASymbol("day_ua")) \ - ^ (year + "-(" + monthDay + ")" + UASymbol("month_day_ua")) \ - ^ (season("ssn") + UASymbol("season_ua")) - -partialUncertainOrApproximate = IUABase ^ ("(" + IUABase + ")" + UASymbol("all_ua")) +partialUncertainOrApproximate = group_qual ^ indi_qual 
PartialUncertainOrApproximate.set_parser(partialUncertainOrApproximate) -dateWithInternalUncertainty = partialUncertainOrApproximate \ - ^ partialUnspecified +dateWithInternalUncertainty = partialUncertainOrApproximate ^ partialUnspecified -qualifyingString = Regex(r'\S') # any nonwhitespace char +qualifyingString = Regex(r"\S") # any nonwhitespace char # (* ** SeasonQualified ** *) seasonQualifier = qualifyingString seasonQualified = season + "^" + seasonQualifier # (* ** Long Year - Scientific Form ** *) -positiveInteger = Combine(positiveDigit + ZeroOrMore(digit)) -longYearScientific = "y" + Combine(Optional("-") + positiveInteger)("base") + "e" + \ - positiveInteger("exponent") + Optional("p" + positiveInteger("precision")) +longYearScientific = ( + "Y" + + Combine(Optional("-") + positiveInteger)("base") + + "E" + + positiveInteger("exponent") + + Optional(significantDigits) +) ExponentialYear.set_parser(longYearScientific) # (* ** level2Interval ** *) -level2Interval = (dateOrSeason("lower") + "/" + dateWithInternalUncertainty("upper")) \ - ^ (dateWithInternalUncertainty("lower") + "/" + dateOrSeason("upper")) \ - ^ (dateWithInternalUncertainty("lower") + "/" + dateWithInternalUncertainty("upper")) +level2Interval = ( + (dateOrSeason("lower") + "/" + dateWithInternalUncertainty("upper")) + ^ (dateWithInternalUncertainty("lower") + "/" + dateOrSeason("upper")) + ^ ( + dateWithInternalUncertainty("lower") + + "/" + + dateWithInternalUncertainty("upper") + ) +) Level2Interval.set_parser(level2Interval) -# (* ** Masked precision ** *) -maskedPrecision = Combine(digit + digit + ((digit + "x") ^ "xx"))("year") -MaskedPrecision.set_parser(maskedPrecision) - # (* ** Inclusive list and choice list** *) -consecutives = (yearMonthDay("lower") + ".." + yearMonthDay("upper")) \ - ^ (yearMonth("lower") + ".." + yearMonth("upper")) \ - ^ (year("lower") + ".." + year("upper")) +consecutives = ( + (yearMonthDay("lower") + ".." 
+ yearMonthDay("upper")) + ^ (yearMonth("lower") + ".." + yearMonth("upper")) + ^ ( + year_basic("lower") + ".." + year_basic("upper") + ) # using year_basic because some tests were throwing `'list' object has no attribute 'expandtabs'` - somewhere, pyparsing.parse_string() was being passed a list +) Consecutives.set_parser(consecutives) -listElement = date \ - ^ dateWithInternalUncertainty \ - ^ uncertainOrApproxDate \ - ^ unspecified \ +listElement = ( + date + ^ dateWithInternalUncertainty + ^ uncertainOrApproxDate + ^ unspecified ^ consecutives +) + +earlier = L("..").addParseAction(f)("lower") + date("upper").addParseAction(f) +later = date("lower").addParseAction(f) + L("..").addParseAction(f)("upper") -earlier = ".." + date("upper") EarlierConsecutives.set_parser(earlier) -later = date("lower") + ".." LaterConsecutives.set_parser(later) -listContent = (earlier + ZeroOrMore("," + listElement)) \ - ^ (Optional(earlier + ",") + ZeroOrMore(listElement + ",") + later) \ - ^ (listElement + OneOrMore("," + listElement)) \ + +listContent = ( + (earlier + ZeroOrMore("," + listElement)) + ^ (Optional(earlier + ",") + ZeroOrMore(listElement + ",") + later) + ^ (listElement + OneOrMore("," + listElement)) ^ consecutives +) choiceList = "[" + listContent + "]" OneOfASet.set_parser(choiceList) @@ -265,27 +320,41 @@ def f(toks): inclusiveList = "{" + listContent + "}" MultipleDates.set_parser(inclusiveList) -level2Expression = partialUncertainOrApproximate \ - ^ partialUnspecified \ - ^ choiceList \ - ^ inclusiveList \ - ^ maskedPrecision \ - ^ level2Interval \ - ^ longYearScientific \ - ^ seasonQualified + +# (* *** L2 Season *** *) +seasonL2Number = oneOf("21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41") +l2season = year + "-" + seasonL2Number("season") +Level2Season.set_parser(l2season) + +level2Expression = ( + partialUncertainOrApproximate + ^ partialUnspecified + ^ choiceList + ^ inclusiveList + ^ level2Interval + ^ longYearScientific + ^ l2season + ^ 
seasonQualified +) # putting it all together -edtfParser = level0Expression("level0") ^ level1Expression("level1") ^ level2Expression("level2") +edtfParser = ( + level0Expression("level0") ^ level1Expression("level1") ^ level2Expression("level2") +) -def parse_edtf(str, parseAll=True, fail_silently=False): +def parse_edtf(input_string, parseAll=True, fail_silently=False, debug=None): + if debug is None: + debug = DEBUG_PYPARSING + if not input_string: + raise EDTFParseException(input_string) try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) + p = edtfParser.parseString(input_string.strip(), parseAll) if p: return p[0] - except ParseException as e: + except ParseException as err: if fail_silently: return None - raise EDTFParseException(e) + if debug: + raise + raise EDTFParseException(input_string, err) from None diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index b670296..ed03355 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -1,17 +1,22 @@ import calendar +import math import re -from time import struct_time from datetime import date, datetime from operator import add, sub +from time import struct_time from dateutil.relativedelta import relativedelta from edtf import appsettings -from edtf.convert import dt_to_struct_time, trim_struct_time, \ - TIME_EMPTY_TIME, TIME_EMPTY_EXTRAS +from edtf.convert import ( + TIME_EMPTY_EXTRAS, + TIME_EMPTY_TIME, + dt_to_struct_time, + trim_struct_time, +) -EARLIEST = 'earliest' -LATEST = 'latest' +EARLIEST = "earliest" +LATEST = "latest" PRECISION_MILLENIUM = "millenium" PRECISION_CENTURY = "century" @@ -80,14 +85,16 @@ def apply_delta(op, time_struct, delta): # Convert result year back to its original millenium final_year = dt_result.year - millenium_diff return struct_time( - (final_year,) + dt_result.timetuple()[1:6] + tuple(TIME_EMPTY_EXTRAS)) + (final_year,) + dt_result.timetuple()[1:6] 
+ tuple(TIME_EMPTY_EXTRAS) + ) -class EDTFObject(object): +class EDTFObject: """ - Object to attact to a parser to become instantiated when the parser + Object to attach to a parser to become instantiated when the parser completes. """ + parser = None @classmethod @@ -99,9 +106,9 @@ def set_parser(cls, p): def parse_action(cls, toks): kwargs = toks.asDict() try: - return cls(**kwargs) # replace the token list with the class + return cls(**kwargs) # replace the token list with the class except Exception as e: - print("trying to %s.__init__(**%s)" % (cls.__name__, kwargs)) + print(f"trying to {cls.__name__}.__init__(**{kwargs})") raise e @classmethod @@ -109,14 +116,11 @@ def parse(cls, s): return cls.parser.parseString(s)[0] def __repr__(self): - return "%s: '%s'" % (type(self).__name__, str(self)) + return f"{type(self).__name__}: '{str(self)}'" def __init__(self, *args, **kwargs): - str = "%s.__init__(*%s, **%s)" % ( - type(self).__name__, - args, kwargs, - ) - raise NotImplementedError("%s is not implemented." 
% str) + str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{str} is not implemented.") def __str__(self): raise NotImplementedError @@ -137,19 +141,31 @@ def _get_fuzzy_padding(self, lean): return relativedelta(0) def get_is_approximate(self): - return getattr(self, '_is_approximate', False) + return getattr(self, "_is_approximate", False) def set_is_approximate(self, val): self._is_approximate = val + is_approximate = property(get_is_approximate, set_is_approximate) def get_is_uncertain(self): - return getattr(self, '_is_uncertain', False) + return getattr(self, "_is_uncertain", False) def set_is_uncertain(self, val): self._is_uncertain = val + is_uncertain = property(get_is_uncertain, set_is_uncertain) + def get_is_uncertain_and_approximate(self): + return getattr(self, "_uncertain_and_approximate", False) + + def set_is_uncertain_and_approximate(self, val): + self._uncertain_and_approximate = val + + is_uncertain_and_approximate = property( + get_is_uncertain_and_approximate, set_is_uncertain_and_approximate + ) + def lower_fuzzy(self): strict_val = self.lower_strict() return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) @@ -183,7 +199,9 @@ def __gt__(self, other): return self.lower_strict() > dt_to_struct_time(other) elif isinstance(other, struct_time): return self.lower_strict() > trim_struct_time(other) - raise TypeError("can't compare %s with %s" % (type(self).__name__, type(other).__name__)) + raise TypeError( + f"can't compare {type(self).__name__} with {type(other).__name__}" + ) def __ge__(self, other): if isinstance(other, EDTFObject): @@ -192,7 +210,9 @@ def __ge__(self, other): return self.lower_strict() >= dt_to_struct_time(other) elif isinstance(other, struct_time): return self.lower_strict() >= trim_struct_time(other) - raise TypeError("can't compare %s with %s" % (type(self).__name__, type(other).__name__)) + raise TypeError( + f"can't compare {type(self).__name__} with 
{type(other).__name__}" + ) def __lt__(self, other): if isinstance(other, EDTFObject): @@ -201,7 +221,9 @@ def __lt__(self, other): return self.lower_strict() < dt_to_struct_time(other) elif isinstance(other, struct_time): return self.lower_strict() < trim_struct_time(other) - raise TypeError("can't compare %s with %s" % (type(self).__name__, type(other).__name__)) + raise TypeError( + f"can't compare {type(self).__name__} with {type(other).__name__}" + ) def __le__(self, other): if isinstance(other, EDTFObject): @@ -210,13 +232,15 @@ def __le__(self, other): return self.lower_strict() <= dt_to_struct_time(other) elif isinstance(other, struct_time): return self.lower_strict() <= trim_struct_time(other) - raise TypeError("can't compare %s with %s" % (type(self).__name__, type(other).__name__)) + raise TypeError( + f"can't compare {type(self).__name__} with {type(other).__name__}" + ) # (* ************************** Level 0 *************************** *) -class Date(EDTFObject): +class Date(EDTFObject): def set_year(self, y): if y is None: raise AttributeError("Year must not be None") @@ -224,33 +248,42 @@ def set_year(self, y): def get_year(self): return self._year + year = property(get_year, set_year) def set_month(self, m): self._month = m - if m == None: + if m is None: self.day = None def get_month(self): return self._month + month = property(get_month, set_month) - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ('date', 'lower', 'upper'): + def __init__( + self, year=None, month=None, day=None, significant_digits=None, **kwargs + ): + for param in ("date", "lower", "upper"): if param in kwargs: self.__init__(**kwargs[param]) return - self.year = year # Year is required, but sometimes passed in as a 'date' dict. + self.year = year # Year is required, but sometimes passed in as a 'date' dict. 
self.month = month self.day = day + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) def __str__(self): r = self.year if self.month: - r += "-%s" % self.month + r += f"-{self.month}" if self.day: - r += "-%s" % self.day + r += f"-{self.day}" + if self.significant_digits: + r += f"S{self.significant_digits}" return r def isoformat(self, default=date.max): @@ -260,24 +293,56 @@ def isoformat(self, default=date.max): int(self.day or default.day), ) + def lower_fuzzy(self): + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + sub, self.lower_strict(), self._get_fuzzy_padding(EARLIEST) + ) + else: + total_digits = len(self.year) + insignificant_digits = total_digits - self.significant_digits + lower_year = ( + int(self.year) + // (10**insignificant_digits) + * (10**insignificant_digits) + ) + return struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def upper_fuzzy(self): + if not hasattr(self, "significant_digits") or not self.significant_digits: + return apply_delta( + add, self.upper_strict(), self._get_fuzzy_padding(LATEST) + ) + else: + total_digits = len(self.year) + insignificant_digits = total_digits - self.significant_digits + upper_year = (int(self.year) // (10**insignificant_digits) + 1) * ( + 10**insignificant_digits + ) - 1 + return struct_time( + [upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS + ) + def _precise_year(self, lean): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: - return int(re.sub(r'[xu]', r'0', self.year)) + return int(re.sub(r"X", r"0", self.year)) else: - return int(re.sub(r'[xu]', r'9', self.year)) + return int(re.sub(r"X", r"9", self.year)) def _precise_month(self, lean): - if self.month and self.month != "uu": + if self.month and self.month != "XX": try: return int(self.month) - except ValueError as e: - raise ValueError("Couldn't convert %s to int (in %s)" % (self.month, 
self)) + except ValueError as err: + raise ValueError( + f"Couldn't convert {self.month} to int (in {self})" + ) from err else: return 1 if lean == EARLIEST else 12 def _precise_day(self, lean): - if not self.day or self.day == 'uu': + if not self.day or self.day == "XX": if lean == EARLIEST: return 1 else: @@ -296,7 +361,9 @@ def _strict_date(self, lean): self._precise_year(lean), self._precise_month(lean), self._precise_day(lean), - ) + tuple(TIME_EMPTY_TIME) + tuple(TIME_EMPTY_EXTRAS) + ) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) ) @property @@ -307,6 +374,9 @@ def precision(self): return PRECISION_MONTH return PRECISION_YEAR + def estimated(self): + return self._precise_year(EARLIEST) + class DateAndTime(EDTFObject): def __init__(self, date, time): @@ -327,14 +397,14 @@ def __eq__(self, other): return self.isoformat() == other.isoformat() elif isinstance(other, struct_time): return self._strict_date() == trim_struct_time(other) - return super(DateAndTime, self).__eq__(other) + return super().__eq__(other) def __ne__(self, other): if isinstance(other, datetime): return self.isoformat() != other.isoformat() elif isinstance(other, struct_time): return self._strict_date() != trim_struct_time(other) - return super(DateAndTime, self).__ne__(other) + return super().__ne__(other) class Interval(EDTFObject): @@ -343,30 +413,20 @@ def __init__(self, lower, upper): self.upper = upper def __str__(self): - return "%s/%s" % (self.lower, self.upper) + return f"{self.lower}/{self.upper}" def _strict_date(self, lean): if lean == EARLIEST: - try: - r = self.lower._strict_date(lean) - if r is None: - raise AttributeError - return r - except AttributeError: # it's a string, or no date. 
Result depends on the upper date - upper = self.upper._strict_date(LATEST) - return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) + r = self.lower._strict_date(lean) else: - try: - r = self.upper._strict_date(lean) - if r is None: - raise AttributeError - return r - except AttributeError: # an 'unknown' or 'open' string - depends on the lower date - if self.upper and (self.upper == "open" or self.upper.date == "open"): - return dt_to_struct_time(date.today()) # it's still happening - else: - lower = self.lower._strict_date(EARLIEST) - return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) + r = self.upper._strict_date(lean) + return r + + @property + def precision(self): + if self.lower.precision == self.upper.precision: + return self.lower.precision + return None # (* ************************** Level 1 *************************** *) @@ -379,11 +439,13 @@ def parse_action(cls, toks): return cls(*args) def __init__(self, *args): - assert len(args)==1 + if len(args) != 1: + raise AssertionError("UA must have exactly one argument") ua = args[0] self.is_uncertain = "?" in ua self.is_approximate = "~" in ua + self.is_uncertain_and_approximate = "%" in ua def __str__(self): d = "" @@ -391,10 +453,12 @@ def __str__(self): d += "?" 
if self.is_approximate: d += "~" + if self.is_uncertain_and_approximate: + d += "%" return d def _get_multiplier(self): - if self.is_uncertain and self.is_approximate: + if self.is_uncertain_and_approximate: return appsettings.MULTIPLIER_IF_BOTH elif self.is_uncertain: return appsettings.MULTIPLIER_IF_UNCERTAIN @@ -406,41 +470,261 @@ class UncertainOrApproximate(EDTFObject): def __init__(self, date, ua): self.date = date self.ua = ua + self.is_uncertain = ua.is_uncertain if ua else False + self.is_approximate = ua.is_approximate if ua else False + self.is_uncertain_and_approximate = ( + ua.is_uncertain_and_approximate if ua else False + ) def __str__(self): if self.ua: - return "%s%s" % (self.date, self.ua) + return f"{self.date}{self.ua}" else: return str(self.date) def _strict_date(self, lean): - if self.date == "open": - return dt_to_struct_time(date.today()) - if self.date =="unknown": - return None # depends on the other date return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): if not self.ua: - return relativedelta(0) + return relativedelta() multiplier = self.ua._get_multiplier() + padding = relativedelta() + + # Check the presence of uncertainty on each component + # self.precision not helpful here: + # L1 qualified EDTF dates apply qualification across all parts of the date + if self.date.year: + padding += relativedelta( + years=int(multiplier * appsettings.PADDING_YEAR_PRECISION.years) + ) + if self.date.month: + padding += relativedelta( + months=int(multiplier * appsettings.PADDING_MONTH_PRECISION.months) + ) + if self.date.day: + padding += relativedelta( + days=int(multiplier * appsettings.PADDING_DAY_PRECISION.days) + ) + + return padding + + +class UnspecifiedIntervalSection(EDTFObject): + def __init__(self, sectionOpen=False, other_section_element=None): + if sectionOpen: + self.is_open = True + self.is_unknown = False + else: + self.is_open = False + self.is_unknown = True + self.other = other_section_element - if 
self.date.precision == PRECISION_DAY: - return multiplier * appsettings.PADDING_DAY_PRECISION - elif self.date.precision == PRECISION_MONTH: - return multiplier * appsettings.PADDING_MONTH_PRECISION - elif self.date.precision == PRECISION_YEAR: - return multiplier * appsettings.PADDING_YEAR_PRECISION + def __str__(self): + if self.is_unknown: + return "" + else: + return ".." + + def _strict_date(self, lean): + if lean == EARLIEST: + if self.is_unknown: + upper = self.other._strict_date(LATEST) + return apply_delta(sub, upper, appsettings.DELTA_IF_UNKNOWN) + else: + return -math.inf + else: + if self.is_unknown: + lower = self.other._strict_date(EARLIEST) + return apply_delta(add, lower, appsettings.DELTA_IF_UNKNOWN) + else: + return math.inf + + @property + def precision(self): + return self.other.date.precision or PRECISION_YEAR class Unspecified(Date): - pass + def __init__( + self, + year=None, + month=None, + day=None, + significant_digits=None, + ua=None, + **kwargs, + ): + super().__init__( + year=year, + month=month, + day=day, + significant_digits=significant_digits, + **kwargs, + ) + self.ua = ua + self.is_uncertain = ua.is_uncertain if ua else False + self.is_approximate = ua.is_approximate if ua else False + self.is_uncertain_and_approximate = ( + ua.is_uncertain_and_approximate if ua else False + ) + self.negative = self.year.startswith("-") + + def __str__(self): + base = super().__str__() + if self.ua: + base += str(self.ua) + return base + + def _get_fuzzy_padding(self, lean): + if not self.ua: + return relativedelta() + multiplier = self.ua._get_multiplier() + padding = relativedelta() + + if self.year: + years_padding = self._years_padding(multiplier) + padding += years_padding + if self.month: + padding += relativedelta( + months=int(multiplier * appsettings.PADDING_MONTH_PRECISION.months) + ) + if self.day: + padding += relativedelta( + days=int(multiplier * appsettings.PADDING_DAY_PRECISION.days) + ) + return padding + + def 
_years_padding(self, multiplier): + """Calculate year padding based on the precision.""" + precision_settings = { + PRECISION_MILLENIUM: appsettings.PADDING_MILLENNIUM_PRECISION.years, + PRECISION_CENTURY: appsettings.PADDING_CENTURY_PRECISION.years, + PRECISION_DECADE: appsettings.PADDING_DECADE_PRECISION.years, + PRECISION_YEAR: appsettings.PADDING_YEAR_PRECISION.years, + } + years = precision_settings.get(self.precision, 0) + return relativedelta(years=int(multiplier * years)) + + def lower_fuzzy(self): + strict_val = ( + self.lower_strict() + ) # negative handled in the lower_strict() override + adjusted = apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + return adjusted + + def upper_fuzzy(self): + strict_val = ( + self.upper_strict() + ) # negative handled in the upper_strict() override + + adjusted = apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + return adjusted + + def lower_strict(self): + if self.negative: + strict_val = self._strict_date( + lean=LATEST + ) # gets the year right, but need to adjust day and month + if self.precision in ( + PRECISION_YEAR, + PRECISION_DECADE, + PRECISION_CENTURY, + PRECISION_MILLENIUM, + ): + return struct_time( + (strict_val.tm_year, 1, 1) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + elif self.precision == PRECISION_MONTH: + return struct_time( + (strict_val.tm_year, strict_val.tm_mon, 1) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + else: + return strict_val + else: + return self._strict_date(lean=EARLIEST) + + def upper_strict(self): + if self.negative: + strict_val = self._strict_date(lean=EARLIEST) + if self.precision in ( + PRECISION_YEAR, + PRECISION_DECADE, + PRECISION_CENTURY, + PRECISION_MILLENIUM, + ): + return struct_time( + (strict_val.tm_year, 12, 31) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + elif self.precision == PRECISION_MONTH: + days_in_month = calendar.monthrange( + strict_val.tm_year, strict_val.tm_mon + )[1] + return 
struct_time( + (strict_val.tm_year, strict_val.tm_mon, days_in_month) + + tuple(TIME_EMPTY_TIME) + + tuple(TIME_EMPTY_EXTRAS) + ) + else: + return strict_val + else: + return self._strict_date(lean=LATEST) + + @property + def precision(self): + if self.day: + return PRECISION_DAY + if self.month: + return PRECISION_MONTH + if self.year: + year_no_symbol = self.year.lstrip("-") + if year_no_symbol.isdigit(): + return PRECISION_YEAR + if len(year_no_symbol) == 4 and year_no_symbol.endswith("XXX"): + return PRECISION_MILLENIUM + if len(year_no_symbol) == 4 and year_no_symbol.endswith("XX"): + return PRECISION_CENTURY + if len(year_no_symbol) == 4 and year_no_symbol.endswith("X"): + return PRECISION_DECADE + raise ValueError(f"Unspecified date {self} has no precision") class Level1Interval(Interval): - def __init__(self, lower, upper): - self.lower = UncertainOrApproximate(**lower) - self.upper = UncertainOrApproximate(**upper) + def __init__(self, lower=None, upper=None): + if lower: + if lower["date"] == "..": + self.lower = UnspecifiedIntervalSection( + True, UncertainOrApproximate(**upper) + ) + else: + self.lower = UncertainOrApproximate(**lower) + else: + self.lower = UnspecifiedIntervalSection( + False, UncertainOrApproximate(**upper) + ) + if upper: + if upper["date"] == "..": + self.upper = UnspecifiedIntervalSection( + True, UncertainOrApproximate(**lower) + ) + else: + self.upper = UncertainOrApproximate(**upper) + else: + self.upper = UnspecifiedIntervalSection( + False, UncertainOrApproximate(**lower) + ) + self.is_approximate = self.lower.is_approximate or self.upper.is_approximate + self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain + self.is_uncertain_and_approximate = ( + self.lower.is_uncertain_and_approximate + or self.upper.is_uncertain_and_approximate + ) def _get_fuzzy_padding(self, lean): if lean == EARLIEST: @@ -450,11 +734,17 @@ def _get_fuzzy_padding(self, lean): class LongYear(EDTFObject): - def __init__(self, year): + def 
__init__(self, year, significant_digits=None): self.year = year + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) def __str__(self): - return "y%s" % self.year + if self.significant_digits: + return f"Y{self.year}S{self.significant_digits}" + else: + return f"Y{self.year}" def _precise_year(self): return int(self.year) @@ -462,26 +752,63 @@ def _precise_year(self): def _strict_date(self, lean): py = self._precise_year() if lean == EARLIEST: - return struct_time( - [py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + return struct_time([py, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) else: - return struct_time( - [py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + return struct_time([py, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + def estimated(self): + return self._precise_year() + + def lower_fuzzy(self): + full_year = self._precise_year() + strict_val = self.lower_strict() + if not self.significant_digits: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + else: + insignificant_digits = len(str(full_year)) - int(self.significant_digits) + if insignificant_digits <= 0: + return apply_delta(sub, strict_val, self._get_fuzzy_padding(EARLIEST)) + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + lower_year = sig_digits * padding_value + return apply_delta( + sub, + struct_time([lower_year, 1, 1] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(EARLIEST), + ) + + def upper_fuzzy(self): + full_year = self._precise_year() + strict_val = self.upper_strict() + if not self.significant_digits: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + else: + insignificant_digits = len(str(full_year)) - self.significant_digits + if insignificant_digits <= 0: + return apply_delta(add, strict_val, self._get_fuzzy_padding(LATEST)) + padding_value = 10**insignificant_digits + sig_digits = full_year // padding_value + upper_year = 
(sig_digits + 1) * padding_value - 1 + return apply_delta( + add, + struct_time([upper_year, 12, 31] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS), + self._get_fuzzy_padding(LATEST), + ) class Season(Date): def __init__(self, year, season, **kwargs): self.year = year - self.season = season # use season to look up month + self.season = season # use season to look up month # day isn't part of the 'season' spec, but it helps the inherited # `Date` methods do their thing. self.day = None def __str__(self): - return "%s-%s" % (self.year, self.season) + return f"{self.year}-{self.season}" def _precise_month(self, lean): - rng = appsettings.SEASON_MONTHS_RANGE[int(self.season)] + rng = appsettings.SEASON_L2_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] else: @@ -492,22 +819,32 @@ def _precise_month(self, lean): class PartialUncertainOrApproximate(Date): - - def set_year(self, y): # Year can be None. + def set_year(self, y): # Year can be None. self._year = y + year = property(Date.get_year, set_year) def __init__( - self, year=None, month=None, day=None, - year_ua=False, month_ua = False, day_ua = False, - year_month_ua = False, month_day_ua = False, - ssn=None, season_ua=False, all_ua=False + self, + year=None, + month=None, + day=None, + year_ua=False, + month_ua=False, + day_ua=False, + year_month_ua=False, + month_day_ua=False, + ssn=None, + season_ua=False, + all_ua=False, + year_ua_b=False, ): self.year = year self.month = month self.day = day self.year_ua = year_ua + self.year_ua_b = year_ua_b self.month_ua = month_ua self.day_ua = day_ua @@ -519,65 +856,73 @@ def __init__( self.all_ua = all_ua - def __str__(self): + uas = [ + year_ua, + month_ua, + day_ua, + year_month_ua, + month_day_ua, + season_ua, + all_ua, + ] + self.is_uncertain = any( + item.is_uncertain for item in uas if hasattr(item, "is_uncertain") + ) + self.is_approximate = any( + item.is_approximate for item in uas if hasattr(item, "is_approximate") + ) + 
self.is_uncertain_and_approximate = any( + item.is_uncertain_and_approximate + for item in uas + if hasattr(item, "is_uncertain_and_approximate") + ) + def __str__(self): if self.season_ua: - return "%s%s" % (self.season, self.season_ua) + return f"{self.season}{self.season_ua}" if self.year_ua: - y = "%s%s" % (self.year, self.year_ua) + y = f"{self.year}{self.year_ua}" else: - y = str(self.year) + y = f"{self.year_ua_b}{self.year}" if self.year_ua_b else str(self.year) - if self.month_ua: - m = "(%s)%s" % (self.month, self.month_ua) - else: - m = str(self.month) + m = f"{self.month_ua}{self.month}" if self.month_ua else str(self.month) if self.day: - if self.day_ua: - d = "(%s)%s" % (self.day, self.day_ua) - else: - d = str(self.day) + d = f"{self.day_ua}{self.day}" if self.day_ua else str(self.day) else: d = None - if self.year_month_ua: # year/month approximate. No brackets needed. - ym = "%s-%s%s" % (y, m, self.year_month_ua) - if d: - result = "%s-%s" % (ym, d) - else: - result = ym + if self.year_month_ua: # year/month approximate. No brackets needed. 
+ ym = f"{y}-{m}{self.year_month_ua}" + result = f"{ym}-{d}" if d else ym elif self.month_day_ua: - if self.year_ua: # we don't need the brackets round month and day - result = "%s-%s-%s%s" % (y, m, d, self.month_day_ua) + if self.year_ua: # we don't need the brackets round month and day + result = f"{y}-{m}-{d}{self.month_day_ua}" else: - result = "%s-(%s-%s)%s" % (y, m, d, self.month_day_ua) + result = f"{y}-({m}-{d}){self.month_day_ua}" else: - if d: - result = "%s-%s-%s" % (y, m, d) - else: - result = "%s-%s" % (y, m) + result = f"{y}-{m}-{d}" if d else f"{y}-{m}" if self.all_ua: - result = "(%s)%s" % (result, self.all_ua) + result = f"({result}){self.all_ua}" return result def _precise_year(self, lean): if self.season: return self.season._precise_year(lean) - return super(PartialUncertainOrApproximate, self)._precise_year(lean) + return super()._precise_year(lean) def _precise_month(self, lean): if self.season: return self.season._precise_month(lean) - return super(PartialUncertainOrApproximate, self)._precise_month(lean) + return super()._precise_month(lean) def _precise_day(self, lean): if self.season: return self.season._precise_day(lean) - return super(PartialUncertainOrApproximate, self)._precise_day(lean) + return super()._precise_day(lean) def _get_fuzzy_padding(self, lean): """ @@ -588,21 +933,42 @@ def _get_fuzzy_padding(self, lean): result = relativedelta(0) if self.year_ua: - result += appsettings.PADDING_YEAR_PRECISION * self.year_ua._get_multiplier() + result += ( + appsettings.PADDING_YEAR_PRECISION * self.year_ua._get_multiplier() + ) + if self.year_ua_b: + result += ( + appsettings.PADDING_YEAR_PRECISION * self.year_ua_b._get_multiplier() + ) if self.month_ua: - result += appsettings.PADDING_MONTH_PRECISION * self.month_ua._get_multiplier() + result += ( + appsettings.PADDING_MONTH_PRECISION * self.month_ua._get_multiplier() + ) if self.day_ua: result += appsettings.PADDING_DAY_PRECISION * self.day_ua._get_multiplier() if self.year_month_ua: - 
result += appsettings.PADDING_YEAR_PRECISION * self.year_month_ua._get_multiplier() - result += appsettings.PADDING_MONTH_PRECISION * self.year_month_ua._get_multiplier() + result += ( + appsettings.PADDING_YEAR_PRECISION + * self.year_month_ua._get_multiplier() + ) + result += ( + appsettings.PADDING_MONTH_PRECISION + * self.year_month_ua._get_multiplier() + ) if self.month_day_ua: - result += appsettings.PADDING_DAY_PRECISION * self.month_day_ua._get_multiplier() - result += appsettings.PADDING_MONTH_PRECISION * self.month_day_ua._get_multiplier() + result += ( + appsettings.PADDING_DAY_PRECISION * self.month_day_ua._get_multiplier() + ) + result += ( + appsettings.PADDING_MONTH_PRECISION + * self.month_day_ua._get_multiplier() + ) if self.season_ua: - result += appsettings.PADDING_SEASON_PRECISION * self.season_ua._get_multiplier() + result += ( + appsettings.PADDING_SEASON_PRECISION * self.season_ua._get_multiplier() + ) if self.all_ua: multiplier = self.all_ua._get_multiplier() @@ -638,15 +1004,17 @@ def __init__(self, lower=None, upper=None): self.upper = upper def __str__(self): - return "%s..%s" % (self.lower or '', self.upper or '') + return "{}..{}".format(self.lower or "", self.upper or "") -class EarlierConsecutives(Consecutives): - pass +class EarlierConsecutives(Level1Interval): + def __str__(self): + return f"{self.lower}{self.upper}" -class LaterConsecutives(Consecutives): - pass +class LaterConsecutives(Level1Interval): + def __str__(self): + return f"{self.lower}{self.upper}" class OneOfASet(EDTFObject): @@ -659,13 +1027,27 @@ def __init__(self, *args): self.objects = args def __str__(self): - return "[%s]" % (", ".join([str(o) for o in self.objects])) + return "[{}]".format(", ".join([str(o) for o in self.objects])) def _strict_date(self, lean): + strict_dates = [x._strict_date(lean) for x in self.objects] + # Accounting for possible 'inf' and '-inf' values if lean == LATEST: - return max([x._strict_date(lean) for x in self.objects]) + if 
any(isinstance(d, float) and d == float("inf") for d in strict_dates): + return float("inf") + else: + return max( + (d for d in strict_dates if not isinstance(d, float)), + default=float("inf"), + ) else: - return min([x._strict_date(lean) for x in self.objects]) + if any(isinstance(d, float) and d == float("-inf") for d in strict_dates): + return float("-inf") + else: + return min( + (d for d in strict_dates if not isinstance(d, float)), + default=float("-inf"), + ) class MultipleDates(EDTFObject): @@ -678,7 +1060,7 @@ def __init__(self, *args): self.objects = args def __str__(self): - return "{%s}" % (", ".join([str(o) for o in self.objects])) + return "{{{}}}".format(", ".join([str(o) for o in self.objects])) def _strict_date(self, lean): if lean == LATEST: @@ -687,10 +1069,6 @@ def _strict_date(self, lean): return min([x._strict_date(lean) for x in self.objects]) -class MaskedPrecision(Date): - pass - - class Level2Interval(Level1Interval): def __init__(self, lower, upper): # Check whether incoming lower/upper values are single-item lists, and @@ -705,20 +1083,36 @@ def __init__(self, lower, upper): self.upper = upper[0] else: self.upper = upper + self.is_approximate = self.lower.is_approximate or self.upper.is_approximate + self.is_uncertain = self.lower.is_uncertain or self.upper.is_uncertain + self.is_uncertain_and_approximate = ( + self.lower.is_uncertain_and_approximate + or self.upper.is_uncertain_and_approximate + ) + + +class Level2Season(Season): + pass class ExponentialYear(LongYear): - def __init__(self, base, exponent, precision=None): + def __init__(self, base, exponent, significant_digits=None): self.base = base self.exponent = exponent - self.precision = precision + self.significant_digits = ( + int(significant_digits) if significant_digits else None + ) def _precise_year(self): return int(self.base) * 10 ** int(self.exponent) def get_year(self): - if self.precision: - return '%se%sp%s' % (self.base, self.exponent, self.precision) + if 
self.significant_digits: + return f"{self.base}E{self.exponent}S{self.significant_digits}" else: - return '%se%s' % (self.base, self.exponent) + return f"{self.base}E{self.exponent}" + year = property(get_year) + + def estimated(self): + return self._precise_year() diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index f9dde42..c2dd711 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -1,21 +1,21 @@ -import unittest -import sys +# ruff: noqa: S101 # Asserts are ok in tests + from datetime import date from time import struct_time -from edtf.parser.grammar import parse_edtf as parse -from edtf.parser.parser_classes import EDTFObject, TIME_EMPTY_TIME, \ - TIME_EMPTY_EXTRAS +import pytest + from edtf.parser.edtf_exceptions import EDTFParseException +from edtf.parser.grammar import parse_edtf as parse +from edtf.parser.parser_classes import TIME_EMPTY_EXTRAS, TIME_EMPTY_TIME, EDTFObject -# Example object types and attributes. -# the first item in each tuple is the input EDTF string, and expected parse result. -# where the first value is a tuple, the second item is the normalised parse result. +# Example object types and attributes represented as tuples. +# The first item in each tuple is the input EDTF string, and expected parse result. +# where the first value is a tuple, the second item is a tuple of the normalised parse result. # -# The rest of the values in each tuple indicate the iso versions of the derived -# Python ``date``s. -# - If there's one other value, all the derived dates should be the same. -# - If there're two other values, then all the lower values should be the same +# The values in the second tuple indicate the iso versions of the derived Python `date`s. +# - If there is one other value, all the derived dates should be the same. +# - If there are two other values, then all the lower values should be the same # and all the upper values should be the same. 
# - If there are three other values, then the upper and lower ``_strict`` values # should be the first value, and the upper and lower ``_fuzzy`` values should be @@ -26,276 +26,389 @@ EXAMPLES = ( # ******************************* LEVEL 0 ********************************* # year, month, day - ('2001-02-03', '2001-02-03'), + ("2001-02-03", ("2001-02-03",)), # year, month - ('2008-12', '2008-12-01', '2008-12-31'), + ("2008-12", ("2008-12-01", "2008-12-31")), # year - ('2008', '2008-01-01', '2008-12-31'), + ("2008", ("2008-01-01", "2008-12-31")), # a negative year - ('-0999', '-0999-01-01', '-0999-12-31'), + ("-0999", ("-0999-01-01", "-0999-12-31")), # year zero - ('0000', '0000-01-01', '0000-12-31'), + ("0000", ("0000-01-01", "0000-12-31")), # DateTimes - ('2001-02-03T09:30:01', '2001-02-03'), - ('2004-01-01T10:10:10Z', '2004-01-01'), - ('2004-01-01T10:10:10+05:00', '2004-01-01'), + ("2001-02-03T09:30:01", ("2001-02-03",)), + ("2004-01-01T10:10:10Z", ("2004-01-01",)), + ("2004-01-01T10:10:10+05:00", ("2004-01-01",)), + ("1985-04-12T23:20:30", ("1985-04-12",)), + # Intervals # An interval beginning sometime in 1964 and ending sometime in 2008. Year precision. - ('1964/2008', '1964-01-01', '2008-12-31'), + ("1964/2008", ("1964-01-01", "2008-12-31")), # An interval beginning sometime in June 2004 and ending sometime in August of 2006. Month precision. - ('2004-06/2006-08', '2004-06-01', '2006-08-31'), + ("2004-06/2006-08", ("2004-06-01", "2006-08-31")), # An interval beginning sometime on February 1, 2004 and ending sometime on February 8, 2005. Day precision. - ('2004-02-01/2005-02-08', '2004-02-01', '2005-02-08'), - # An interval beginning sometime on February 1, 2004 and ending sometime in February 2005. The precision of the interval is not defined; the start endpoint has day precision and the end endpoint has month precision. - ('2004-02-01/2005-02', '2004-02-01', '2005-02-28'), - # An interval beginning sometime on February 1, 2004 and ending sometime in 2005. 
The start endpoint has day precision and the end endpoint has year precision. - ('2004-02-01/2005', '2004-02-01', '2005-12-31'), + ("2004-02-01/2005-02-08", ("2004-02-01", "2005-02-08")), + # An interval beginning sometime on February 1, 2004 and ending sometime in February 2005. + # The precision of the interval is not defined; the start endpoint has day precision and the end endpoint has month precision. + ("2004-02-01/2005-02", ("2004-02-01", "2005-02-28")), + # An interval beginning sometime on February 1, 2004 and ending sometime in 2005. + # The start endpoint has day precision and the end endpoint has year precision. + ("2004-02-01/2005", ("2004-02-01", "2005-12-31")), # An interval beginning sometime in 2005 and ending sometime in February 2006. - ('2005/2006-02', '2005-01-01', '2006-02-28'), - + ("2005/2006-02", ("2005-01-01", "2006-02-28")), + # An interval beginning sometime in -2005 and ending sometime in February -2004. + ("-2005/-1999-02", ("-2005-01-01", "-1999-02-28")), # ******************************* LEVEL 1 ********************************* # Uncertain/Approximate # uncertain: possibly the year 1984, but not definitely - ('1984?', '1984-01-01', '1984-12-31', '1983-01-01', '1985-12-31'), - ('2004-06-11?', '2004-06-11', '2004-06-11', '2004-06-10', '2004-06-12'), - ('2004-06?', '2004-06-01', '2004-06-30', '2004-05-01', '2004-07-30'), + ("1984?", ("1984-01-01", "1984-12-31", "1983-01-01", "1985-12-31")), + ( + "2004-06-11?", + ("2004-06-11", "2003-05-10", "2005-07-12"), + ), # everything is fuzzy by 100% for "qualification of a date (complete)" (L1) + ("2004-06?", ("2004-06-01", "2004-06-30", "2003-05-01", "2005-07-30")), # "approximately" the year 1984 - ('1984~', '1984-01-01', '1984-12-31', '1983-01-01', '1985-12-31'), + ("1984~", ("1984-01-01", "1984-12-31", "1983-01-01", "1985-12-31")), # the year is approximately 1984 and even that is uncertain - ('1984?~', '1984-01-01', '1984-12-31', '1982-01-01', '1986-12-31'), + ("1984%", ("1984-01-01", 
"1984-12-31", "1982-01-01", "1986-12-31")), # Unspecified # some unspecified year in the 1990s. - ('199u', '1990-01-01', '1999-12-31'), + ("199X", ("1990-01-01", "1999-12-31")), # some unspecified year in the 1900s. - ('19uu', '1900-01-01', '1999-12-31'), + ("19XX", ("1900-01-01", "1999-12-31")), # some month in 1999 - ('1999-uu', '1999-01-01', '1999-12-31'), + ("1999-XX", ("1999-01-01", "1999-12-31")), # some day in January 1999 - ('1999-01-uu', '1999-01-01', '1999-01-31'), + ("1999-01-XX", ("1999-01-01", "1999-01-31")), # some day in 1999 - ('1999-uu-uu', '1999-01-01', '1999-12-31'), - + ("1999-XX-XX", ("1999-01-01", "1999-12-31")), + # negative unspecified year + ("-01XX", ("-0199-01-01", "-0100-12-31")), # Uncertain/Approximate lower boundary dates (BCE) - ('-0275~', '-0275-01-01', '-0275-12-31', '-0276-01-01', '-0274-12-31'), - ('-0001~', '-0001-01-01', '-0001-12-31', '-0002-01-01', '0000-12-31'), - ('0000~', '0000-01-01', '0000-12-31', '-0001-01-01', '0001-12-31'), - + ("-0275~", ("-0275-01-01", "-0275-12-31", "-0276-01-01", "-0274-12-31")), + ("-0001~", ("-0001-01-01", "-0001-12-31", "-0002-01-01", "0000-12-31")), + ("0000~", ("0000-01-01", "0000-12-31", "-0001-01-01", "0001-12-31")), + # Unspecified and qualified + # "circa 17th century" + ("16XX~", ("1600-01-01", "1699-12-31", "1500-01-01", "1799-12-31")), + ("16XX%", ("1600-01-01", "1699-12-31", "1400-01-01", "1899-12-31")), + ("1XXX", ("1000-01-01", "1999-12-31")), + ("1XXX~", ("1000-01-01", "1999-12-31", "0000-01-01", "2999-12-31")), + ("156X~", ("1560-01-01", "1569-12-31", "1550-01-01", "1579-12-31")), + ("-01XX~", ("-0199-01-01", "-0100-12-31", "-0299-01-01", "0000-12-31")), # L1 Extended Interval # beginning unknown, end 2006 - ('unknown/2006', '1996-12-31', '2006-12-31'), + # for intervals with an unknown beginning or end, the unknown bound is calculated with the constant DELTA_IF_UNKNOWN (10 years) + ("/2006", ("1996-12-31", "2006-12-31")), # beginning June 1, 2004, end unknown - 
('2004-06-01/unknown', '2004-06-01', '2014-06-01'), - # beginning January 1 2004 with no end date - ('2004-01-01/open', '2004-01-01', date.today().isoformat()), + ("2004-06-01/", ("2004-06-01", "2014-06-01")), + # beginning open, end 2006 + ("../2006", ("-inf", "2006-12-31")), + # beginning January 1, 2004 with no end date + ("2004-01-01/..", ("2004-01-01", "inf")), # interval beginning approximately 1984 and ending June 2004 - ('1984~/2004-06', '1984-01-01', '2004-06-30', '1983-01-01', '2004-06-30'), + ("1984~/2004-06", ("1984-01-01", "2004-06-30", "1983-01-01", "2004-06-30")), # interval beginning 1984 and ending approximately June 2004 - ('1984/2004-06~', '1984-01-01', '2004-06-30', '1984-01-01', '2004-07-30'), - ('1984?/2004?~', '1984-01-01', '2004-12-31', '1983-01-01', '2006-12-31'), - ('1984~/2004~', '1984-01-01', '2004-12-31', '1983-01-01', '2005-12-31'), + ("1984/2004-06~", ("1984-01-01", "2004-06-30", "1984-01-01", "2005-07-30")), + ("1984?/2004%", ("1984-01-01", "2004-12-31", "1983-01-01", "2006-12-31")), + ("1984~/2004~", ("1984-01-01", "2004-12-31", "1983-01-01", "2005-12-31")), # interval whose beginning is uncertain but thought to be 1984, and whose end is uncertain and approximate but thought to be 2004 - ('1984-06?/2004-08?', '1984-06-01', '2004-08-31', '1984-05-01', '2004-09-30'), - ('1984-06-02?/2004-08-08~', '1984-06-02', '2004-08-08', '1984-06-01', '2004-08-09'), - ('1984-06-02?/unknown', '1984-06-02', '1994-06-02', '1984-06-01', '1994-06-02'), + ("1984-06?/2004-08?", ("1984-06-01", "2004-08-31", "1983-05-01", "2005-09-30")), + ( + "1984-06-02?/2004-08-08~", + ("1984-06-02", "2004-08-08", "1983-05-01", "2005-09-09"), + ), + ("1984-06-02?/", ("1984-06-02", "1994-06-02", "1983-05-01", "1994-06-02")), # Year exceeding 4 digits - # the year 170000002 - ('y170000002', '170000002-01-01', '170000002-12-31'), - # the year -170000002 - ('y-170000002', '-170000002-01-01', '-170000002-12-31'), + ("Y170000002", ("170000002-01-01", "170000002-12-31")), + 
("Y-170000002", ("-170000002-01-01", "-170000002-12-31")), # Seasons - # Spring, 2001 - ('2001-21', '2001-03-01', '2001-05-31'), - # Summer, 2003 - ('2003-22', '2003-06-01', '2003-08-31'), - # Autumn, 2000 - ('2000-23', '2000-09-01', '2000-11-30'), - # Winter, 2010 - ('2010-24', '2010-12-01', '2010-12-31'), - + ("2001-21", ("2001-03-01", "2001-05-31")), + ("2003-22", ("2003-06-01", "2003-08-31")), + ("2000-23", ("2000-09-01", "2000-11-30")), + ("2010-24", ("2010-12-01", "2010-12-31")), # ******************************* LEVEL 2 ********************************* - - # Partial Uncertain/ Approximate + # Qualification + # Group qualification: a qualification character to the immediate right of a component applies + # to that component as well as to all components to the left. + # year, month, and day are uncertain and approximate + # this example appears under "group qualification" but actually parses as L1 UncertainOrApproximate + ( + "2004-06-11%", + ("2004-06-11", "2002-04-09", "2006-08-13"), + ), # all parts to the left are fuzzy by 200% # uncertain year; month, day known - ('2004?-06-11', '2004-06-11', '2003-06-11', '2005-06-11'), + ("2004?-06-11", ("2004-06-11", "2003-06-11", "2005-06-11")), # year and month are approximate; day known - ('2004-06~-11', '2004-06-11', '2003-05-11', '2005-07-11'), - # uncertain month, year and day known - ('2004-(06)?-11', '2004-06-11', '2004-05-11', '2004-07-11'), + ("2004-06~-11", ("2004-06-11", "2003-05-11", "2005-07-11")), + # Qualification of individual component: a qualification character to the immediate left + # of the component applies to that component only # day is approximate; year, month known - ('2004-06-(11)~', '2004-06-11', '2004-06-10', '2004-06-12'), + ("2004-06-~11", ("2004-06-11", "2004-06-10", "2004-06-12")), # Year known, month within year is approximate and uncertain - ('2004-(06)?~', '2004-06-01', '2004-06-30', '2004-04-01', '2004-08-30'), + ("2004-%06", ("2004-06-01", "2004-06-30", "2004-04-01", 
"2004-08-30")), # Year known, month and day uncertain - ('2004-(06-11)?', '2004-06-11', '2004-05-10', '2004-07-12'), + ("2004-?06-?11", ("2004-06-11", "2004-05-10", "2004-07-12")), # Year uncertain, month known, day approximate - ('2004?-06-(11)~', '2004-06-11', '2003-06-10', '2005-06-12'), + ("2004?-06-~11", ("2004-06-11", "2003-06-10", "2005-06-12")), # Year uncertain and month is both uncertain and approximate - ('(2004-(06)~)?', '2004-06-01', '2004-06-30', '2003-04-01', '2005-08-30'), - # This has the same meaning as the previous example. - ('2004?-(06)?~', '2004-06-01', '2004-06-30', '2003-04-01', '2005-08-30'), - # Year uncertain, month and day approximate. - (('(2004)?-06-04~', '2004?-06-04~'), '2004-06-04', '2003-05-03', '2005-07-05'), - # Year known, month and day approximate. Note that this has the same meaning as the following. - (('(2011)-06-04~', '2011-(06-04)~'), '2011-06-04', '2011-05-03', '2011-07-05'), - # Year known, month and day approximate. - ('2011-(06-04)~', '2011-06-04', '2011-05-03', '2011-07-05'), - # Approximate season (around Autumn 2011) - ('2011-23~', '2011-09-01', '2011-11-30', '2011-06-09', '2012-02-22'), - # Years wrapping - ('2011-24~', '2011-12-01', '2011-12-31', '2011-09-08', '2012-03-24'), + ("?2004-%06", ("2004-06-01", "2004-06-30", "2003-04-01", "2005-08-30")), + # This has the same meaning as the previous example.- NEW SPEC + ("2004?-%06", ("2004-06-01", "2004-06-30", "2003-04-01", "2005-08-30")), + # Year uncertain, month and day approximate + ("2004?-~06-~04", ("2004-06-04", "2003-05-03", "2005-07-05")), + # Year known, month and day approximate + ("2011-~06-~04", ("2011-06-04", "2011-05-03", "2011-07-05")), # Partial unspecified # December 25 sometime during the 1560s - ('156u-12-25', '1560-12-25', '1569-12-25'), + ("156X-12-25", ("1560-12-25", "1569-12-25")), # December 25 sometime during the 1500s - ('15uu-12-25', '1500-12-25', '1599-12-25'), + ("15XX-12-25", ("1500-12-25", "1599-12-25")), # Year and day of month 
specified, month unspecified - ('1560-uu-25', '1560-01-25', '1560-12-25'), - ('15uu-12-uu', '1500-12-01', '1599-12-31'), + ("1560-XX-25", ("1560-01-25", "1560-12-25")), + ("15XX-12-XX", ("1500-12-01", "1599-12-31")), # Day specified, year and month unspecified - ('uuuu-uu-23', '0000-01-23', '9999-12-23'), + ("XXXX-XX-23", ("0000-01-23", "9999-12-23")), # One of a Set # One of the years 1667, 1668, 1670, 1671, 1672 - (('[1667,1668, 1670..1672]', '[1667, 1668, 1670..1672]'), '1667-01-01', '1672-12-31'), + ("[1667, 1668, 1670..1672]", ("1667-01-01", "1672-12-31")), # December 3, 1760 or some earlier date - ('[..1760-12-03]', '1750-12-03', '1760-12-03'), + ("[..1760-12-03]", ("-inf", "1760-12-03")), # December 1760 or some later month - ('[1760-12..]', '1760-12-01', '1770-12-01'), + ("[1760-12..]", ("1760-12-01", "inf")), # January or February of 1760 or December 1760 or some later month - ('[1760-01, 1760-02, 1760-12..]', '1760-01-01', '1770-12-01'), + ("[1760-01, 1760-02, 1760-12..]", ("1760-01-01", "inf")), # Either the year 1667 or the month December of 1760. - ('[1667, 1760-12]', '1667-01-01', '1760-12-31'), + ("[1667, 1760-12]", ("1667-01-01", "1760-12-31")), # Multiple Dates # All of the years 1667, 1668, 1670, 1671, 1672 - (('{1667,1668, 1670..1672}', '{1667, 1668, 1670..1672}'), '1667-01-01', '1672-12-31'), + ("{1667,1668, 1670..1672}", ("1667-01-01", "1672-12-31")), # The year 1960 and the month December of 1961. - ('{1960, 1961-12}', '1960-01-01', '1961-12-31'), - # Masked Precision + ("{1960, 1961-12}", ("1960-01-01", "1961-12-31")), + # Previously tested masked precision, now eliminated from the spec # A date during the 1960s - ('196x', '1960-01-01', '1969-12-31'), + ("196X", ("1960-01-01", "1969-12-31")), # A date during the 1900s - ('19xx', '1900-01-01', '1999-12-31'), + ("19XX", ("1900-01-01", "1999-12-31")), # L2 Extended Interval - # An interval in June 2004 beginning approximately the first and ending approximately the 20th. 
- ('2004-06-(01)~/2004-06-(20)~', '2004-06-01', '2004-06-20', '2004-05-31', '2004-06-21'), + # Interval with fuzzy day endpoints in June 2004 + ( + "2004-06-~01/2004-06-~20", + ("2004-06-01", "2004-06-20", "2004-05-31", "2004-06-21"), + ), # The interval began on an unspecified day in June 2004. - ('2004-06-uu/2004-07-03', '2004-06-01', '2004-07-03'), + ("2004-06-XX/2004-07-03", ("2004-06-01", "2004-07-03")), # Year Requiring More than Four Digits - Exponential Form # the year 170000000 - ('y17e7', '170000000-01-01', '170000000-12-31'), + ("Y17E7", ("170000000-01-01", "170000000-12-31")), # the year -170000000 - ('y-17e7', '-170000000-01-01', '-170000000-12-31'), - # Some year between 171010000 and 171999999, estimated to be 171010000 ('p3' indicates a precision of 3 significant digits.) - # TODO Not yet implemented, see https://github.com/ixc/python-edtf/issues/12 - # ('y17101e4p3', '171010000-01-01', '171999999-12-31'), + ("Y-17E7", ("-170000000-01-01", "-170000000-12-31")), + # L2 significant digits + # Some year between 1900 and 1999, estimated to be 1950 + ("1950S2", ("1950-01-01", "1950-12-31", "1900-01-01", "1999-12-31")), + ("1953S2", ("1953-01-01", "1953-12-31", "1900-01-01", "1999-12-31")), + ("1953S3", ("1953-01-01", "1953-12-31", "1950-01-01", "1959-12-31")), + # Some year between 171010000 and 171999999, estimated to be 171010000 ('S3' indicates a precision of 3 significant digits.) 
+ ( + "Y17101E4S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), + # Some year between 338000 and 338999, estimated to be 338800 + ("Y3388E2S3", ("338800-01-01", "338800-12-31", "338000-01-01", "338999-12-31")), + # some year between 171000000 and 171999999 estimated to be 171010000 + ( + "Y171010000S3", + ("171010000-01-01", "171010000-12-31", "171000000-01-01", "171999999-12-31"), + ), + # L2 Seasons + # Spring southern hemisphere, 2001 + ("2001-29", ("2001-09-01", "2001-11-30")), + # second quarter of 2001 + ("2001-34", ("2001-04-01", "2001-06-30")), +) + +BENCHMARK_EXAMPLES = ( + "2001-02-03", + "2008-12", + "2008", + "-0999", + "2004-01-01T10:10:10+05:00", + "-2005/-1999-02", + "/2006", + "?2004-%06", + "[1667, 1760-12]", + "Y3388E2S3", + "2001-29", +) + +APPROXIMATE_UNCERTAIN_EXAMPLES = ( + # first part of tuple is the input EDTF string, second part is a tuple of booleans: + # uncertain ?, approximate ~, both uncertain and approximate % + ("2004", (False, False, False)), + ("2006-06-11", (False, False, False)), + ("-0999", (False, False, False)), + ("1984?", (True, False, False)), + ("2004-06-11?", (True, False, False)), + ("1984~", (False, True, False)), + ("1984%", (False, False, True)), + ("1984~/2004-06", (False, True, False)), + ("2004-%06", (False, False, True)), + ("2004?-~06-~04", (True, True, False)), + ("2004?-06-04", (True, False, False)), + ("2011-~06-~04", (False, True, False)), + ("2004-06-~01/2004-06-~20", (False, True, False)), + ("156X~", (False, True, False)), ) BAD_EXAMPLES = ( + # parentheses are not used for group qualification in the 2018 spec None, - '', - 'not a edtf string', - 'y17e7-12-26', # not implemented - '2016-13-08', # wrong day order - '2016-02-39', # out of range - '-0000-01-01', # negative zero year + "", + "not a edtf string", + "Y17E7-12-26", # Y indicates that the date is year only + "2016-13-08", # wrong day order + "2016-02-39", # out of range + "-0000-01-01", # negative zero 
year + "2004-(06)?-11", # uncertain month, year and day known - OLD SPEC + "2004-06-(11)~", # day is approximate; year, month known - OLD SPEC + "2004-(06)%", # Year known, month within year is approximate and uncertain - OLD SPEC + "2004-(06-11)?", # Year known, month and day uncertain - OLD SPEC + "2004?-06-(11)~", # Year uncertain, month known, day approximate - OLD SPEC + "(2004-(06)~)?", # Year uncertain and month is both uncertain and approximate - OLD SPEC + "(2004)?-06-04~", # Year uncertain, month and day approximate.- OLD SPEC + "(2011)-06-04~", # Year known, month and day approximate. Note that this has the same meaning as the following.- OLD SPEC + "2011-(06-04)~", # Year known, month and day approximate.- OLD SPEC + "2004-06-(01)~/2004-06-(20)~", # An interval in June 2004 beginning approximately the first and ending approximately the 20th - OLD SPEC ) -class TestParsing(unittest.TestCase): - def test_non_parsing(self): - for i in BAD_EXAMPLES: - self.assertRaises(EDTFParseException, parse, i) +def iso_to_struct_time(iso_date): + """Convert YYYY-mm-dd date strings or infinities to time structs or float infinities.""" + if iso_date == "inf": + return float("inf") + elif iso_date == "-inf": + return float("-inf") + + if iso_date[0] == "-": + is_negative = True + iso_date = iso_date[1:] + else: + is_negative = False + y, mo, d = (int(i) for i in iso_date.split("-")) + if is_negative: + y *= -1 + return struct_time([y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + + +@pytest.mark.parametrize("test_input,expected_tuple", EXAMPLES) +def test_edtf_examples(test_input, expected_tuple): + """Test parsing of EDTF strings with expected outputs.""" + result = parse(test_input) + assert isinstance(result, EDTFObject), "Result should be an instance of EDTFObject" + + # Extract only the date part if the result includes a time. 
+ result_date = str(result) + if "T" in result_date: + result_date = result_date.split("T")[0] - def test_date_values(self): - """ - Test that every EDTFObject can tell you its lower and upper - fuzzy and strict dates, and that they're what we think they should be. - """ + # Unpack expected results based on their count + if len(expected_tuple) == 1: + assert ( + result_date == expected_tuple[0] + ), f"Expected {expected_tuple[0]}, got {result_date}" + elif len(expected_tuple) == 2: + lower_strict = iso_to_struct_time(expected_tuple[0]) + upper_strict = iso_to_struct_time(expected_tuple[1]) + assert ( + result.lower_strict() == lower_strict + ), f"Lower strict date does not match. Expected {lower_strict}, got {result.lower_strict()}" + assert ( + result.upper_strict() == upper_strict + ), f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + elif len(expected_tuple) == 3: + strict_date = iso_to_struct_time(expected_tuple[0]) + lower_fuzzy = iso_to_struct_time(expected_tuple[1]) + upper_fuzzy = iso_to_struct_time(expected_tuple[2]) + assert ( + result.lower_strict() == strict_date + ), f"Lower strict date does not match. Expected {strict_date}, got {result.lower_strict()}" + assert ( + result.upper_strict() == strict_date + ), f"Upper strict date does not match. Expected {strict_date}, got {result.upper_strict()}" + assert ( + result.lower_fuzzy() == lower_fuzzy + ), f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + assert ( + result.upper_fuzzy() == upper_fuzzy + ), f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" + elif len(expected_tuple) == 4: + lower_strict = iso_to_struct_time(expected_tuple[0]) + upper_strict = iso_to_struct_time(expected_tuple[1]) + lower_fuzzy = iso_to_struct_time(expected_tuple[2]) + upper_fuzzy = iso_to_struct_time(expected_tuple[3]) + assert ( + result.lower_strict() == lower_strict + ), f"Lower strict date does not match. 
Expected {lower_strict}, got {result.lower_strict()}" + assert ( + result.upper_strict() == upper_strict + ), f"Upper strict date does not match. Expected {upper_strict}, got {result.upper_strict()}" + assert ( + result.lower_fuzzy() == lower_fuzzy + ), f"Lower fuzzy date does not match. Expected {lower_fuzzy}, got {result.lower_fuzzy()}" + assert ( + result.upper_fuzzy() == upper_fuzzy + ), f"Upper fuzzy date does not match. Expected {upper_fuzzy}, got {result.upper_fuzzy()}" - for e in EXAMPLES: - i = e[0] - if isinstance(i, tuple): - i, o = i - else: - o = i - sys.stdout.write("parsing '%s'" % i) - f = parse(i) - sys.stdout.write(" => %s()\n" % type(f).__name__) - self.assertIsInstance(f, EDTFObject) - self.assertEqual(str(f), o) +@pytest.mark.parametrize("bad_input", BAD_EXAMPLES) +def test_non_parsing(bad_input): + """Test that non-parsing inputs correctly raise an exception.""" + with pytest.raises(EDTFParseException): + parse(bad_input) - if len(e) == 5: - expected_lower_strict = e[1] - expected_upper_strict = e[2] - expected_lower_fuzzy = e[3] - expected_upper_fuzzy = e[4] - elif len(e) == 4: - expected_lower_strict = e[1] - expected_upper_strict = e[1] - expected_lower_fuzzy = e[2] - expected_upper_fuzzy = e[3] - elif len(e) == 3: - expected_lower_strict = e[1] - expected_upper_strict = e[2] - expected_lower_fuzzy = e[1] - expected_upper_fuzzy = e[2] - elif len(e) == 2: - expected_lower_strict = e[1] - expected_upper_strict = e[1] - expected_lower_fuzzy = e[1] - expected_upper_fuzzy = e[1] - if len(e) == 1: - continue - def iso_to_struct_time(iso_date): - """ Convert YYYY-mm-dd date strings to time structs """ - if iso_date[0] == '-': - is_negative = True - iso_date = iso_date[1:] - else: - is_negative = False - y, mo, d = [int(i) for i in iso_date.split('-')] - if is_negative: - y *= -1 - return struct_time( - [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) +@pytest.mark.parametrize("bad_input", [None, ""]) +def test_empty_input(bad_input): + """Test 
that empty input raises a specific exception.""" + with pytest.raises(EDTFParseException) as exc_info: + parse(bad_input) + assert "You must supply some input text" in str(exc_info.value) - # Convert string date representations into `struct_time`s - expected_lower_strict = iso_to_struct_time(expected_lower_strict) - expected_upper_strict = iso_to_struct_time(expected_upper_strict) - expected_lower_fuzzy = iso_to_struct_time(expected_lower_fuzzy) - expected_upper_fuzzy = iso_to_struct_time(expected_upper_fuzzy) - try: - self.assertEqual(f.lower_strict(), expected_lower_strict) - self.assertEqual(f.upper_strict(), expected_upper_strict) - self.assertEqual(f.lower_fuzzy(), expected_lower_fuzzy) - self.assertEqual(f.upper_fuzzy(), expected_upper_fuzzy) - except Exception as x: - # Write to stdout for manual debugging, I guess - sys.stdout.write(str(x)) - # Re-raise exception so unit tests work for non-manual usage - raise +def test_comparisons(): + """Test comparisons between parsed EDTF objects and standard dates.""" + d1 = parse("1979-08~") + d2 = parse("1979-08~") + d3 = parse("1979-09-16") + d4 = parse("1979-08-16") + d5 = date(1979, 8, 16) + d6 = date(1970, 9, 16) - def test_comparisons(self): - d1 = parse("1979-08~") - d2 = parse("1979-08~") - d3 = parse("1979-09-16") - d4 = parse("1979-08-16") - d5 = date(1979, 8, 16) - d6 = date(1970, 9, 16) + assert d1 == d2 + assert d1 != d3 + assert d1 >= d2 + assert d3 > d1 + assert d1 < d4 + assert d4 == d5 + assert d1 < d5 + assert d1 > d6 - self.assertEqual(d1, d2) - self.assertNotEqual(d1, d3) - self.assertTrue(d1 >= d2) - self.assertTrue(d2 >= d1) - self.assertTrue(d3 > d1) - self.assertTrue(d1 < d4) - # with python dates (EDTFFormat must be first operand) - self.assertEqual(d4, d5) - self.assertTrue(d1 < d5) - self.assertTrue(d1 > d6) +@pytest.mark.benchmark +@pytest.mark.parametrize("test_input", BENCHMARK_EXAMPLES) +def test_benchmark_parser(benchmark, test_input): + """Benchmark parsing of selected EDTF strings.""" 
+ benchmark(parse, test_input) -if __name__ == '__main__': - unittest.main() +@pytest.mark.parametrize("test_input,expected_tuple", APPROXIMATE_UNCERTAIN_EXAMPLES) +def test_approximate_uncertain(test_input, expected_tuple): + """Test parsing of EDTF strings and check .is_uncertain, .is_approximate, + and .is_uncertain_and_approximate properties. The expected_tuple should have three + values, the first should be a boolean indicating if the date is uncertain, + the second should be a boolean indicating if the date is approximate, and the + third should be a boolean indicating if the date is both uncertain and approximate.""" + result = parse(test_input) + assert isinstance(result, EDTFObject), "Result should be an instance of EDTFObject" + assert result.is_uncertain == expected_tuple[0] + assert result.is_approximate == expected_tuple[1] + assert result.is_uncertain_and_approximate == expected_tuple[2] diff --git a/edtf/tests.py b/edtf/tests.py index 0e49e67..9812b65 100644 --- a/edtf/tests.py +++ b/edtf/tests.py @@ -1,134 +1,109 @@ -import unittest +# ruff: noqa: S101 # Asserts are ok in tests +from datetime import date, datetime from time import struct_time -from datetime import datetime, date from edtf import convert -class TestConversions(unittest.TestCase): - - def test_dt_to_struct_time_for_datetime(self): - now = datetime.now() - st = convert.dt_to_struct_time(now) - # Check equal year, month, day, hours, minutes, seconds - self.assertEqual(st[:6], now.timetuple()[:6]) - # Confirm 'extra' fields are set to defaults - self.assertEqual(st[6:], (0, 0, -1)) - - def test_dt_to_struct_time_for_date(self): - today = date.today() - st = convert.dt_to_struct_time(today) - # Check equal year, month, day - self.assertEqual(st[:3], today.timetuple()[:3]) - # Confirm time fields are zeroed - self.assertEqual(st[3:6], (0, 0, 0)) - # Confirm 'extra' fields are set to defaults - self.assertEqual(st[6:], (0, 0, -1)) - - def test_struct_time_to_date(self): - st = struct_time( 
- [2018, 4, 19] + convert.TIME_EMPTY_TIME + convert.TIME_EMPTY_EXTRAS) - d = date(*st[:3]) - self.assertEqual(d, convert.struct_time_to_date(st)) - - def test_struct_time_to_datetime(self): - st = struct_time( - [2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) - dt = datetime(*st[:6]) - converted_dt = convert.struct_time_to_datetime(st) - self.assertEqual(dt, converted_dt) - # Note that 'extra' fields are auto-populated by `datetime` module - self.assertEqual(converted_dt.timetuple()[6:], (3, 109, -1)) - - def test_trim_struct_time(self): - now = datetime.now() - st = now.timetuple() - trimmed_st = convert.trim_struct_time(st) - # Confirm trimmed `struct_time` has expected date/time values - self.assertEqual( - trimmed_st[:6], - (now.year, now.month, now.day, now.hour, now.minute, now.second) - ) - # Confirm 'extra' fields are set to defaults - self.assertEqual(trimmed_st[6:], (0, 0, -1)) - # Confirm 'extra' fields in untrimmed `struct_time` has real values - self.assertNotEqual(st[6:], (0, 0, -1)) - - def test_struct_time_to_jd(self): - # Check conversion of AD date & time to Julian Date number - st_ad = struct_time( - [2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) - jd_ad = 2458227.9263194446 - self.assertEqual(jd_ad, convert.struct_time_to_jd(st_ad)) - # Check conversion of BC date & time to Julian Date number - st_bc = struct_time( - [-2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) - jd_bc = 984091.9263194444 - self.assertEqual(jd_bc, convert.struct_time_to_jd(st_bc)) - - def test_jd_to_struct_time(self): - # Check conversion of Julian Date number to AD date & time - jd_ad = 2458227.9263194446 # As in `test_struct_time_to_jd` - st_ad = struct_time( - [2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) - self.assertEqual(st_ad, convert.jd_to_struct_time(jd_ad)) - # Check conversion of Julian Date number to BC date & time - # WARNING: Converted time is off by 1 second, 53 not 54 - jd_bc = 984091.9263194444 # As in 
`test_struct_time_to_jd` - st_bc = struct_time( - [-2018, 4, 19] + [10, 13, 54 - 1] + convert.TIME_EMPTY_EXTRAS) - self.assertEqual(st_bc, convert.jd_to_struct_time(jd_bc)) - - def test_jd_round_trip_for_extreme_future(self): - original_st = struct_time( - [999999, 8, 4] + [21, 15, 3] + convert.TIME_EMPTY_EXTRAS) - jd = convert.struct_time_to_jd(original_st) - converted_st = convert.jd_to_struct_time(jd) - # Confirm that year, month, day, hour, minute are correct (not second) - self.assertEqual(original_st[:5], converted_st[:5]) - # WARNING: Seconds are off by 1, should be 3 but is 2 - self.assertEqual(3 - 1, converted_st[5]) - - def test_jd_round_trip_for_extreme_past(self): - original_st = struct_time( - [-999999, 8, 4] + [21, 15, 3] + convert.TIME_EMPTY_EXTRAS) - converted_st = convert.jd_to_struct_time( - convert.struct_time_to_jd(original_st)) - # WARNING: We have lost a year of accuracy - self.assertEqual( - (-999999 + 1, # Year off by 1 - 8, 4, 21, 15, 3, 0, 0, -1), - tuple(converted_st)) - - def test_jd_round_trip_for_zero_year_aka_1_bc(self): - original_st = struct_time( - [0, 9, 5] + [4, 58, 59] + convert.TIME_EMPTY_EXTRAS) - converted_st = convert.jd_to_struct_time( - convert.struct_time_to_jd(original_st)) - self.assertEqual( - (0, 9, 5, 4, 58, 59, 0, 0, -1), - tuple(converted_st)) - - def test_jd_round_trip_for_2_bc(self): - original_st = struct_time( - [-1, 12, 5] + [4, 58, 59] + convert.TIME_EMPTY_EXTRAS) - converted_st = convert.jd_to_struct_time( - convert.struct_time_to_jd(original_st)) - self.assertEqual( - (-1, 12, 5, 4, 58, 59, 0, 0, -1), - tuple(converted_st)) - - def test_roll_negative_time_fields(self): - # Confirm time value is adjusted as expected - year = -100 - month = -17 # More than 1 year - day = -34 # More than 1 month - hour = -25 # More than 1 day - minute = -74 # More than 1 hour - second = -253 # More than 1 minute - self.assertEqual( - (-102, 5, 24, 21, 41, 47), - convert._roll_negative_time_fields( - year, month, day, hour, 
minute, second) - ) +def test_dt_to_struct_time_for_datetime(): + now = datetime.now() + st = convert.dt_to_struct_time(now) + assert st[:6] == now.timetuple()[:6] + assert st[6:] == (0, 0, -1) + + +def test_dt_to_struct_time_for_date(): + today = date.today() + st = convert.dt_to_struct_time(today) + assert st[:3] == today.timetuple()[:3] + assert st[3:6] == (0, 0, 0) + assert st[6:] == (0, 0, -1) + + +def test_struct_time_to_date(): + st = struct_time( + [2018, 4, 19] + convert.TIME_EMPTY_TIME + convert.TIME_EMPTY_EXTRAS + ) + d = date(*st[:3]) + assert d == convert.struct_time_to_date(st) + + +def test_struct_time_to_datetime(): + st = struct_time([2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) + dt = datetime(*st[:6]) + converted_dt = convert.struct_time_to_datetime(st) + assert dt == converted_dt + assert converted_dt.timetuple()[6:] == (3, 109, -1) + + +def test_trim_struct_time(): + now = datetime.now() + st = now.timetuple() + trimmed_st = convert.trim_struct_time(st) + assert trimmed_st[:6] == ( + now.year, + now.month, + now.day, + now.hour, + now.minute, + now.second, + ) + assert trimmed_st[6:] == (0, 0, -1) + assert st[6:] != (0, 0, -1) + + +def test_struct_time_to_jd(): + st_ad = struct_time([2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) + jd_ad = 2458227.9263194446 + assert jd_ad == convert.struct_time_to_jd(st_ad) + st_bc = struct_time([-2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) + jd_bc = 984091.9263194444 + assert jd_bc == convert.struct_time_to_jd(st_bc) + + +def test_jd_to_struct_time(): + jd_ad = 2458227.9263194446 + st_ad = struct_time([2018, 4, 19] + [10, 13, 54] + convert.TIME_EMPTY_EXTRAS) + assert st_ad == convert.jd_to_struct_time(jd_ad) + jd_bc = 984091.9263194444 + st_bc = struct_time([-2018, 4, 19] + [10, 13, 54 - 1] + convert.TIME_EMPTY_EXTRAS) + assert st_bc == convert.jd_to_struct_time(jd_bc) + + +def test_jd_round_trip_for_extreme_future(): + original_st = struct_time([999999, 8, 4] + [21, 15, 3] 
+ convert.TIME_EMPTY_EXTRAS) + jd = convert.struct_time_to_jd(original_st) + converted_st = convert.jd_to_struct_time(jd) + assert original_st[:5] == converted_st[:5] + assert converted_st[5] == 3 - 1 + + +def test_jd_round_trip_for_extreme_past(): + original_st = struct_time([-999999, 8, 4] + [21, 15, 3] + convert.TIME_EMPTY_EXTRAS) + converted_st = convert.jd_to_struct_time(convert.struct_time_to_jd(original_st)) + assert tuple(converted_st) == (-999999 + 1, 8, 4, 21, 15, 3, 0, 0, -1) + + +def test_jd_round_trip_for_zero_year_aka_1_bc(): + original_st = struct_time([0, 9, 5] + [4, 58, 59] + convert.TIME_EMPTY_EXTRAS) + converted_st = convert.jd_to_struct_time(convert.struct_time_to_jd(original_st)) + assert tuple(converted_st) == (0, 9, 5, 4, 58, 59, 0, 0, -1) + + +def test_jd_round_trip_for_2_bc(): + original_st = struct_time([-1, 12, 5] + [4, 58, 59] + convert.TIME_EMPTY_EXTRAS) + converted_st = convert.jd_to_struct_time(convert.struct_time_to_jd(original_st)) + assert tuple(converted_st) == (-1, 12, 5, 4, 58, 59, 0, 0, -1) + + +def test_roll_negative_time_fields(): + year = -100 + month = -17 + day = -34 + hour = -25 + minute = -74 + second = -253 + assert convert._roll_negative_time_fields( + year, month, day, hour, minute, second + ) == (-102, 5, 24, 21, 41, 47) diff --git a/edtf_django_tests/edtf_django_tests/__init__.py b/edtf_django_tests/edtf_django_tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/edtf_django_tests/edtf_django_tests/asgi.py b/edtf_django_tests/edtf_django_tests/asgi.py new file mode 100644 index 0000000..b62c5f5 --- /dev/null +++ b/edtf_django_tests/edtf_django_tests/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for edtf_django_tests project. + +It exposes the ASGI callable as a module-level variable named ``application``. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "edtf_django_tests.settings") + +application = get_asgi_application() diff --git a/edtf_django_tests/edtf_django_tests/settings.py b/edtf_django_tests/edtf_django_tests/settings.py new file mode 100644 index 0000000..bad4f60 --- /dev/null +++ b/edtf_django_tests/edtf_django_tests/settings.py @@ -0,0 +1,124 @@ +""" +Django settings for edtf_django_tests project. + +Generated by 'django-admin startproject' using Django 4.2.7. + +For more information on this file, see +https://docs.djangoproject.com/en/4.2/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/4.2/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = "django-insecure-zkd&%e=di9d(p@wq7vnstn+4dx7cxbxkve�*+57sks0q$=0a" # noqa: S105 (only for testing) + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + "django.contrib.admin", + "django.contrib.auth", + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "edtf_integration", +] + +MIDDLEWARE = [ + "django.middleware.security.SecurityMiddleware", + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", +] + +ROOT_URLCONF = "edtf_django_tests.urls" + +TEMPLATES = [ + { + "BACKEND": "django.template.backends.django.DjangoTemplates", + "DIRS": [], + "APP_DIRS": True, + "OPTIONS": { + "context_processors": [ + "django.template.context_processors.debug", + "django.template.context_processors.request", + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + ], + }, + }, +] + +WSGI_APPLICATION = "edtf_django_tests.wsgi.application" + + +# Database +# https://docs.djangoproject.com/en/4.2/ref/settings/#databases + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.sqlite3", + "NAME": BASE_DIR / "db.sqlite3", + } +} + + +# Password validation +# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", + }, + { + "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/4.2/topics/i18n/ + +LANGUAGE_CODE = "en-us" + +TIME_ZONE = "UTC" + 
+USE_I18N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/4.2/howto/static-files/ + +STATIC_URL = "static/" + +# Default primary key field type +# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" diff --git a/edtf_django_tests/edtf_django_tests/urls.py b/edtf_django_tests/edtf_django_tests/urls.py new file mode 100644 index 0000000..0b30a1b --- /dev/null +++ b/edtf_django_tests/edtf_django_tests/urls.py @@ -0,0 +1,23 @@ +""" +URL configuration for edtf_django_tests project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/4.2/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" + +from django.contrib import admin +from django.urls import path + +urlpatterns = [ + path("admin/", admin.site.urls), +] diff --git a/edtf_django_tests/edtf_django_tests/wsgi.py b/edtf_django_tests/edtf_django_tests/wsgi.py new file mode 100644 index 0000000..20450c1 --- /dev/null +++ b/edtf_django_tests/edtf_django_tests/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for edtf_django_tests project. + +It exposes the WSGI callable as a module-level variable named ``application``. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault("DJANGO_SETTINGS_MODULE", "edtf_django_tests.settings") + +application = get_wsgi_application() diff --git a/edtf_django_tests/edtf_integration/__init__.py b/edtf_django_tests/edtf_integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/edtf_django_tests/edtf_integration/admin.py b/edtf_django_tests/edtf_integration/admin.py new file mode 100644 index 0000000..3051891 --- /dev/null +++ b/edtf_django_tests/edtf_integration/admin.py @@ -0,0 +1,43 @@ +from django.contrib import admin + +from .models import TestEvent + + +class TestEventAdmin(admin.ModelAdmin): + list_display = ( + "date_display", + "date_edtf_direct", + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + "date_edtf", + ) + search_fields = ("date_display", "date_edtf_direct") + list_filter = ("date_earliest", "date_latest") + readonly_fields = ( + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + "date_edtf", + ) + + fieldsets = ( + (None, {"fields": ("date_display", "date_edtf_direct", "date_edtf")}), + ( + "Computed Dates", + { + "classes": ("collapse",), + "fields": ( + "date_earliest", + "date_latest", + "date_sort_ascending", + "date_sort_descending", + ), + }, + ), + ) + + +admin.site.register(TestEvent, TestEventAdmin) diff --git a/edtf_django_tests/edtf_integration/apps.py b/edtf_django_tests/edtf_integration/apps.py new file mode 100644 index 0000000..23bc09d --- /dev/null +++ b/edtf_django_tests/edtf_integration/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class EdtfIntegrationConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "edtf_integration" diff --git a/edtf_django_tests/edtf_integration/migrations/0001_initial.py 
b/edtf_django_tests/edtf_integration/migrations/0001_initial.py new file mode 100644 index 0000000..0311290 --- /dev/null +++ b/edtf_django_tests/edtf_integration/migrations/0001_initial.py @@ -0,0 +1,64 @@ +# Generated by Django 4.2.13 on 2024-05-09 18:13 + +from django.db import migrations, models +import edtf.fields + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="TestEvent", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "date_display", + models.CharField( + blank=True, + help_text="Enter the date in natural language format (e.g., 'Approximately June 2004').", + max_length=255, + null=False, + verbose_name="Date of creation (display)", + ), + ), + ( + "date_edtf_direct", + models.CharField( + blank=True, + help_text="Enter the date in EDTF format (e.g., '2004-06~').", + max_length=255, + null=False, + verbose_name="Date of creation (EDTF format)", + ), + ), + ( + "date_edtf", + edtf.fields.EDTFField( + blank=True, + lower_fuzzy_field="date_earliest", + lower_strict_field="date_sort_ascending", + natural_text_field="date_display", + null=True, + upper_fuzzy_field="date_latest", + upper_strict_field="date_sort_descending", + verbose_name="Date of creation (EDTF)", + ), + ), + ("date_earliest", models.FloatField(blank=True, null=True)), + ("date_latest", models.FloatField(blank=True, null=True)), + ("date_sort_ascending", models.FloatField(blank=True, null=True)), + ("date_sort_descending", models.FloatField(blank=True, null=True)), + ], + ), + ] diff --git a/edtf_django_tests/edtf_integration/migrations/__init__.py b/edtf_django_tests/edtf_integration/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/edtf_django_tests/edtf_integration/models.py b/edtf_django_tests/edtf_integration/models.py new file mode 100644 index 0000000..5e66592 --- /dev/null +++ 
b/edtf_django_tests/edtf_integration/models.py @@ -0,0 +1,53 @@ +from django.db import models + +from edtf.fields import EDTFField + + +class TestEvent(models.Model): + date_display = models.CharField( + "Date of creation (display)", + blank=True, + null=False, + max_length=255, + help_text="Enter the date in natural language format (e.g., 'Approximately June 2004').", + ) + + date_edtf_direct = models.CharField( + "Date of creation (EDTF format)", + max_length=255, + blank=True, + null=False, + help_text="Enter the date in EDTF format (e.g., '2004-06~').", + ) + + # EDTF field that parses the input from either natural language or direct EDTF string + # natural_text_field is the field that stores the natural language input and is used for display + # direct_input_field stores an EDTF string + # TODO is there a need for both a natural text input and a label? + # TODO could consolidate the direct_input_field and natural_text_field into a single field, but would need + # a flag to indicate whether the input is natural language or EDTF as the natural language parser sometimes + # misparses an EDTF string as a natural language string (e.g. 
`2020-03-15/2020-04-15` -> `2020-03-15`) + date_edtf = EDTFField( + "Date of creation (EDTF)", + natural_text_field="date_display", + direct_input_field="date_edtf_direct", + lower_fuzzy_field="date_earliest", + upper_fuzzy_field="date_latest", + lower_strict_field="date_sort_ascending", + upper_strict_field="date_sort_descending", + blank=True, + null=True, + ) + # Computed fields for filtering + date_earliest = models.FloatField(blank=True, null=True) + date_latest = models.FloatField(blank=True, null=True) + # Computed fields for sorting + date_sort_ascending = models.FloatField(blank=True, null=True) + date_sort_descending = models.FloatField(blank=True, null=True) + + def __str__(self) -> str: + return ( + f"Test Event: {self.date_display=}, " + f"{self.date_edtf_direct=}, " + f"{self.date_edtf=}" + ) diff --git a/edtf_django_tests/edtf_integration/tests.py b/edtf_django_tests/edtf_integration/tests.py new file mode 100644 index 0000000..aa1bf34 --- /dev/null +++ b/edtf_django_tests/edtf_integration/tests.py @@ -0,0 +1,155 @@ +from django.test import TestCase + +from edtf import EDTFObject, struct_time_to_jd +from edtf import parse_edtf as parse + +from .models import TestEvent + + +class TestEventModelTests(TestCase): + def setUp(self): + # Create instances and assign them to instance variables + # date_edtf_direct is a valid EDTF string, date_display is a date + # to be parsed from natural language + self.event1 = TestEvent.objects.create(date_edtf_direct="2020-03-15/2020-04-15") + self.event2 = TestEvent.objects.create(date_edtf_direct="2021-05-06") + self.event3 = TestEvent.objects.create(date_edtf_direct="2019-11") + self.event4 = TestEvent.objects.create(date_display="Approximately August 2018") + self.event5 = TestEvent.objects.create(date_edtf_direct="2021-05-06") + + def test_edtf_object_returned(self): + for event in TestEvent.objects.all(): + self.assertIsInstance(event.date_edtf, EDTFObject) + + def test_sorting(self): + events = 
list(TestEvent.objects.order_by("date_sort_ascending")) + self.assertEqual(events[0].date_display, "Approximately August 2018") + self.assertEqual(events[1].date_edtf_direct, "2019-11") + self.assertEqual(events[2].date_edtf_direct, "2020-03-15/2020-04-15") + self.assertEqual(events[3].date_edtf_direct, "2021-05-06") + self.assertEqual(events[4].date_edtf_direct, "2021-05-06") + + events_desc = list(TestEvent.objects.order_by("-date_sort_descending")) + self.assertEqual(events_desc[0].date_edtf_direct, "2021-05-06") + self.assertEqual(events_desc[1].date_edtf_direct, "2021-05-06") + self.assertEqual(events_desc[2].date_edtf_direct, "2020-03-15/2020-04-15") + self.assertEqual(events_desc[3].date_edtf_direct, "2019-11") + self.assertEqual(events_desc[4].date_display, "Approximately August 2018") + + def test_date_boundaries(self): + event = TestEvent.objects.get(date_edtf_direct="2020-03-15/2020-04-15") + expected_earliest_jd = struct_time_to_jd(parse("2020-03-15").lower_strict()) + expected_latest_jd = struct_time_to_jd(parse("2020-04-15").upper_strict()) + self.assertAlmostEqual(event.date_earliest, expected_earliest_jd, places=1) + self.assertAlmostEqual(event.date_latest, expected_latest_jd, places=1) + + expected_earliest_jd = struct_time_to_jd(parse("2021-05-06").lower_strict()) + expected_latest_jd = struct_time_to_jd(parse("2021-05-06").upper_strict()) + self.assertAlmostEqual( + self.event2.date_earliest, expected_earliest_jd, places=1 + ) + self.assertAlmostEqual(self.event2.date_latest, expected_latest_jd, places=1) + + event3 = TestEvent.objects.get(date_edtf_direct="2019-11") + expected_earliest_jd = struct_time_to_jd(parse("2019-11").lower_strict()) + expected_latest_jd = struct_time_to_jd(parse("2019-11").upper_strict()) + self.assertAlmostEqual(event3.date_earliest, expected_earliest_jd, places=1) + self.assertAlmostEqual(event3.date_latest, expected_latest_jd, places=1) + + event4 = TestEvent.objects.get(date_display="Approximately August 2018") + 
expected_earliest_jd = struct_time_to_jd(parse("2018-08~").lower_fuzzy()) + expected_latest_jd = struct_time_to_jd(parse("2018-08~").upper_fuzzy()) + self.assertAlmostEqual(event4.date_earliest, expected_earliest_jd, places=1) + self.assertAlmostEqual(event4.date_latest, expected_latest_jd, places=1) + + def test_date_display(self): + """ + Test that the date_display field is correctly populated based on the EDTF input. + In the future, a more sophisticated natural language parser could be used to generate + a human readable date from the EDTF input. + """ + event = TestEvent.objects.get(date_edtf_direct="2020-03-15/2020-04-15") + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + self.assertEqual(self.event2.date_display, "2021-05-06") + self.assertEqual(self.event3.date_display, "2019-11") + self.assertEqual(self.event4.date_display, "Approximately August 2018") + + def test_date_display_with_none_or_empty_string(self): + """ + Test that the date_display field is correctly populated when the + `natural_date` field is set to empty string (for example, if it + were used with `null=False` in the model definition) or set to + None (if it were used with `null=True`). 
+ """ + event = TestEvent(date_display="") + event.date_edtf_direct = "2020-03-15/2020-04-15" + # Trigger the descriptor to update the date_display field + event.date_edtf = "" + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + + event = TestEvent(date_display=None) + # Verify date_display is set to None even though the field is `null=False` + self.assertIsNone(event.date_display) + event.date_edtf_direct = "2020-03-15/2020-04-15" + event.date_edtf = "" + self.assertEqual(event.date_display, "2020-03-15/2020-04-15") + + def test_comparison(self): + # test equality of the same dates + self.assertEqual( + self.event2.date_edtf, + self.event5.date_edtf, + "Events with the same date should be equal", + ) + + # test inequality of different dates + self.assertNotEqual( + self.event1.date_edtf, + self.event2.date_edtf, + "Events with different dates should not be equal", + ) + + # greater than + self.assertGreater( + self.event2.date_edtf, + self.event3.date_edtf, + "2021-05-06 is greater than 2019-11", + ) + + # less than + self.assertLess( + self.event3.date_edtf, + self.event2.date_edtf, + "2019-11 is less than 2021-05-06", + ) + + def test_field_related_field_specification(self): + edtf_field_on_model = TestEvent._meta.get_field("date_edtf") + required_fields = ( + "direct_input_field", + "lower_fuzzy_field", + "lower_strict_field", + "natural_text_field", + "upper_fuzzy_field", + "upper_strict_field", + ) + for field_alias in required_fields: + # Remove the alias from the edtf_field + orig_value = getattr(edtf_field_on_model, field_alias) + setattr(edtf_field_on_model, field_alias, None) + errors = edtf_field_on_model.check() + self.assertEqual(len(errors), 1) + self.assertTrue(field_alias in errors[0].msg) + # Should be an 'alias not specified' error + self.assertEqual(errors[0].id, "python-edtf.EDTF01") + + # Point the alias to a non-existent field + setattr(edtf_field_on_model, field_alias, "fake") + errors = edtf_field_on_model.check() + 
self.assertEqual(len(errors), 1) + self.assertTrue(field_alias in errors[0].msg) + # Should be a 'non-eixstent field' error + self.assertEqual(errors[0].id, "python-edtf.EDTF02") + + # Repair the field so later tests can still work + setattr(edtf_field_on_model, field_alias, orig_value) diff --git a/edtf_django_tests/edtf_integration/views.py b/edtf_django_tests/edtf_integration/views.py new file mode 100644 index 0000000..60f00ef --- /dev/null +++ b/edtf_django_tests/edtf_integration/views.py @@ -0,0 +1 @@ +# Create your views here. diff --git a/edtf_django_tests/manage.py b/edtf_django_tests/manage.py new file mode 100755 index 0000000..ffd375b --- /dev/null +++ b/edtf_django_tests/manage.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" + +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "edtf_django_tests.settings") + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" 
+ ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..860741e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,147 @@ +[project] +name = "edtf" +version = "5.0.0" +dependencies = [ + "python-dateutil", + "pyparsing", +] +description = "Python implementation of Library of Congress EDTF (Extended Date Time Format) specification" +requires-python = ">=3.8" +readme = {file = "README.txt", content-type = "text/markdown"} +authors = [ + { name = "The Interaction Consortium", email = "studio@interaction.net.au"}, + { name = "Alastair Weakley"}, + { name = "Greg Turner"}, + { name = "James Murty"}, + { name = "Mark Finger" }, + { name = "Sabine Müller" }, + { name = "Cole Crawford" }, + { name = "Klaus Rettinghaus" } +] +maintainers = [ + { name = "The Interaction Consortium", email = "studio@interaction.net.au" } +] +classifiers = [ + "Intended Audience :: Developers", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +test = [ + "django>=4.2,<5.0", + "pytest", + "pytest-django", + "pytest-benchmark", + "ruff", + "pre-commit", + "coverage", + "pytest-cov", + "junitparser", +] + +[project.urls] +homepage = "https://github.com/ixc/python-edtf" +issues = "https://github.com/ixc/python-edtf/issues" +repository = "https://github.com/ixc/python-edtf.git" +changelog = "https://github.com/ixc/python-edtf/blob/main/changelog.rst" + +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages.find 
= { where = ["."], exclude = ["edtf_django_tests", "edtf_django_tests.*"] } + +[tool.wheel] +universal = false + +[tool.tox] +legacy_tox_ini = """ + [tox] + min_version = 4.0 + env_list = py{38,39,310,311,312} + isolated_build = true + skip_missing_interpreters = True + + [testenv] + deps = + pytest + django + commands = pytest +""" + +[tool.pytest.ini_options] +python_files = ["tests.py", "test_*.py", "*_test.py", "*_tests.py"] +python_classes = ["Test*", "*Tests"] +python_functions = ["test_*"] +markers = [ + "benchmark: mark a test as a benchmark", +] +addopts = "--ignore=edtf_django_tests/ --cov=edtf -m 'not benchmark'" +plugins = ["pytest_cov", "pytest_benchmark"] + +[tool.coverage.run] +# we run the edtf_integration tests but only care about them testing fields.py in the main package +omit = [ + "edtf_django_tests/*" +] + +[tool.coverage.report] +exclude_lines = [ + # Don't complain about missing debug-only code: + "if __name__ == .__main__.:", + # Don't complain if tests don't hit defensive assertion code: + "raise AssertionError", + "raise NotImplementedError", + "raise NotImplemented", + "raise NotImplemented" +] + +[tool.ruff] +# Python 3.8 +target-version = "py38" + +extend-exclude = [ + '**/migrations/*', +] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # pycodestyle warnings + "W", + # Pyflakes + "F", + # pyupgrade + ## Flake8 plugins + "UP", + # flake8-bugbear + "B", + # flake8-comprehensions + "C", + # flake8-django + "DJ", + # flake8-bandit + "S", + # flake8-simplify + "SIM", + # isort + "I", +] + +ignore = [ + # Ignore Pycodestyle line-length warnings, (mainly long comments). + "E501", + # Ignore McCabe complexity (for now). 
+ "C901", +] diff --git a/requirements.txt b/requirements.txt index 0ab3a7d..1656e27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ python-dateutil pyparsing -six diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 082465a..0000000 --- a/setup.cfg +++ /dev/null @@ -1,5 +0,0 @@ -[devpi:upload] -formats = bdist_wheel - -[wheel] -universal = 1 diff --git a/setup.py b/setup.py deleted file mode 100644 index f0f1849..0000000 --- a/setup.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import print_function - -import setuptools -import sys - -def readme(): - with open('README.md') as f: - return f.read() - -setuptools.setup( - name='edtf', - use_scm_version={'version_scheme': 'post-release'}, - url='https://github.com/ixc/python-edtf', - author='Greg Turner', - author_email='greg@interaction.net.au', - description='Python implementation of Library of Congress EDTF (Extended ' - 'Date Time Format) specification', - long_description=readme(), - long_description_content_type="text/markdown", - license='MIT', - packages=setuptools.find_packages(), - include_package_data=True, - install_requires=[ - 'python-dateutil', - 'pyparsing', - 'six' - ], - extras_require={ - 'test': [ - 'django', - 'nose', - 'tox', - ], - }, - setup_requires=[ - 'setuptools_scm', - ], - keywords=[ - 'edtf', - ], - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: End Users/Desktop', - 'License :: OSI Approved :: MIT License', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.6', - ], -) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index f70761f..0000000 --- a/tox.ini +++ /dev/null @@ -1,8 +0,0 @@ -[tox] -envlist = py27,py36 - -[testenv] -deps= - nose - django -commands=nosetests