From 6b35f745e7fc62c4860c38c9fca176717e61df38 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 27 Jun 2024 09:30:25 -0400 Subject: [PATCH] update for new dedupe data model pattern --- datetimetype/__init__.py | 21 ++++++++++----------- pyproject.toml | 3 --- tests/test_datetime_comparator.py | 26 +++++++++++++------------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/datetimetype/__init__.py b/datetimetype/__init__.py index e712a9e..514308d 100644 --- a/datetimetype/__init__.py +++ b/datetimetype/__init__.py @@ -1,13 +1,13 @@ import numpy as np from datetime_distance import DateTimeComparator +from dedupe import predicates from dedupe.variables.base import DerivedType, FieldType from dedupe.variables.string import affineGap import datetimetype.datetime_predicates as dtp -from dedupe import predicates -class DateTimeType(FieldType): +class DateTime(FieldType): type = "DateTime" _predicate_functions = [ @@ -26,7 +26,7 @@ def __len__(self): return self.expanded_size - def __init__(self, definition): + def __init__(self, field, fuzzy=True, dayfirst=False, yearfirst=False, **kwargs): """ Initialize a field for comparing datetime types, including timestamps, dates, months, and years. @@ -41,30 +41,29 @@ def __init__(self, definition): for more information about python-dateutil's parser settings. """ - super(DateTimeType, self).__init__(definition) + super().__init__(field, **kwargs) # Parser settings - self.fuzzy = definition.get("fuzzy", True) - self.dayfirst = definition.get("dayfirst", False) - self.yearfirst = definition.get("yearfirst", False) + self.fuzzy = fuzzy + self.dayfirst = dayfirst + self.yearfirst = yearfirst # Define the expected fields in the output vector self.variables = ("seconds", "days", "months", "years", "full string") - fields = self._get_fields(definition["field"]) + fields = self._get_fields(field) # Format for output vector: Not Missing + Dummies + Fields self.expanded_size = 1 + (len(self.variables) - 1) + len(self.variables) self.higher_vars = [ - DerivedType({"name": variable, "type": field_type}) - for variable, field_type in fields + DerivedType(variable, field_type) for variable, field_type in fields ] def _get_fields(self, field): """ Returns the format for the output vector. """ - fields = [("{}: Not Missing".format(field), "Dummy")] + fields = [(f"{field}: Not Missing", "Dummy")] fields += [(var, "Dummy") for var in self.variables[:-1]] diff --git a/pyproject.toml b/pyproject.toml index f1857c4..c358d3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,9 +12,6 @@ dependencies = ["dedupe>=3.0", "datetime-distance"] [project.urls] Homepage = "https://github.com/datamade/dedupe-variable-datetime" -[project.entry-points] -dedupevariables = {datetimetype = "datetimetype:DateTimeType"} - [tool.setuptools] packages = ["datetimetype"] include-package-data = false diff --git a/tests/test_datetime_comparator.py b/tests/test_datetime_comparator.py index 794d4f9..1084d67 100644 --- a/tests/test_datetime_comparator.py +++ b/tests/test_datetime_comparator.py @@ -4,11 +4,11 @@ import numpy as np import datetimetype.datetime_predicates as dtp -from datetimetype import DateTimeType +from datetimetype import DateTime def test_datetime_to_datetime_comparison(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("2017-05-25", "2017-01-01"), np.array([1, 0, 1, 0, 0, 0, math.sqrt(144), 0, 0, 0]), @@ -16,7 +16,7 @@ def test_datetime_to_datetime_comparison(): def test_datetime_to_timestamp_comparison(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("2017-05-25", "2017-01-01 12:30:05"), np.array([1, 0, 1, 0, 0, 0, math.sqrt(143), 0, 0, 0]), @@ -24,7 +24,7 @@ def test_datetime_to_timestamp_comparison(): def test_timestamp_to_timestamp_comparison(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("2017-05-25 21:08:09", "2017-01-01 12:30:05"), np.array([1, 1, 0, 0, 0, math.sqrt(12472684), 0, 0, 0, 0]), @@ -32,7 +32,7 @@ def test_timestamp_to_timestamp_comparison(): def test_years(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("2012", "2010"), np.array([1, 0, 0, 0, 1, 0, 0, 0, math.sqrt(2), 0]), @@ -40,7 +40,7 @@ def test_years(): def test_months(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("May 2012", "June 2013"), np.array([1, 0, 0, 1, 0, 0, 0, math.sqrt(13), 0, 0]), @@ -48,7 +48,7 @@ def test_months(): def test_days(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("5 May 2013", "9 June 2013"), np.array([1, 0, 1, 0, 0, 0, math.sqrt(35), 0, 0, 0]), @@ -56,7 +56,7 @@ def test_days(): def test_month_and_day(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("7/7", "July 9th"), np.array([1, 0, 1, 0, 0, 0, math.sqrt(2), 0, 0, 0]), @@ -64,7 +64,7 @@ def test_month_and_day(): def test_alternate_formats(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") comp = dt.comparator("May 5th, 2013", "2013-06-09") np.testing.assert_almost_equal( comp, np.array([1, 0, 1, 0, 0, 0, math.sqrt(35), 0, 0, 0]) @@ -82,7 +82,7 @@ def test_alternate_formats(): def test_bad_parse(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal( dt.comparator("foo", "bar"), np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 5.5]) ) @@ -91,7 +91,7 @@ def test_bad_parse(): def test_fuzzy_parse(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") time1 = "June 6th 2013" time2 = "It happened on June 7th, 2013" np.testing.assert_almost_equal( @@ -100,13 +100,13 @@ def test_fuzzy_parse(): def test_missing(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") np.testing.assert_almost_equal(dt.comparator("", "non-empty"), np.zeros(len(dt))) np.testing.assert_almost_equal(dt.comparator(None, "non-empty"), np.zeros(len(dt))) def test_datetime_object(): - dt = DateTimeType({"field": "foo"}) + dt = DateTime("foo") a = datetime.datetime(2016, 5, 6, 0, 0) b = datetime.datetime(2016, 5, 7, 0, 0) np.testing.assert_almost_equal(