From e18df4c50c712510f41fa54baf8e383db1f45661 Mon Sep 17 00:00:00 2001 From: sstock <92024317+SStock1@users.noreply.github.com> Date: Wed, 8 May 2024 15:02:56 +0100 Subject: [PATCH] 28 enforce consistency on missing value handling in featurespy (#51) * leaving blank list for empty date of birth features * fixing empty string input for gen_misc_features * restoring some examples * restoring some examples --------- Co-authored-by: Samuel Stock Co-authored-by: Samuel Stock --- docs/tutorials/run-through.qmd | 11 +++++++---- src/pprl/embedder/features.py | 3 ++- test/embedder/test_features.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/tutorials/run-through.qmd b/docs/tutorials/run-through.qmd index 1835cd5..da5bb46 100644 --- a/docs/tutorials/run-through.qmd +++ b/docs/tutorials/run-through.qmd @@ -16,7 +16,7 @@ First, we'll import a few modules, including: ```{python} import os - +import numpy as np import pandas as pd from pprl import EmbeddedDataFrame, Embedder, config @@ -34,8 +34,9 @@ df1 = pd.DataFrame( id=[1,2,3], forename=["Henry", "Sally", "Ina"], surname = ["Tull", "Brown", "Lawrey"], - dob=["1/1/2001", "2/1/2001", "4/10/1995"], + dob=["", "2/1/2001", "4/10/1995"], gender=["male", "Male", "Female"], + county=["", np.NaN, "County Durham"] ) ) @@ -45,6 +46,7 @@ df2 = pd.DataFrame( full_name=["Harry Tull", "Sali Brown", "Ina Laurie"], date_of_birth=["2/1/2001", "2/1/2001", "4/11/1995"], sex=["M", "M", "F"], + county=["Rutland", "Powys", "Durham"] ) ) ``` @@ -64,6 +66,7 @@ feature_factory = dict( name=feat.gen_name_features, dob=feat.gen_dateofbirth_features, sex=feat.gen_sex_features, + misc=feat.gen_misc_features ) ff_args = dict(name={}, sex={}, dob={}) @@ -91,10 +94,10 @@ contribute to the embedding. 
```{python} edf1 = embedder.embed( - df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex") + df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex", county="misc") ) edf2 = embedder.embed( - df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex") + df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex", county="misc") ) print(edf1) diff --git a/src/pprl/embedder/features.py b/src/pprl/embedder/features.py index 1192ff8..f7747e6 100644 --- a/src/pprl/embedder/features.py +++ b/src/pprl/embedder/features.py @@ -251,7 +251,7 @@ def gen_dateofbirth_features( dob: pd.Series, dayfirst: bool = True, yearfirst: bool = False, - default: list[str] = ["day<01>", "month<01>", "year<2050>"], + default: list[str] = [], ) -> pd.Series: """Generate labelled date features from a series of dates of birth. @@ -314,6 +314,7 @@ def gen_misc_features(field: pd.Series, label: None | str | Hashable = None) -> _field = ( field.copy() + .replace("", "no_data", regex=False) .fillna("no_data") .astype("str") .str.casefold() # make everything lowercase diff --git a/test/embedder/test_features.py b/test/embedder/test_features.py index ba7b430..3f99d0d 100644 --- a/test/embedder/test_features.py +++ b/test/embedder/test_features.py @@ -434,7 +434,7 @@ def test_gen_misc_features(fields, label): assert features.dtype == list for feature, field in zip(features, fields): - if field is None or (isinstance(field, float) and pd.isna(field)): + if field is None or field == "" or (isinstance(field, float) and pd.isna(field)): assert feature == "" else: assert feature == [f"{label}<{str(field).casefold()}>"]