Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

28 enforce consistency on missing value handling in features.py #51

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions docs/tutorials/run-through.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ First, we'll import a few modules, including:

```{python}
import os

import numpy as np
import pandas as pd

from pprl import EmbeddedDataFrame, Embedder, config
Expand All @@ -34,8 +34,9 @@ df1 = pd.DataFrame(
id=[1,2,3],
forename=["Henry", "Sally", "Ina"],
surname = ["Tull", "Brown", "Lawrey"],
dob=["1/1/2001", "2/1/2001", "4/10/1995"],
dob=["", "2/1/2001", "4/10/1995"],
gender=["male", "Male", "Female"],
county=["", np.NaN, "County Durham"]
)
)

Expand All @@ -45,6 +46,7 @@ df2 = pd.DataFrame(
full_name=["Harry Tull", "Sali Brown", "Ina Laurie"],
date_of_birth=["2/1/2001", "2/1/2001", "4/11/1995"],
sex=["M", "M", "F"],
county=["Rutland", "Powys", "Durham"]
)
)
```
Expand All @@ -64,6 +66,7 @@ feature_factory = dict(
name=feat.gen_name_features,
dob=feat.gen_dateofbirth_features,
sex=feat.gen_sex_features,
misc=feat.gen_misc_features
)

ff_args = dict(name={}, sex={}, dob={})
Expand Down Expand Up @@ -91,10 +94,10 @@ contribute to the embedding.

```{python}
edf1 = embedder.embed(
df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex")
df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex", county="misc")
)
edf2 = embedder.embed(
df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex")
df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex", county="misc")
)

print(edf1)
Expand Down
3 changes: 2 additions & 1 deletion src/pprl/embedder/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def gen_dateofbirth_features(
dob: pd.Series,
dayfirst: bool = True,
yearfirst: bool = False,
default: list[str] = ["day<01>", "month<01>", "year<2050>"],
default: list[str] = [],
) -> pd.Series:
"""Generate labelled date features from a series of dates of birth.

Expand Down Expand Up @@ -314,6 +314,7 @@ def gen_misc_features(field: pd.Series, label: None | str | Hashable = None) ->

_field = (
field.copy()
.replace("", "no_data", regex=False)
.fillna("no_data")
.astype("str")
.str.casefold() # make everything lowercase
Expand Down
2 changes: 1 addition & 1 deletion test/embedder/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ def test_gen_misc_features(fields, label):
assert features.dtype == list

for feature, field in zip(features, fields):
if field is None or (isinstance(field, float) and pd.isna(field)):
if field is None or field == "" or (isinstance(field, float) and pd.isna(field)):
assert feature == ""
else:
assert feature == [f"{label}<{str(field).casefold()}>"]
Expand Down
Loading