Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests and minor fixes for "anonymize_database.anonymize_data" function #22

Merged
merged 18 commits into from
Aug 28, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
2907b5b
Added df_least_nan, df_duplicated_columns methods to create dataframe…
lorenz-gorini Aug 18, 2020
a300809
Changed from sklearn.preprocessing.OneHotEncoder to EncodingFunctions…
lorenz-gorini Aug 21, 2020
0d1d191
Fixed issue 12 by changing from EncodingFunctions.ONEHOT/ORDINAL clas…
lorenz-gorini Aug 21, 2020
a0bf2f0
Fixed repeated code after rebase
lorenz-gorini Aug 25, 2020
ce6be4d
Added df_generic Mock, that is used for mocking a generic Pandas Data…
lorenz-gorini Aug 20, 2020
4f0d72f
Changed from sklearn.preprocessing.OneHotEncoder to EncodingFunctions…
lorenz-gorini Aug 21, 2020
dec0ea1
Fixed issue 12 by changing from EncodingFunctions.ONEHOT/ORDINAL clas…
lorenz-gorini Aug 21, 2020
8f15a1e
In import_df_with_info_from_file function:
lorenz-gorini Aug 25, 2020
6c33926
Completed tests for import/export_df_with_info functions for DataFram…
lorenz-gorini Aug 25, 2020
80f0af0
Fixes issue #19 because now "show_columns_type" considers every value…
lorenz-gorini Aug 25, 2020
96cbaeb
Refactored according to flake8
lorenz-gorini Aug 25, 2020
93cff6d
Moved temporary_data_dir fixture to conftest.py since it is a generic…
lorenz-gorini Aug 26, 2020
82a5997
Fixes issue #21 by adding a "random_seed" argument to "anonymize_data…
lorenz-gorini Aug 26, 2020
591b0fc
Added test for "anonymize_database.anonymize_data" function.
lorenz-gorini Aug 26, 2020
1df8074
Fixed tests after rebase
lorenz-gorini Aug 27, 2020
f56c41a
Fixed according to PR comments (minor typos in docstrings)
lorenz-gorini Aug 27, 2020
37aa71c
Formatted according to flake8
lorenz-gorini Aug 27, 2020
1fd2ef8
Fixed minor typos docstrings
lorenz-gorini Aug 28, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions src/tests/dataframewithinfo_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import itertools
import random
from datetime import date
from typing import Tuple

import pandas as pd

Expand Down Expand Up @@ -381,6 +382,45 @@ def df_duplicated_columns(duplicated_cols_count: int) -> pd.DataFrame:

return pd.DataFrame(df_duplicated)

@staticmethod
def df_with_private_info(private_cols: Tuple[str, ...]) -> pd.DataFrame:
    """
    Create DataFrame with private info columns along with data columns

    The returned DataFrame mock contains (len(private_cols) + 2) columns
    and 5 rows. Particularly it contains the columns listed in
    ``private_cols`` with string values, and 2 data columns
    ("data_col_0" and "data_col_1") containing the integer values 0..4.
    The last two rows have the same values in the ``private_cols`` columns,
    but different values in the 2 data columns (this could be simulating a
    DataFrame with multiple rows related to the same customer/patient).

    Parameters
    ----------
    private_cols: Tuple[str, ...]
        Names of the columns that will be created as private columns.
        The value at position ``i`` produces a column filled with strings
        of the form "col_{i}_value_{k}".

    Returns
    -------
    pd.DataFrame
        DataFrame mock containing (len(private_cols) + 2) columns
        and 5 rows: the columns listed in ``private_cols`` with generic
        string values, followed by 2 data columns containing integer
        values.
    """
    sample_size = 5
    df_private_info_dict = {}
    for i, col in enumerate(private_cols):
        # Only (sample_size - 1) distinct values are generated per column...
        col_values = [f"col_{i}_value_{k}" for k in range(sample_size - 1)]
        # ...and the last one is repeated, so that the final two rows share
        # the same private info (it may be associated to the same customer)
        col_values.append(col_values[-1])
        df_private_info_dict[col] = col_values
    # The data columns are added last so they follow the private columns
    for data_col in ("data_col_0", "data_col_1"):
        df_private_info_dict[data_col] = list(range(sample_size))
    return pd.DataFrame(df_private_info_dict)


class SeriesMock:
@staticmethod
Expand Down
87 changes: 87 additions & 0 deletions src/tests/integration/test_anonymize_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import pandas as pd
import pytest

from tests.dataframewithinfo_util import DataFrameMock

from ...pd_extras.anonymize_database import anonymize_data


# ``ID_OWNER`` values the test expects for the four distinct owners of the
# mock DataFrame when ``anonymize_data`` is called with ``random_seed=42``.
_EXPECTED_OWNER_IDS = [
    "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
    "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
    "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
    "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
]


def _by_position(values):
    """Return ``{0: values[0], 1: values[1], ...}`` for use as a column."""
    return dict(enumerate(values))


def _private_column(col_position, value_ids):
    """Build the expected content of the private column at ``col_position``."""
    return _by_position([f"col_{col_position}_value_{k}" for k in value_ids])


@pytest.mark.parametrize(
    (
        "private_cols_to_remove, private_cols_to_map, "
        "expected_anonym_df, expected_private_df"
    ),
    [
        (
            ["private_col_a", "private_col_b"],
            ["private_col_a", "private_col_b", "private_col_c"],
            # Anonymized data: 5 rows, the last two belonging to the same owner
            pd.DataFrame(
                {
                    "private_col_c": _private_column(2, [0, 1, 2, 3, 3]),
                    "data_col_0": _by_position(range(5)),
                    "data_col_1": _by_position(range(5)),
                    "ID_OWNER": _by_position(
                        _EXPECTED_OWNER_IDS + _EXPECTED_OWNER_IDS[-1:]
                    ),
                }
            ),
            # Private data: one row per distinct owner (4 rows)
            pd.DataFrame(
                {
                    "private_col_a": _private_column(0, range(4)),
                    "private_col_b": _private_column(1, range(4)),
                    "private_col_c": _private_column(2, range(4)),
                    "ID_OWNER": _by_position(_EXPECTED_OWNER_IDS),
                }
            ),
        )
    ],
)
def test_anonymize_data(
    temporary_data_dir,
    private_cols_to_remove,
    private_cols_to_map,
    expected_anonym_df,
    expected_private_df,
):
    """
    Check both DataFrames returned by ``anonymize_data`` on a mock DataFrame

    The mock is created with ``private_cols_to_map`` as private columns and
    the function is run with a fixed ``random_seed`` so that the returned
    frames can be compared with the expected ones.
    """
    source_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map)

    actual_anonym_df, actual_private_df = anonymize_data(
        df=source_df,
        file_name="test_original_db_anonymize",
        private_cols_to_remove=private_cols_to_remove,
        private_cols_to_map=private_cols_to_map,
        dest_path=str(temporary_data_dir),
        random_seed=42,
    )

    pd.testing.assert_frame_equal(actual_anonym_df, expected_anonym_df)
    pd.testing.assert_frame_equal(actual_private_df, expected_private_df)