Skip to content

Commit

Permalink
Added test for "anonymize_database.anonymize_data" function.
Browse files Browse the repository at this point in the history
Added DataFrame mock "df_with_private_info" for creating a df with some private columns and other data columns
  • Loading branch information
lorenz-gorini committed Aug 26, 2020
1 parent 447e4ce commit a7ba5f8
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 0 deletions.
40 changes: 40 additions & 0 deletions src/tests/dataframewithinfo_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import itertools
import random
from datetime import date
from typing import Tuple

import pandas as pd

Expand Down Expand Up @@ -381,6 +382,45 @@ def df_duplicated_columns(duplicated_cols_count: int) -> pd.DataFrame:

return pd.DataFrame(df_duplicated)

@staticmethod
def df_with_private_info(
    private_cols: Tuple[str, ...], *, sample_size: int = 5
) -> pd.DataFrame:
    """
    Create DataFrame with private info columns along with data columns

    The returned DataFrame mock contains (len(private_cols) + 2) columns
    and ``sample_size`` rows. Particularly it contains the columns listed in
    ``private_cols`` with string values, and 2 data columns ("data_col_0" and
    "data_col_1") containing integer values.
    The last two rows have the same values in ``private_cols`` columns, but
    different values in the 2 data columns (this could be simulating a
    DataFrame with multiple rows related to the same customer/patient).

    Parameters
    ----------
    private_cols: Tuple[str, ...]
        Names of the columns that will be created as private columns. The
        argument is only iterated, so any iterable of strings is accepted.
    sample_size: int, optional
        Number of rows of the returned DataFrame. It must be at least 2
        because one row is always a duplicate of the previous one.
        Default set to 5.

    Returns
    -------
    pd.DataFrame
        DataFrame mock with one string column per element of
        ``private_cols`` (values formatted as "col_<i>_value_<k>") followed
        by the integer columns "data_col_0" and "data_col_1", both
        containing ``0 .. sample_size - 1``.

    Raises
    ------
    ValueError
        If ``sample_size`` is lower than 2.
    """
    if sample_size < 2:
        raise ValueError(
            f"sample_size must be at least 2 to include a duplicated row, "
            f"got {sample_size}"
        )

    df_private_info_dict = {}
    for i, col in enumerate(private_cols):
        values = [f"col_{i}_value_{k}" for k in range(sample_size - 1)]
        # Add a duplicated row (it may be associated to the same customer)
        values.append(values[-1])
        df_private_info_dict[col] = values
    # Data columns are fully distinct, so the duplicated private rows still
    # differ in their data values.
    df_private_info_dict["data_col_0"] = list(range(sample_size))
    df_private_info_dict["data_col_1"] = list(range(sample_size))
    return pd.DataFrame(df_private_info_dict)


class SeriesMock:
@staticmethod
Expand Down
87 changes: 87 additions & 0 deletions src/tests/integration/test_anonymize_database.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import pandas as pd
import pytest

from tests.dataframewithinfo_util import DataFrameMock

from ...pd_extras.anonymize_database import anonymize_data


@pytest.mark.parametrize(
    (
        "private_cols_to_remove",
        "private_cols_to_map",
        "expected_anonym_df",
        "expected_private_df",
    ),
    [
        (
            ["private_col_a", "private_col_b"],
            ["private_col_a", "private_col_b", "private_col_c"],
            # Anonymized frame: removed columns are gone, the owner hash is
            # added and the duplicated owner (rows 3 and 4) shares one hash.
            pd.DataFrame(
                {
                    "private_col_c": dict(
                        enumerate(
                            [
                                "col_2_value_0",
                                "col_2_value_1",
                                "col_2_value_2",
                                "col_2_value_3",
                                "col_2_value_3",
                            ]
                        )
                    ),
                    "data_col_0": dict(enumerate(range(5))),
                    "data_col_1": dict(enumerate(range(5))),
                    "ID_OWNER": dict(
                        enumerate(
                            [
                                "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
                                "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
                                "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
                                "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
                                "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
                            ]
                        )
                    ),
                }
            ),
            # Private mapping frame: one row per distinct owner with all the
            # mapped private columns and the matching owner hash.
            pd.DataFrame(
                {
                    "private_col_a": dict(
                        enumerate(
                            [
                                "col_0_value_0",
                                "col_0_value_1",
                                "col_0_value_2",
                                "col_0_value_3",
                            ]
                        )
                    ),
                    "private_col_b": dict(
                        enumerate(
                            [
                                "col_1_value_0",
                                "col_1_value_1",
                                "col_1_value_2",
                                "col_1_value_3",
                            ]
                        )
                    ),
                    "private_col_c": dict(
                        enumerate(
                            [
                                "col_2_value_0",
                                "col_2_value_1",
                                "col_2_value_2",
                                "col_2_value_3",
                            ]
                        )
                    ),
                    "ID_OWNER": dict(
                        enumerate(
                            [
                                "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
                                "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
                                "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
                                "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
                            ]
                        )
                    ),
                }
            ),
        )
    ],
)
def test_anonymize_data(
    temporary_data_dir,
    private_cols_to_remove,
    private_cols_to_map,
    expected_anonym_df,
    expected_private_df,
):
    """
    Check both DataFrames returned by ``anonymize_data`` on a mock DataFrame

    The input is built by ``DataFrameMock.df_with_private_info`` from
    ``private_cols_to_map``; the two returned DataFrames are compared with
    ``expected_anonym_df`` and ``expected_private_df`` respectively.
    """
    source_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map)

    result = anonymize_data(
        df=source_df,
        file_name="test_original_db_anonymize",
        private_cols_to_remove=private_cols_to_remove,
        private_cols_to_map=private_cols_to_map,
        dest_path=str(temporary_data_dir),
        random_seed=42,
    )
    anonymized, private_mapping = result

    pd.testing.assert_frame_equal(anonymized, expected_anonym_df)
    pd.testing.assert_frame_equal(private_mapping, expected_private_df)

0 comments on commit a7ba5f8

Please sign in to comment.