Add DataFrameMock.df_with_private_info, a mock DataFrame with private-info
columns, and an integration test that runs anonymize_data on that mock.

diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py
index 95e7381..543025c 100644
--- a/src/tests/dataframewithinfo_util.py
+++ b/src/tests/dataframewithinfo_util.py
@@ -1,6 +1,7 @@
 import itertools
 import random
 from datetime import date
+from typing import Tuple
 
 import pandas as pd
 
@@ -381,6 +382,44 @@ def df_duplicated_columns(duplicated_cols_count: int) -> pd.DataFrame:
 
         return pd.DataFrame(df_duplicated)
 
+    @staticmethod
+    def df_with_private_info(private_cols: Tuple[str, ...]) -> pd.DataFrame:
+        """
+        Create DataFrame with private info columns along with data columns
+
+        The returned DataFrame mock contains (len(private_cols) + 2) columns
+        and 5 rows. Particularly it contains the columns listed in ``private_cols``
+        with string values, and 2 data columns ("data_col_0" and "data_col_1")
+        containing integer values.
+        The last two rows have the same values in ``private_cols`` columns, but
+        different values in the 2 data columns (this could be simulating a
+        DataFrame with multiple rows related to the same customer/patient).
+
+        Parameters
+        ----------
+        private_cols: Tuple[str, ...]
+            Names of the columns that will be created as private columns
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame mock containing (len(private_cols) + 2) columns
+            and 5 rows. Particularly it contains the columns listed in ``private_cols``
+            with generic string values, and 2 data columns containing
+            integer values.
+        """
+        sample_size = 5
+        df_private_info_dict = {}
+        for i, col in enumerate(private_cols):
+            # Only (sample_size - 1) distinct values are generated: the last one
+            # is repeated, so the final two rows share the same private info
+            # (they may be associated to the same customer)
+            distinct_values = [f"col_{i}_value_{k}" for k in range(sample_size - 1)]
+            df_private_info_dict[col] = distinct_values + distinct_values[-1:]
+        df_private_info_dict["data_col_0"] = list(range(sample_size))
+        df_private_info_dict["data_col_1"] = list(range(sample_size))
+        return pd.DataFrame(df_private_info_dict)
+
 
 class SeriesMock:
     @staticmethod
diff --git a/src/tests/integration/test_anonymize_database.py b/src/tests/integration/test_anonymize_database.py
new file mode 100644
index 0000000..b8e9191
--- /dev/null
+++ b/src/tests/integration/test_anonymize_database.py
@@ -0,0 +1,87 @@
+import pandas as pd
+import pytest
+
+from tests.dataframewithinfo_util import DataFrameMock
+
+from ...pd_extras.anonymize_database import anonymize_data
+
+
+@pytest.mark.parametrize(
+    "private_cols_to_remove, private_cols_to_map, "
+    + "expected_anonym_df, expected_private_df",
+    [
+        (
+            ["private_col_a", "private_col_b"],
+            ["private_col_a", "private_col_b", "private_col_c"],
+            pd.DataFrame(
+                {
+                    "private_col_c": {
+                        0: "col_2_value_0",
+                        1: "col_2_value_1",
+                        2: "col_2_value_2",
+                        3: "col_2_value_3",
+                        4: "col_2_value_3",
+                    },
+                    "data_col_0": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
+                    "data_col_1": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
+                    "ID_OWNER": {
+                        0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
+                        1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
+                        2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
+                        3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
+                        4: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
+                    },
+                }
+            ),
+            pd.DataFrame(
+                {
+                    "private_col_a": {
+                        0: "col_0_value_0",
+                        1: "col_0_value_1",
+                        2: "col_0_value_2",
+                        3: "col_0_value_3",
+                    },
+                    "private_col_b": {
+                        0: "col_1_value_0",
+                        1: "col_1_value_1",
+                        2: "col_1_value_2",
+                        3: "col_1_value_3",
+                    },
+                    "private_col_c": {
+                        0: "col_2_value_0",
+                        1: "col_2_value_1",
+                        2: "col_2_value_2",
+                        3: "col_2_value_3",
+                    },
+                    "ID_OWNER": {
+                        0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
+                        1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
+                        2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
+                        3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
+                    },
+                }
+            ),
+        )
+    ],
+)
+def test_anonymize_data(
+    temporary_data_dir,
+    private_cols_to_remove,
+    private_cols_to_map,
+    expected_anonym_df,
+    expected_private_df,
+):
+    """Check anonymize_data output against the expected anonymized/private data"""
+    original_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map)
+
+    anonym_df, private_df = anonymize_data(
+        df=original_df,
+        file_name="test_original_db_anonymize",
+        private_cols_to_remove=private_cols_to_remove,
+        private_cols_to_map=private_cols_to_map,
+        dest_path=str(temporary_data_dir),
+        random_seed=42,
+    )
+
+    pd.testing.assert_frame_equal(anonym_df, expected_anonym_df)
+    pd.testing.assert_frame_equal(private_df, expected_private_df)