Added test for "anonymize_database.anonymize_data" function.

Added the DataFrame mock "df_with_private_info", which creates a DataFrame with some private columns alongside other data columns.
HK3-Lab-Team · Aug 26, 2020 · a7ba5f8 · a7ba5f8
1 parent 447e4ce
commit a7ba5f8
Show file tree

Hide file tree

Showing 2 changed files with 127 additions and 0 deletions.
diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py
@@ -1,6 +1,7 @@
 import itertools
 import random
 from datetime import date
+from typing import Tuple
 
 import pandas as pd
 
@@ -381,6 +382,45 @@ def df_duplicated_columns(duplicated_cols_count: int) -> pd.DataFrame:
 
         return pd.DataFrame(df_duplicated)
 
+    @staticmethod
+    def df_with_private_info(private_cols: Tuple[str, ...]) -> pd.DataFrame:
+        """
+        Create a DataFrame mock with private-info columns and two data columns.
+
+        The returned DataFrame contains (len(private_cols) + 2) columns and
+        5 rows. Each column listed in ``private_cols`` holds the strings
+        "col_<i>_value_<k>", where <i> is the position of the column in
+        ``private_cols``; "data_col_0" and "data_col_1" both hold 0..4.
+        The last two rows share the same values in the ``private_cols`` columns
+        but differ in the 2 data columns (this could be simulating a DataFrame
+        with multiple rows related to the same customer/patient).
+
+        Parameters
+        ----------
+        private_cols: Tuple[str, ...]
+            Names of the columns that will be created as private columns
+
+        Returns
+        -------
+        pd.DataFrame
+            DataFrame mock containing (len(private_cols) + 2) columns
+            and 5 rows: the columns listed in ``private_cols`` with generic
+            string values, and the 2 data columns "data_col_0" and "data_col_1"
+            containing integer values.
+
+        """
+        df_private_info_dict = {}
+        sample_size = 5
+        for i, col in enumerate(private_cols):
+            df_private_info_dict[col] = [
+                f"col_{i}_value_{k}" for k in range(sample_size - 1)
+            ]
+            # Repeat the last value so the final two rows share the same private info
+            df_private_info_dict[col].append(f"col_{i}_value_{sample_size-2}")
+        df_private_info_dict["data_col_0"] = list(range(sample_size))
+        df_private_info_dict["data_col_1"] = list(range(sample_size))
+        return pd.DataFrame(df_private_info_dict)
+
 
 class SeriesMock:
     @staticmethod

diff --git a/src/tests/integration/test_anonymize_database.py b/src/tests/integration/test_anonymize_database.py
@@ -0,0 +1,87 @@
+import pandas as pd
+import pytest
+
+from tests.dataframewithinfo_util import DataFrameMock
+
+from ...pd_extras.anonymize_database import anonymize_data
+
+
+@pytest.mark.parametrize(
+    "private_cols_to_remove, private_cols_to_map, "
+    + "expected_anonym_df, expected_private_df",
+    [
+        (
+            ["private_col_a", "private_col_b"],
+            ["private_col_a", "private_col_b", "private_col_c"],
+            pd.DataFrame(  # anonym_df: private_col_a/b gone, ID_OWNER added
+                {
+                    "private_col_c": {
+                        0: "col_2_value_0",
+                        1: "col_2_value_1",
+                        2: "col_2_value_2",
+                        3: "col_2_value_3",
+                        4: "col_2_value_3",  # same value as row 3
+                    },
+                    "data_col_0": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
+                    "data_col_1": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
+                    "ID_OWNER": {
+                        0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
+                        1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
+                        2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
+                        3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
+                        4: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
+                    },
+                }
+            ),
+            pd.DataFrame(  # private_df: rows 0-3 only, duplicate row 4 absent
+                {
+                    "private_col_a": {
+                        0: "col_0_value_0",
+                        1: "col_0_value_1",
+                        2: "col_0_value_2",
+                        3: "col_0_value_3",
+                    },
+                    "private_col_b": {
+                        0: "col_1_value_0",
+                        1: "col_1_value_1",
+                        2: "col_1_value_2",
+                        3: "col_1_value_3",
+                    },
+                    "private_col_c": {
+                        0: "col_2_value_0",
+                        1: "col_2_value_1",
+                        2: "col_2_value_2",
+                        3: "col_2_value_3",
+                    },
+                    "ID_OWNER": {
+                        0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31",
+                        1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73",
+                        2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24",
+                        3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a",
+                    },
+                }
+            ),
+        )
+    ],
+)
+def test_anonymize_data(
+    temporary_data_dir,
+    private_cols_to_remove,
+    private_cols_to_map,
+    expected_anonym_df,
+    expected_private_df,
+):
+    """Check both frames returned by anonymize_data against the expected ones."""
+    original_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map)
+
+    anonym_df, private_df = anonymize_data(
+        df=original_df,
+        file_name="test_original_db_anonymize",
+        private_cols_to_remove=private_cols_to_remove,
+        private_cols_to_map=private_cols_to_map,
+        dest_path=str(temporary_data_dir),
+        random_seed=42,
+    )
+
+    pd.testing.assert_frame_equal(anonym_df, expected_anonym_df)
+    pd.testing.assert_frame_equal(private_df, expected_private_df)