From 2907b5b1a4ffd32456f6b601a5af63d4c261771c Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 18 Aug 2020 17:52:50 +0200 Subject: [PATCH 01/18] Added df_least_nan, df_duplicated_columns methods to create dataframe mocks Added test_med_exam_col_list, test_least_nan_cols, test_contains_duplicated_features, test_show_columns_type for the related dataframewithinfo methods --- src/tests/integration/test_dataframe_with_info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index e920144..04b0584 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -18,6 +18,7 @@ ) from ...pd_extras.exceptions import MultipleOperationsFoundError, NotShelveFileError from ...pd_extras.feature_enum import EncodingFunctions, OperationTypeEnum + from ..dataframewithinfo_util import DataFrameMock, SeriesMock from ..featureoperation_util import eq_featureoperation_combs From a300809b5e29ac6a849c06f8de8d812284cff465 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 13:00:43 +0200 Subject: [PATCH 02/18] Changed from sklearn.preprocessing.OneHotEncoder to EncodingFunctions.ONEHOT (to exploit the Enum that gathers the supported functions for encoding. Added tests for "get_enc_column_from_original" and "get_original_from_enc_column" methods Added a FeatureOperation to df_info_with_operations fixture --- .../integration/test_dataframe_with_info.py | 97 ++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index 04b0584..a9581af 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -18,7 +18,6 @@ ) from ...pd_extras.exceptions import MultipleOperationsFoundError, NotShelveFileError from ...pd_extras.feature_enum import EncodingFunctions, OperationTypeEnum - from ..dataframewithinfo_util import DataFrameMock, SeriesMock from ..featureoperation_util import eq_featureoperation_combs @@ -698,6 +697,102 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) + @pytest.mark.parametrize( + "original_column, encoder, expected_encoded_columns", + [ + ( # Case 1: Everything specified and found + "fop_original_col_0", + EncodingFunctions.ONEHOT, + ("fop_derived_col_0",), + ), + ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns + ("fop_original_col_10", None, None), # Case 3: No operation associated + # Case 4: No encoder specified + ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), + ], + ) + def test_get_enc_column_from_original( + self, + request, + df_info_with_operations, + original_column, + encoder, + expected_encoded_columns, + ): + encoded_columns = df_info_with_operations.get_enc_column_from_original( + column_name=original_column, encoder=encoder + ) + + if expected_encoded_columns is None: + assert encoded_columns is None + else: + assert isinstance(encoded_columns, tuple) + assert len(encoded_columns) == len(expected_encoded_columns) + assert set(encoded_columns) == set(expected_encoded_columns) + + def test_get_enc_column_from_original_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + _ = df_info_with_operations.get_enc_column_from_original( + column_name="fop_original_col_0" + ) + + 
assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. Please provide additional information" + in str(err.value) + ) + + @pytest.mark.parametrize( + "encoded_column, encoder, expected_original_columns", + [ + ( # Case 1: Everything specified and found + "fop_derived_col_0", + EncodingFunctions.ONEHOT, + ("fop_original_col_0",), + ), + # Case 2: No encoder specified + ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), + ("fop_derived_col_10", None, None), # Case 3: No operation associated + # Case 4: Column_name in original_columns + ("fop_original_col_2", None, None), + ], + ) + def test_get_original_from_enc_column( + self, + request, + df_info_with_operations, + encoded_column, + encoder, + expected_original_columns, + ): + original_columns = df_info_with_operations.get_original_from_enc_column( + column_name=encoded_column, encoder=encoder + ) + + if expected_original_columns is None: + assert original_columns is None + else: + assert isinstance(original_columns, tuple) + assert len(original_columns) == len(expected_original_columns) + assert set(original_columns) == set(expected_original_columns) + + def test_get_original_from_enc_column_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + + _ = df_info_with_operations.get_original_from_enc_column( + column_name="fop_derived_col_0" + ) + + assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. Please provide additional information" + in str(err.value) + ) + class Describe_FeatureOperation: @pytest.mark.parametrize( From 0d1d19112f8caf41261908c52ba5aed0e466c22f Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 21:54:17 +0200 Subject: [PATCH 03/18] Fixed issue 12 by changing from EncodingFunctions.ONEHOT/ORDINAL class to EncodingFunctions.ONEHOT.value() , which is actually an instance since it should be the actual encoder instance used to encode the feature --- src/pd_extras/dataframe_with_info.py | 6 +++++- src/tests/integration/test_dataframe_with_info.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pd_extras/dataframe_with_info.py b/src/pd_extras/dataframe_with_info.py index fc7670c..f46c666 100644 --- a/src/pd_extras/dataframe_with_info.py +++ b/src/pd_extras/dataframe_with_info.py @@ -12,7 +12,11 @@ import sklearn from joblib import Parallel, delayed -from .exceptions import MultipleObjectsInFileError, MultipleOperationsFoundError, NotShelveFileError +from .exceptions import ( + MultipleObjectsInFileError, + MultipleOperationsFoundError, + NotShelveFileError, +) from .feature_enum import EncodingFunctions, OperationTypeEnum from .settings import CATEG_COL_THRESHOLD diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index a9581af..7b5e9ba 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -702,7 +702,7 @@ def test_get_original_from_enc_column_raise_multicolfound_error( [ ( # Case 1: Everything specified and found "fop_original_col_0", - EncodingFunctions.ONEHOT, + EncodingFunctions.ONEHOT.value(), ("fop_derived_col_0",), ), ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns @@ -749,7 +749,7 @@ def test_get_enc_column_from_original_raise_error( [ ( # Case 1: Everything specified and found "fop_derived_col_0", - EncodingFunctions.ONEHOT, + 
EncodingFunctions.ONEHOT.value(), ("fop_original_col_0",), ), # Case 2: No encoder specified From a0bf2f02681e62d1a858004e2d81e9b40456b259 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 11:22:36 +0200 Subject: [PATCH 04/18] Fixed repeated code after rebase --- .../integration/test_dataframe_with_info.py | 96 ------------------- 1 file changed, 96 deletions(-) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index 7b5e9ba..e920144 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -697,102 +697,6 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) - @pytest.mark.parametrize( - "original_column, encoder, expected_encoded_columns", - [ - ( # Case 1: Everything specified and found - "fop_original_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_derived_col_0",), - ), - ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns - ("fop_original_col_10", None, None), # Case 3: No operation associated - # Case 4: No encoder specified - ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), - ], - ) - def test_get_enc_column_from_original( - self, - request, - df_info_with_operations, - original_column, - encoder, - expected_encoded_columns, - ): - encoded_columns = df_info_with_operations.get_enc_column_from_original( - column_name=original_column, encoder=encoder - ) - - if expected_encoded_columns is None: - assert encoded_columns is None - else: - assert isinstance(encoded_columns, tuple) - assert len(encoded_columns) == len(expected_encoded_columns) - assert set(encoded_columns) == set(expected_encoded_columns) - - def test_get_enc_column_from_original_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - _ = df_info_with_operations.get_enc_column_from_original( - column_name="fop_original_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. 
Please provide additional information" - in str(err.value) - ) - - @pytest.mark.parametrize( - "encoded_column, encoder, expected_original_columns", - [ - ( # Case 1: Everything specified and found - "fop_derived_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_original_col_0",), - ), - # Case 2: No encoder specified - ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), - ("fop_derived_col_10", None, None), # Case 3: No operation associated - # Case 4: Column_name in original_columns - ("fop_original_col_2", None, None), - ], - ) - def test_get_original_from_enc_column( - self, - request, - df_info_with_operations, - encoded_column, - encoder, - expected_original_columns, - ): - original_columns = df_info_with_operations.get_original_from_enc_column( - column_name=encoded_column, encoder=encoder - ) - - if expected_original_columns is None: - assert original_columns is None - else: - assert isinstance(original_columns, tuple) - assert len(original_columns) == len(expected_original_columns) - assert set(original_columns) == set(expected_original_columns) - - def test_get_original_from_enc_column_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - - _ = df_info_with_operations.get_original_from_enc_column( - column_name="fop_derived_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. Please provide additional information" - in str(err.value) - ) - class Describe_FeatureOperation: @pytest.mark.parametrize( From ce6be4da2364447d3e48b8be745e2d419a4f573c Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 20 Aug 2020 18:04:28 +0200 Subject: [PATCH 05/18] Added df_generic Mock, that is used for mocking a generic Pandas DataFrame Moved some tests inside Describe_DataFraneWithInfo class Added tests for add_operation and find_operation_in_column methods Added fixture for creating a DataFrameWithInfo instance with previous FeatureOperation instances (already added to be found) --- src/tests/dataframewithinfo_util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index 95e7381..cc35b17 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -3,6 +3,11 @@ from datetime import date import pandas as pd +import pytest +import sklearn + +from ..pd_extras.dataframe_with_info import DataFrameWithInfo, FeatureOperation +from ..pd_extras.feature_enum import OperationTypeEnum class DataFrameMock: From 4f0d72f519bca3cdcff2cb691d70ca8b86dfdf1b Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 13:00:43 +0200 Subject: [PATCH 06/18] Changed from sklearn.preprocessing.OneHotEncoder to EncodingFunctions.ONEHOT (to exploit the Enum that gathers the supported functions for encoding. 
Added tests for "get_enc_column_from_original" and "get_original_from_enc_column" methods Added a FeatureOperation to df_info_with_operations fixture --- .../integration/test_dataframe_with_info.py | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index e920144..a9581af 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -697,6 +697,102 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) + @pytest.mark.parametrize( + "original_column, encoder, expected_encoded_columns", + [ + ( # Case 1: Everything specified and found + "fop_original_col_0", + EncodingFunctions.ONEHOT, + ("fop_derived_col_0",), + ), + ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns + ("fop_original_col_10", None, None), # Case 3: No operation associated + # Case 4: No encoder specified + ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), + ], + ) + def test_get_enc_column_from_original( + self, + request, + df_info_with_operations, + original_column, + encoder, + expected_encoded_columns, + ): + encoded_columns = df_info_with_operations.get_enc_column_from_original( + column_name=original_column, encoder=encoder + ) + + if expected_encoded_columns is None: + assert encoded_columns is None + else: + assert isinstance(encoded_columns, tuple) + assert len(encoded_columns) == len(expected_encoded_columns) + assert set(encoded_columns) == set(expected_encoded_columns) + + def test_get_enc_column_from_original_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + _ = df_info_with_operations.get_enc_column_from_original( + column_name="fop_original_col_0" + ) + + assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. Please provide additional information" + in str(err.value) + ) + + @pytest.mark.parametrize( + "encoded_column, encoder, expected_original_columns", + [ + ( # Case 1: Everything specified and found + "fop_derived_col_0", + EncodingFunctions.ONEHOT, + ("fop_original_col_0",), + ), + # Case 2: No encoder specified + ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), + ("fop_derived_col_10", None, None), # Case 3: No operation associated + # Case 4: Column_name in original_columns + ("fop_original_col_2", None, None), + ], + ) + def test_get_original_from_enc_column( + self, + request, + df_info_with_operations, + encoded_column, + encoder, + expected_original_columns, + ): + original_columns = df_info_with_operations.get_original_from_enc_column( + column_name=encoded_column, encoder=encoder + ) + + if expected_original_columns is None: + assert original_columns is None + else: + assert isinstance(original_columns, tuple) + assert len(original_columns) == len(expected_original_columns) + assert set(original_columns) == set(expected_original_columns) + + def test_get_original_from_enc_column_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + + _ = df_info_with_operations.get_original_from_enc_column( + column_name="fop_derived_col_0" + ) + + assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. 
Please provide additional information" + in str(err.value) + ) + class Describe_FeatureOperation: @pytest.mark.parametrize( From dec0ea17a14c63366349622f605ca09c2e56764d Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 21:54:17 +0200 Subject: [PATCH 07/18] Fixed issue 12 by changing from EncodingFunctions.ONEHOT/ORDINAL class to EncodingFunctions.ONEHOT.value() , which is actually an instance since it should be the actual encoder instance used to encode the feature --- src/tests/integration/test_dataframe_with_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index a9581af..7b5e9ba 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -702,7 +702,7 @@ def test_get_original_from_enc_column_raise_multicolfound_error( [ ( # Case 1: Everything specified and found "fop_original_col_0", - EncodingFunctions.ONEHOT, + EncodingFunctions.ONEHOT.value(), ("fop_derived_col_0",), ), ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns @@ -749,7 +749,7 @@ def test_get_enc_column_from_original_raise_error( [ ( # Case 1: Everything specified and found "fop_derived_col_0", - EncodingFunctions.ONEHOT, + EncodingFunctions.ONEHOT.value(), ("fop_original_col_0",), ), # Case 2: No encoder specified From 8f15a1e00a4cafd40427f9d6f64b80420803eaf1 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 16:28:36 +0200 Subject: [PATCH 08/18] In import_df_with_info_from_file function: Added a new error exception "NotShelveFileError" to handle the case where the filename is not a file created by using shelve module. Fixed isinstance typo --- src/pd_extras/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pd_extras/exceptions.py b/src/pd_extras/exceptions.py index d852b32..15032f0 100644 --- a/src/pd_extras/exceptions.py +++ b/src/pd_extras/exceptions.py @@ -30,3 +30,7 @@ class NotShelveFileError(Exception): """ pass + + +class NotShelveFileError(Exception): + pass From 6c339260ec80a110cf17061e9a85e39ba23fb346 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 16:32:14 +0200 Subject: [PATCH 09/18] Completed tests for import/export_df_with_info functions for DataFrameWithInfo instances. Added two fixtures for creating generic files with 'shelve' module and the builtin functions. Fixed tests after rebase. 
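
For reference, the shelve-based fixtures follow roughly this shape (a
sketch with assumed names, not the exact fixture code; only the stdlib
``shelve`` API is taken as given):

    import shelve

    import pytest

    @pytest.fixture
    def generic_shelve_file(tmp_path):
        # ``shelve`` pickles objects under string keys; writing a single
        # key produces the kind of file the import/export tests read back.
        file_path = tmp_path / "generic_shelve_file"
        with shelve.open(str(file_path)) as db:
            db["df_info"] = {"a": 1}
        return file_path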
--- src/tests/dataframewithinfo_util.py | 5 - .../integration/test_dataframe_with_info.py | 96 ------------------- 2 files changed, 101 deletions(-) diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index cc35b17..95e7381 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -3,11 +3,6 @@ from datetime import date import pandas as pd -import pytest -import sklearn - -from ..pd_extras.dataframe_with_info import DataFrameWithInfo, FeatureOperation -from ..pd_extras.feature_enum import OperationTypeEnum class DataFrameMock: diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index 7b5e9ba..e920144 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -697,102 +697,6 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) - @pytest.mark.parametrize( - "original_column, encoder, expected_encoded_columns", - [ - ( # Case 1: Everything specified and found - "fop_original_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_derived_col_0",), - ), - ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns - ("fop_original_col_10", None, None), # Case 3: No operation associated - # Case 4: No encoder specified - ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), - ], - ) - def test_get_enc_column_from_original( - self, - request, - df_info_with_operations, - original_column, - encoder, - expected_encoded_columns, - ): - encoded_columns = df_info_with_operations.get_enc_column_from_original( - column_name=original_column, encoder=encoder - ) - - if expected_encoded_columns is None: - assert encoded_columns is None - else: - assert isinstance(encoded_columns, tuple) - assert len(encoded_columns) == len(expected_encoded_columns) - assert set(encoded_columns) == set(expected_encoded_columns) - - def test_get_enc_column_from_original_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - _ = df_info_with_operations.get_enc_column_from_original( - column_name="fop_original_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. 
Please provide additional information" - in str(err.value) - ) - - @pytest.mark.parametrize( - "encoded_column, encoder, expected_original_columns", - [ - ( # Case 1: Everything specified and found - "fop_derived_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_original_col_0",), - ), - # Case 2: No encoder specified - ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), - ("fop_derived_col_10", None, None), # Case 3: No operation associated - # Case 4: Column_name in original_columns - ("fop_original_col_2", None, None), - ], - ) - def test_get_original_from_enc_column( - self, - request, - df_info_with_operations, - encoded_column, - encoder, - expected_original_columns, - ): - original_columns = df_info_with_operations.get_original_from_enc_column( - column_name=encoded_column, encoder=encoder - ) - - if expected_original_columns is None: - assert original_columns is None - else: - assert isinstance(original_columns, tuple) - assert len(original_columns) == len(expected_original_columns) - assert set(original_columns) == set(expected_original_columns) - - def test_get_original_from_enc_column_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - - _ = df_info_with_operations.get_original_from_enc_column( - column_name="fop_derived_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. Please provide additional information" - in str(err.value) - ) - class Describe_FeatureOperation: @pytest.mark.parametrize( From 80f0af081656fd4de9356f24103f3636bb57764d Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 17:52:04 +0200 Subject: [PATCH 10/18] Fixes issue #19 because now "show_columns_type" considers every value in the column instead of the first one only. 
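
The idea of the fix, as a minimal sketch (assumed helper name, not the
actual ``show_columns_type`` implementation):

    # Classify a column from the set of types of *all* its values,
    # instead of trusting only the first value.
    def column_value_types(df, col):
        return {type(value) for value in df[col]}

    # e.g. a column [1, "a", 2.0] yields {int, str, float}, so mixed-type
    # columns are no longer labelled from their first element alone.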
--- src/pd_extras/dataframe_with_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pd_extras/dataframe_with_info.py b/src/pd_extras/dataframe_with_info.py index f46c666..c67700e 100644 --- a/src/pd_extras/dataframe_with_info.py +++ b/src/pd_extras/dataframe_with_info.py @@ -784,7 +784,7 @@ def show_columns_type(self, col_list: Tuple[str] = None) -> None: - bool -> "bool_col" - str -> "string_col" - other types -> "other_col" - + Parameters ---------- col_list: Tuple[str], optional From 96cbaeb642832143c1e2bcbb692e7b89a5b8821a Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 17:53:14 +0200 Subject: [PATCH 11/18] Refactored according to flake8 --- src/pd_extras/dataframe_with_info.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/pd_extras/dataframe_with_info.py b/src/pd_extras/dataframe_with_info.py index c67700e..fc7670c 100644 --- a/src/pd_extras/dataframe_with_info.py +++ b/src/pd_extras/dataframe_with_info.py @@ -12,11 +12,7 @@ import sklearn from joblib import Parallel, delayed -from .exceptions import ( - MultipleObjectsInFileError, - MultipleOperationsFoundError, - NotShelveFileError, -) +from .exceptions import MultipleObjectsInFileError, MultipleOperationsFoundError, NotShelveFileError from .feature_enum import EncodingFunctions, OperationTypeEnum from .settings import CATEG_COL_THRESHOLD @@ -784,7 +780,7 @@ def show_columns_type(self, col_list: Tuple[str] = None) -> None: - bool -> "bool_col" - str -> "string_col" - other types -> "other_col" - + Parameters ---------- col_list: Tuple[str], optional From 93cff6d4186d06c5f3e80d02a4d3e1fec8b1b7a6 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Wed, 26 Aug 2020 18:36:29 +0200 Subject: [PATCH 12/18] Moved temporary_data_dir fixture to conftest.py since it is a generic fixture that may be useful to multiple test scripts --- src/tests/conftest.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/tests/conftest.py diff --git a/src/tests/conftest.py b/src/tests/conftest.py new file mode 100644 index 0000000..94d86fa --- /dev/null +++ b/src/tests/conftest.py @@ -0,0 +1,38 @@ +import os +import shutil +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="module") +def temporary_data_dir(request) -> Path: + """ + Create a temporary directory for test data and delete it after test end. + + The temporary directory is created in the working directory and it is + named "temp_test_data_folder". + The fixture uses a finalizer that deletes the temporary directory where + every test data was saved. Therefore every time the user calls tests that + use this fixture (and save data inside the returned directory), at the end + of the test the finalizer deletes this directory. + + Parameters + ---------- + + Returns + ------- + Path + Path where every temporary file used by tests is saved. + """ + temp_data_dir = Path(os.getcwd()) / "temp_test_data_folder" + try: + os.mkdir(temp_data_dir) + except FileExistsError: + pass + + def remove_temp_dir_created(): + shutil.rmtree(temp_data_dir) + + request.addfinalizer(remove_temp_dir_created) + return temp_data_dir From 82a5997523c630c28f22d7acfee40395a519f1b7 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Wed, 26 Aug 2020 18:49:17 +0200 Subject: [PATCH 13/18] Fixes issue #21 by adding a "random_seed" argument to "anonymize_data" function. Added and reformatted all the comments. Modified code to drop duplicates in DataFrame (function "create_private_info_db"). 
Fixes undefined "df_sani" variable with the correct variable. --- src/pd_extras/anonymize_database.py | 150 +++++++++++++++++++--------- 1 file changed, 105 insertions(+), 45 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index 75eb3ef..c8fc2b9 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -2,6 +2,8 @@ import os import random import string +from pathlib import Path +from typing import Tuple, Union import numpy as np import pandas as pd @@ -26,13 +28,22 @@ def add_id_owner_col(private_df, cols_to_hash): """ This function uses the columns of the "private_df" database to generate an hash value and it creates an "ID_OWNER" column with those values. - To generate hash values, we add nonces (random prefix and suffix) to the column values and then we use "sha256". - See https://medium.com/luckspark/hashing-pandas-dataframe-column-with-nonce-763a8c23a833 for more info. - - :param private_df: Pandas.DataFrame with the owner's private data - :param cols_to_hash: This is a list of column names with the infos we want to hash - - :return: Pandas.DataFrame similar to "private_df" with a new "ID_OWNER" column + To generate hash values, the function adds nonces (random prefix and suffix) + to the column values and then we use "sha256". + See https://medium.com/luckspark/hashing-pandas-dataframe-column-with-nonce-763a8c23a833 + for more info. + + Parameters + ---------- + private_df: pd.DataFrame + Pandas.DataFrame with the owner's private data + cols_to_hash: Tuple[str] + This is a list of column names with the infos we want to hash + + Returns + ------- + pd.DataFrame + Pandas.DataFrame similar to ``private_df`` with a new "ID_OWNER" column """ # Turn rows into strings to be used rows_into_strings = np.sum( @@ -53,29 +64,41 @@ def hash_lambda(owner_name): return private_df -def create_private_info_db(df, private_cols_to_map): +def create_private_info_db( + df: pd.DataFrame, private_cols_to_map: Tuple[str] +) -> pd.DataFrame: """ - This function creates a Pandas.DataFrame where you will store all the owner's - private data needed to identify them. - These informations are listed in "private_cols_to_map" argument. - - :param df: Pandas.DataFrame that we will anonymize - :param private_cols_to_map: This is a list of the columns that will be stored in the - private_db that will be returned, along with the new "ID_OWNER" - :return: Pandas.DataFrame with the values of the "private_cols_to_map" and their hashed value in the column "ID_OWNER" + Create a DataFrame with private data and a unique ID. + + This function will store in a DataFrame all the owner's private data + contained in the columns ``private_cols_to_map`` needed to identify them. + The function will also add a unique owner ID (in the column "OWNER_ID") that + is hashed based on ``private_cols_to_map``. + In case there are multiple rows with the same private infos + (e.g.: multiple data from the same customer), only one of those rows + is included in the returned DataFrame. 
+ + Parameters + ---------- + df: pd.DataFrame + Pandas.DataFrame that we will anonymize + private_cols_to_map: Tuple[str] + List of the columns that will be stored in the private_db + that will be returned, along with the new "ID_OWNER" + + Returns + ------- + pd.DataFrame + Pandas.DataFrame with the values of the ``private_cols_to_map`` and + their hashed value in the column "ID_OWNER" """ # Create the private_db with the columns with private infos only private_df = df[private_cols_to_map] - # Get unique combinations of the columns you chose - private_df = ( - private_df.groupby(private_cols_to_map, as_index=False, group_keys=False) - .size() - .reset_index() - ) - - # Eliminate size column - private_df = private_df.drop(columns=[0]) + # In case there are multiple rows with the same private infos + # (e.g.: multiple data from the same customer), only one of these rows + # should be included in ``private_df`` + private_df.drop_duplicates(inplace=True) # Add the ID_OWNER column with the hash value of the row private_df = add_id_owner_col(private_df, private_cols_to_map) @@ -84,40 +107,77 @@ def create_private_info_db(df, private_cols_to_map): def anonymize_data( - df, file_name, private_cols_to_remove, private_cols_to_map, dest_path -): + df: pd.DataFrame, + file_name: str, + private_cols_to_remove: Tuple[str], + private_cols_to_map: Tuple[str], + dest_path: Union[Path, str], + random_seed: int = 42, +) -> Tuple[pd.DataFrame]: """ - This function will take the Pandas DataFrame "df" and it will return two files written inside the "dest_path": - 1. One file (called "[file_name]_anonym") will contain the database "df" where - we replaced the columns "private_cols_to_remove" with the column "ID_OWNER" - 2. Another file (called "[file_name]_private_info") will contain only the - owner infos "private_cols_to_map", which we associated an ID_OWNER to. - The ID_OWNER will be hashed using SHA256. + Separate generic from private data leaving a unique ID as map between them. - :param df: Pandas.DataFrame that we will anonymize - :param file_name: Name of the database we are working on (no ".csv" suffix). Used as prefix when saving csv output files. - :param private_cols_to_remove: Columns that will be removed from "_anonym" file - :param private_cols_to_map: Columns of the "_private_info" files - :param dest_path: The directory where we will save the two files - - :return: [file_name]_anonym : pd.DataFrame - [file_name]_private_info : pd.DataFrame + This function will take the Pandas DataFrame ``df`` and it will return two + files written inside the ``dest_path`` directory: + 1. One file (called "[file_name]_anonym") will contain the database ``df`` where + we replaced the columns ``private_cols_to_remove`` with the column "ID_OWNER" + 2. Another file (called "[file_name]_private_info") will contain only the + owner infos ``private_cols_to_map``, which we associated an ID_OWNER to. + To generate hash values for the "ID_OWNER" column values, the algorithm + adds nonces (random prefix and suffix) to the column values and then + it uses "SHA256" algorithm. + + Parameters + ---------- + df: pd.DataFrame + Pandas.DataFrame that we will anonymize + file_name: str + Name of the database we are working on (no ".csv" suffix). Used as + prefix when saving csv output files. 
+ private_cols_to_remove: Tuple[str] + Columns that will be removed from "_anonym" file + private_cols_to_map: Tuple[str] + Columns of the "_private_info" files + dest_path: Union[Path, str] + The directory where we will save the two files + random_seed: int + Integer value used as "seed" for the generation of random prefixes and + suffixes in "nonces". + + Returns + ------- + pd.DataFrame + Pandas DataFrame containing only the private infos ``private_cols_to_map``, + along with another column "ID_OWNER" that allows to map these private + informations to the data in the other DataFrame. This file is + also saved to "[``dest_path``] / [``file_name``]_private_info.csv" file. + pd.DataFrame + Pandas DataFrame containing the same infos as the DataFrame ``df``, but + the columns "private_cols_to_remove" have been replaced by "ID_OWNER" + column. + This file is also saved to "[``dest_path``] / [``file_name``]_anonym.csv" + file. """ + # Fix the random seed for the generation of random prefixes and + # suffixes in "nonces", used for creating "ID_OWNER" column. + random.seed(random_seed) + # Create the "_anonym" DataFrame which will contain the anonymized database + anonym_df = df.copy() # Fill NaN values in the columns we will map, to make DataFrame merge easier df[private_cols_to_map] = df[private_cols_to_map].fillna("----") # Create the "_private_info" db which will contain the map to owner's private data private_df = create_private_info_db(df, private_cols_to_map) - # Create the "_anonym" DataFrame which will contain the anonymized database - anonym_df = pd.DataFrame(df_sani) - - # Merge to insert the new ID_OWNER column + # Merge to insert the new ID_OWNER column corresponding to the + # private column value combinations anonym_df = anonym_df.merge(private_df) # Delete the columns with private owner's data anonym_df = anonym_df.drop(private_cols_to_remove, axis=1) # Write the two DataFrames to CSV files + dest_path = str(dest_path) + file_name = str(file_name) try: private_df.to_csv( os.path.join(dest_path, f"{file_name}_private_info.csv"), From 591b0fcce0976c4dd677f511df1928d027692954 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Wed, 26 Aug 2020 18:50:52 +0200 Subject: [PATCH 14/18] Added test for "anonymize_database.anonymize_data" function. Added DataFrame mock "df_with_private_info" for creating a df with some private columns and other data columns --- src/tests/dataframewithinfo_util.py | 40 +++++++++ .../integration/test_anonymize_database.py | 87 +++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 src/tests/integration/test_anonymize_database.py diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index 95e7381..543025c 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -1,6 +1,7 @@ import itertools import random from datetime import date +from typing import Tuple import pandas as pd @@ -381,6 +382,45 @@ def df_duplicated_columns(duplicated_cols_count: int) -> pd.DataFrame: return pd.DataFrame(df_duplicated) + @staticmethod + def df_with_private_info(private_cols: Tuple[str]): + """ + Create DataFrame with private info columns along with data columns + + The returned DataFrame mock contains (len(private_cols) + 2) columns + and 5 rows. Particularly it contains the columns listed in ``private_cols`` + with string values,and 2 data columns containing + integer values. 
+ Two of these rows have same values in ``private_cols`` columns, but different + values in the other 2 data columns (this could be simulating a DataFrame + with multiple rows related to the same customer/patient). + + Parameters + ---------- + private_cols: Tuple[str] + List of columns that will be created as private columns + + Returns + ------- + pd.DataFrame + DataFrame mock containing (len(private_cols) + 2) columns + and 5 rows. Particularly it contains the columns listed in ``private_cols`` + with generic string values,and 2 data columns containing + integer values. + + """ + df_private_info_dict = {} + sample_size = 5 + for i, col in enumerate(private_cols): + df_private_info_dict[col] = [ + f"col_{i}_value_{k}" for k in range(sample_size - 1) + ] + # Add a duplicated row (it may be associated to the same customer) + df_private_info_dict[col].append(f"col_{i}_value_{sample_size-2}") + df_private_info_dict["data_col_0"] = list(range(sample_size)) + df_private_info_dict["data_col_1"] = list(range(sample_size)) + return pd.DataFrame(df_private_info_dict) + class SeriesMock: @staticmethod diff --git a/src/tests/integration/test_anonymize_database.py b/src/tests/integration/test_anonymize_database.py new file mode 100644 index 0000000..b8e9191 --- /dev/null +++ b/src/tests/integration/test_anonymize_database.py @@ -0,0 +1,87 @@ +import pandas as pd +import pytest + +from tests.dataframewithinfo_util import DataFrameMock + +from ...pd_extras.anonymize_database import anonymize_data + + +@pytest.mark.parametrize( + "private_cols_to_remove, private_cols_to_map, " + + "expected_anonym_df, expected_private_df", + [ + ( + ["private_col_a", "private_col_b"], + ["private_col_a", "private_col_b", "private_col_c"], + pd.DataFrame( + { + "private_col_c": { + 0: "col_2_value_0", + 1: "col_2_value_1", + 2: "col_2_value_2", + 3: "col_2_value_3", + 4: "col_2_value_3", + }, + "data_col_0": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "data_col_1": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "ID_OWNER": { + 0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31", + 1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73", + 2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24", + 3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a", + 4: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a", + }, + } + ), + pd.DataFrame( + { + "private_col_a": { + 0: "col_0_value_0", + 1: "col_0_value_1", + 2: "col_0_value_2", + 3: "col_0_value_3", + }, + "private_col_b": { + 0: "col_1_value_0", + 1: "col_1_value_1", + 2: "col_1_value_2", + 3: "col_1_value_3", + }, + "private_col_c": { + 0: "col_2_value_0", + 1: "col_2_value_1", + 2: "col_2_value_2", + 3: "col_2_value_3", + }, + "ID_OWNER": { + 0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31", + 1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73", + 2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24", + 3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a", + }, + } + ), + ) + ], +) +def test_anonymize_data( + temporary_data_dir, + private_cols_to_remove, + private_cols_to_map, + expected_anonym_df, + expected_private_df, +): + + original_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map) + + anonym_df, private_df = anonymize_data( + df=original_df, + file_name="test_original_db_anonymize", + private_cols_to_remove=private_cols_to_remove, + private_cols_to_map=private_cols_to_map, + dest_path=str(temporary_data_dir), + 
random_seed=42, + ) + + pd.testing.assert_frame_equal(anonym_df, expected_anonym_df) + pd.testing.assert_frame_equal(private_df, expected_private_df) From 1df8074dfe6a521d03240a330834b1331e63d51e Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 27 Aug 2020 19:47:13 +0200 Subject: [PATCH 15/18] Fixed tests after rebase --- src/pd_extras/exceptions.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/pd_extras/exceptions.py b/src/pd_extras/exceptions.py index 15032f0..d852b32 100644 --- a/src/pd_extras/exceptions.py +++ b/src/pd_extras/exceptions.py @@ -30,7 +30,3 @@ class NotShelveFileError(Exception): """ pass - - -class NotShelveFileError(Exception): - pass From f56c41a8d825389f977815980690338e2bd2a26e Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 27 Aug 2020 21:33:16 +0200 Subject: [PATCH 16/18] Fixed according to PR comments (minor typos in docstrings) --- src/pd_extras/anonymize_database.py | 48 ++++++++++++------- src/tests/dataframewithinfo_util.py | 4 +- .../integration/test_anonymize_database.py | 3 +- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index c8fc2b9..35e76b4 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -9,13 +9,27 @@ import pandas as pd -def add_nonce_func(string_array): +def add_nonce_func( + string_array: Union[str, int, float, np.array] +) -> Union[str, int, float, np.array]: """ - This function takes an array of strings passed as "string_array" and - attaches them nonces (random prefix and suffix), using Vectorization. + Add random prefix and suffix to an array of strings ``string_array`` + + This function takes an array of strings passed as ``string_array`` and + attaches nonces (random prefix and suffix) to each string. + It can also be used in a vectorized way + Prefix and suffix will contain 12 random characters each. - :param cols_values: This is a list of numpy arrays, i.e. the columns we add nonce to - :return: np.array of strings with nonces + Parameters + ---------- + string_array: Union[str, int, float, np.array] + This can be a number, a string or a numpy array of values + (e.g. a DataFrame column) + + Returns + ------- + np.array: + Array of strings with nonces """ return ( "".join(random.choice(string.hexdigits) for i in range(12)) @@ -24,7 +38,9 @@ def add_nonce_func(string_array): ) -def add_id_owner_col(private_df, cols_to_hash): +def add_id_owner_col( + private_df: pd.DataFrame, cols_to_hash: Tuple[str] +) -> pd.DataFrame: """ This function uses the columns of the "private_df" database to generate an hash value and it creates an "ID_OWNER" column with those values. @@ -36,14 +52,14 @@ def add_id_owner_col(private_df, cols_to_hash): Parameters ---------- private_df: pd.DataFrame - Pandas.DataFrame with the owner's private data + DataFrame with the owner's private data cols_to_hash: Tuple[str] This is a list of column names with the infos we want to hash Returns ------- pd.DataFrame - Pandas.DataFrame similar to ``private_df`` with a new "ID_OWNER" column + DataFrame similar to ``private_df`` with a new "ID_OWNER" column """ # Turn rows into strings to be used rows_into_strings = np.sum( @@ -74,14 +90,14 @@ def create_private_info_db( contained in the columns ``private_cols_to_map`` needed to identify them. The function will also add a unique owner ID (in the column "OWNER_ID") that is hashed based on ``private_cols_to_map``. 
- In case there are multiple rows with the same private infos + In case there are multiple rows with the same private info (e.g.: multiple data from the same customer), only one of those rows is included in the returned DataFrame. Parameters ---------- df: pd.DataFrame - Pandas.DataFrame that we will anonymize + DataFrame that we will anonymize private_cols_to_map: Tuple[str] List of the columns that will be stored in the private_db that will be returned, along with the new "ID_OWNER" @@ -89,13 +105,13 @@ def create_private_info_db( Returns ------- pd.DataFrame - Pandas.DataFrame with the values of the ``private_cols_to_map`` and + DataFrame with the values of the ``private_cols_to_map`` and their hashed value in the column "ID_OWNER" """ - # Create the private_db with the columns with private infos only + # Create the private_db with the columns with private info only private_df = df[private_cols_to_map] - # In case there are multiple rows with the same private infos + # In case there are multiple rows with the same private info # (e.g.: multiple data from the same customer), only one of these rows # should be included in ``private_df`` private_df.drop_duplicates(inplace=True) @@ -113,7 +129,7 @@ def anonymize_data( private_cols_to_map: Tuple[str], dest_path: Union[Path, str], random_seed: int = 42, -) -> Tuple[pd.DataFrame]: +) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Separate generic from private data leaving a unique ID as map between them. @@ -130,7 +146,7 @@ def anonymize_data( Parameters ---------- df: pd.DataFrame - Pandas.DataFrame that we will anonymize + DataFrame that we will anonymize file_name: str Name of the database we are working on (no ".csv" suffix). Used as prefix when saving csv output files. @@ -147,7 +163,7 @@ def anonymize_data( Returns ------- pd.DataFrame - Pandas DataFrame containing only the private infos ``private_cols_to_map``, + Pandas DataFrame containing only the private info ``private_cols_to_map``, along with another column "ID_OWNER" that allows to map these private informations to the data in the other DataFrame. This file is also saved to "[``dest_path``] / [``file_name``]_private_info.csv" file. diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index 543025c..428f1a1 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -389,7 +389,7 @@ def df_with_private_info(private_cols: Tuple[str]): The returned DataFrame mock contains (len(private_cols) + 2) columns and 5 rows. Particularly it contains the columns listed in ``private_cols`` - with string values,and 2 data columns containing + with string values, and 2 data columns containing integer values. Two of these rows have same values in ``private_cols`` columns, but different values in the other 2 data columns (this could be simulating a DataFrame @@ -405,7 +405,7 @@ def df_with_private_info(private_cols: Tuple[str]): pd.DataFrame DataFrame mock containing (len(private_cols) + 2) columns and 5 rows. Particularly it contains the columns listed in ``private_cols`` - with generic string values,and 2 data columns containing + with generic string values, and 2 data columns containing integer values. 
""" diff --git a/src/tests/integration/test_anonymize_database.py b/src/tests/integration/test_anonymize_database.py index b8e9191..e257f6d 100644 --- a/src/tests/integration/test_anonymize_database.py +++ b/src/tests/integration/test_anonymize_database.py @@ -8,7 +8,7 @@ @pytest.mark.parametrize( "private_cols_to_remove, private_cols_to_map, " - + "expected_anonym_df, expected_private_df", + "expected_anonym_df, expected_private_df", [ ( ["private_col_a", "private_col_b"], @@ -71,7 +71,6 @@ def test_anonymize_data( expected_anonym_df, expected_private_df, ): - original_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map) anonym_df, private_df = anonymize_data( From 37aa71c6060f889b3b9b24f77582f629926e82de Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 27 Aug 2020 21:34:27 +0200 Subject: [PATCH 17/18] Formatted according to flake8 --- src/pd_extras/anonymize_database.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index 35e76b4..7012a94 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -14,7 +14,7 @@ def add_nonce_func( ) -> Union[str, int, float, np.array]: """ Add random prefix and suffix to an array of strings ``string_array`` - + This function takes an array of strings passed as ``string_array`` and attaches nonces (random prefix and suffix) to each string. It can also be used in a vectorized way @@ -23,9 +23,9 @@ def add_nonce_func( Parameters ---------- string_array: Union[str, int, float, np.array] - This can be a number, a string or a numpy array of values + This can be a number, a string or a numpy array of values (e.g. a DataFrame column) - + Returns ------- np.array: From 1fd2ef827fc40ac5f7e37e03f16f855246c172c7 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 28 Aug 2020 11:49:56 +0200 Subject: [PATCH 18/18] Fixed minor typos docstrings --- src/pd_extras/anonymize_database.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index 7012a94..814addc 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -28,7 +28,7 @@ def add_nonce_func( Returns ------- - np.array: + np.array Array of strings with nonces """ return ( @@ -163,12 +163,12 @@ def anonymize_data( Returns ------- pd.DataFrame - Pandas DataFrame containing only the private info ``private_cols_to_map``, + DataFrame containing only the private info ``private_cols_to_map``, along with another column "ID_OWNER" that allows to map these private informations to the data in the other DataFrame. This file is also saved to "[``dest_path``] / [``file_name``]_private_info.csv" file. pd.DataFrame - Pandas DataFrame containing the same infos as the DataFrame ``df``, but + DataFrame containing the same infos as the DataFrame ``df``, but the columns "private_cols_to_remove" have been replaced by "ID_OWNER" column. This file is also saved to "[``dest_path``] / [``file_name``]_anonym.csv"
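
For context on PATCH 13/18 through PATCH 18/18: the "ID_OWNER" values come
from hashing the private columns with nonces. A minimal sketch of that
scheme (simplified from ``add_nonce_func``/``add_id_owner_col``; the real
code seeds ``random`` once inside ``anonymize_data`` and vectorizes the
hashing over DataFrame rows):

    import hashlib
    import random
    import string

    def _nonce() -> str:
        # 12 random hex characters, as in add_nonce_func
        return "".join(random.choice(string.hexdigits) for _ in range(12))

    def hash_owner(row_as_string: str) -> str:
        # random prefix + row values + random suffix, then SHA-256
        salted = _nonce() + row_as_string + _nonce()
        return hashlib.sha256(salted.encode()).hexdigest()

    random.seed(42)  # the random_seed argument added in PATCH 13/18
    print(hash_owner("col_0_value_0col_1_value_0col_2_value_0"))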