From 2907b5b1a4ffd32456f6b601a5af63d4c261771c Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 18 Aug 2020 17:52:50 +0200 Subject: [PATCH 01/18] Added df_least_nan, df_duplicated_columns methods to create dataframe mocks Added test_med_exam_col_list, test_least_nan_cols, test_contains_duplicated_features, test_show_columns_type for the related dataframewithinfo methods --- src/tests/integration/test_dataframe_with_info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index e920144..04b0584 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -18,6 +18,7 @@ ) from ...pd_extras.exceptions import MultipleOperationsFoundError, NotShelveFileError from ...pd_extras.feature_enum import EncodingFunctions, OperationTypeEnum + from ..dataframewithinfo_util import DataFrameMock, SeriesMock from ..featureoperation_util import eq_featureoperation_combs From a300809b5e29ac6a849c06f8de8d812284cff465 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 13:00:43 +0200 Subject: [PATCH 02/18] Changed from sklearn.preprocessing.OneHotEncoder to EncodingFunctions.ONEHOT (to exploit the Enum that gathers the supported functions for encoding. Added tests for "get_enc_column_from_original" and "get_original_from_enc_column" methods Added a FeatureOperation to df_info_with_operations fixture --- .../integration/test_dataframe_with_info.py | 97 ++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index 04b0584..a9581af 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -18,7 +18,6 @@ ) from ...pd_extras.exceptions import MultipleOperationsFoundError, NotShelveFileError from ...pd_extras.feature_enum import EncodingFunctions, OperationTypeEnum - from ..dataframewithinfo_util import DataFrameMock, SeriesMock from ..featureoperation_util import eq_featureoperation_combs @@ -698,6 +697,102 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) + @pytest.mark.parametrize( + "original_column, encoder, expected_encoded_columns", + [ + ( # Case 1: Everything specified and found + "fop_original_col_0", + EncodingFunctions.ONEHOT, + ("fop_derived_col_0",), + ), + ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns + ("fop_original_col_10", None, None), # Case 3: No operation associated + # Case 4: No encoder specified + ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), + ], + ) + def test_get_enc_column_from_original( + self, + request, + df_info_with_operations, + original_column, + encoder, + expected_encoded_columns, + ): + encoded_columns = df_info_with_operations.get_enc_column_from_original( + column_name=original_column, encoder=encoder + ) + + if expected_encoded_columns is None: + assert encoded_columns is None + else: + assert isinstance(encoded_columns, tuple) + assert len(encoded_columns) == len(expected_encoded_columns) + assert set(encoded_columns) == set(expected_encoded_columns) + + def test_get_enc_column_from_original_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + _ = df_info_with_operations.get_enc_column_from_original( + column_name="fop_original_col_0" + ) + + 
assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. Please provide additional information" + in str(err.value) + ) + + @pytest.mark.parametrize( + "encoded_column, encoder, expected_original_columns", + [ + ( # Case 1: Everything specified and found + "fop_derived_col_0", + EncodingFunctions.ONEHOT, + ("fop_original_col_0",), + ), + # Case 2: No encoder specified + ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), + ("fop_derived_col_10", None, None), # Case 3: No operation associated + # Case 4: Column_name in original_columns + ("fop_original_col_2", None, None), + ], + ) + def test_get_original_from_enc_column( + self, + request, + df_info_with_operations, + encoded_column, + encoder, + expected_original_columns, + ): + original_columns = df_info_with_operations.get_original_from_enc_column( + column_name=encoded_column, encoder=encoder + ) + + if expected_original_columns is None: + assert original_columns is None + else: + assert isinstance(original_columns, tuple) + assert len(original_columns) == len(expected_original_columns) + assert set(original_columns) == set(expected_original_columns) + + def test_get_original_from_enc_column_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + + _ = df_info_with_operations.get_original_from_enc_column( + column_name="fop_derived_col_0" + ) + + assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. Please provide additional information" + in str(err.value) + ) + class Describe_FeatureOperation: @pytest.mark.parametrize( From 0d1d19112f8caf41261908c52ba5aed0e466c22f Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 21:54:17 +0200 Subject: [PATCH 03/18] Fixed issue 12 by changing from EncodingFunctions.ONEHOT/ORDINAL class to EncodingFunctions.ONEHOT.value() , which is actually an instance since it should be the actual encoder instance used to encode the feature --- src/pd_extras/dataframe_with_info.py | 6 +++++- src/tests/integration/test_dataframe_with_info.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/pd_extras/dataframe_with_info.py b/src/pd_extras/dataframe_with_info.py index fc7670c..f46c666 100644 --- a/src/pd_extras/dataframe_with_info.py +++ b/src/pd_extras/dataframe_with_info.py @@ -12,7 +12,11 @@ import sklearn from joblib import Parallel, delayed -from .exceptions import MultipleObjectsInFileError, MultipleOperationsFoundError, NotShelveFileError +from .exceptions import ( + MultipleObjectsInFileError, + MultipleOperationsFoundError, + NotShelveFileError, +) from .feature_enum import EncodingFunctions, OperationTypeEnum from .settings import CATEG_COL_THRESHOLD diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index a9581af..7b5e9ba 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -702,7 +702,7 @@ def test_get_original_from_enc_column_raise_multicolfound_error( [ ( # Case 1: Everything specified and found "fop_original_col_0", - EncodingFunctions.ONEHOT, + EncodingFunctions.ONEHOT.value(), ("fop_derived_col_0",), ), ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns @@ -749,7 +749,7 @@ def test_get_enc_column_from_original_raise_error( [ ( # Case 1: Everything specified and found "fop_derived_col_0", - EncodingFunctions.ONEHOT, + 
EncodingFunctions.ONEHOT.value(), ("fop_original_col_0",), ), # Case 2: No encoder specified From a0bf2f02681e62d1a858004e2d81e9b40456b259 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 11:22:36 +0200 Subject: [PATCH 04/18] Fixed repeated code after rebase --- .../integration/test_dataframe_with_info.py | 96 ------------------- 1 file changed, 96 deletions(-) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index 7b5e9ba..e920144 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -697,102 +697,6 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) - @pytest.mark.parametrize( - "original_column, encoder, expected_encoded_columns", - [ - ( # Case 1: Everything specified and found - "fop_original_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_derived_col_0",), - ), - ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns - ("fop_original_col_10", None, None), # Case 3: No operation associated - # Case 4: No encoder specified - ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), - ], - ) - def test_get_enc_column_from_original( - self, - request, - df_info_with_operations, - original_column, - encoder, - expected_encoded_columns, - ): - encoded_columns = df_info_with_operations.get_enc_column_from_original( - column_name=original_column, encoder=encoder - ) - - if expected_encoded_columns is None: - assert encoded_columns is None - else: - assert isinstance(encoded_columns, tuple) - assert len(encoded_columns) == len(expected_encoded_columns) - assert set(encoded_columns) == set(expected_encoded_columns) - - def test_get_enc_column_from_original_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - _ = df_info_with_operations.get_enc_column_from_original( - column_name="fop_original_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. 
Please provide additional information" - in str(err.value) - ) - - @pytest.mark.parametrize( - "encoded_column, encoder, expected_original_columns", - [ - ( # Case 1: Everything specified and found - "fop_derived_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_original_col_0",), - ), - # Case 2: No encoder specified - ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), - ("fop_derived_col_10", None, None), # Case 3: No operation associated - # Case 4: Column_name in original_columns - ("fop_original_col_2", None, None), - ], - ) - def test_get_original_from_enc_column( - self, - request, - df_info_with_operations, - encoded_column, - encoder, - expected_original_columns, - ): - original_columns = df_info_with_operations.get_original_from_enc_column( - column_name=encoded_column, encoder=encoder - ) - - if expected_original_columns is None: - assert original_columns is None - else: - assert isinstance(original_columns, tuple) - assert len(original_columns) == len(expected_original_columns) - assert set(original_columns) == set(expected_original_columns) - - def test_get_original_from_enc_column_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - - _ = df_info_with_operations.get_original_from_enc_column( - column_name="fop_derived_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. Please provide additional information" - in str(err.value) - ) - class Describe_FeatureOperation: @pytest.mark.parametrize( From ce6be4da2364447d3e48b8be745e2d419a4f573c Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 20 Aug 2020 18:04:28 +0200 Subject: [PATCH 05/18] Added df_generic Mock, that is used for mocking a generic Pandas DataFrame Moved some tests inside Describe_DataFraneWithInfo class Added tests for add_operation and find_operation_in_column methods Added fixture for creating a DataFrameWithInfo instance with previous FeatureOperation instances (already added to be found) --- src/tests/dataframewithinfo_util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index 95e7381..cc35b17 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -3,6 +3,11 @@ from datetime import date import pandas as pd +import pytest +import sklearn + +from ..pd_extras.dataframe_with_info import DataFrameWithInfo, FeatureOperation +from ..pd_extras.feature_enum import OperationTypeEnum class DataFrameMock: From 4f0d72f519bca3cdcff2cb691d70ca8b86dfdf1b Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 13:00:43 +0200 Subject: [PATCH 06/18] Changed from sklearn.preprocessing.OneHotEncoder to EncodingFunctions.ONEHOT (to exploit the Enum that gathers the supported functions for encoding. 
Added tests for "get_enc_column_from_original" and "get_original_from_enc_column" methods Added a FeatureOperation to df_info_with_operations fixture --- .../integration/test_dataframe_with_info.py | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index e920144..a9581af 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -697,6 +697,102 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) + @pytest.mark.parametrize( + "original_column, encoder, expected_encoded_columns", + [ + ( # Case 1: Everything specified and found + "fop_original_col_0", + EncodingFunctions.ONEHOT, + ("fop_derived_col_0",), + ), + ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns + ("fop_original_col_10", None, None), # Case 3: No operation associated + # Case 4: No encoder specified + ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), + ], + ) + def test_get_enc_column_from_original( + self, + request, + df_info_with_operations, + original_column, + encoder, + expected_encoded_columns, + ): + encoded_columns = df_info_with_operations.get_enc_column_from_original( + column_name=original_column, encoder=encoder + ) + + if expected_encoded_columns is None: + assert encoded_columns is None + else: + assert isinstance(encoded_columns, tuple) + assert len(encoded_columns) == len(expected_encoded_columns) + assert set(encoded_columns) == set(expected_encoded_columns) + + def test_get_enc_column_from_original_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + _ = df_info_with_operations.get_enc_column_from_original( + column_name="fop_original_col_0" + ) + + assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. Please provide additional information" + in str(err.value) + ) + + @pytest.mark.parametrize( + "encoded_column, encoder, expected_original_columns", + [ + ( # Case 1: Everything specified and found + "fop_derived_col_0", + EncodingFunctions.ONEHOT, + ("fop_original_col_0",), + ), + # Case 2: No encoder specified + ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), + ("fop_derived_col_10", None, None), # Case 3: No operation associated + # Case 4: Column_name in original_columns + ("fop_original_col_2", None, None), + ], + ) + def test_get_original_from_enc_column( + self, + request, + df_info_with_operations, + encoded_column, + encoder, + expected_original_columns, + ): + original_columns = df_info_with_operations.get_original_from_enc_column( + column_name=encoded_column, encoder=encoder + ) + + if expected_original_columns is None: + assert original_columns is None + else: + assert isinstance(original_columns, tuple) + assert len(original_columns) == len(expected_original_columns) + assert set(original_columns) == set(expected_original_columns) + + def test_get_original_from_enc_column_raise_error( + self, request, df_info_with_operations + ): + with pytest.raises(MultipleOperationsFoundError) as err: + + _ = df_info_with_operations.get_original_from_enc_column( + column_name="fop_derived_col_0" + ) + + assert isinstance(err.value, MultipleOperationsFoundError) + assert ( + "Multiple operations were found. 
Please provide additional information" + in str(err.value) + ) + class Describe_FeatureOperation: @pytest.mark.parametrize( From dec0ea17a14c63366349622f605ca09c2e56764d Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 21 Aug 2020 21:54:17 +0200 Subject: [PATCH 07/18] Fixed issue 12 by changing from EncodingFunctions.ONEHOT/ORDINAL class to EncodingFunctions.ONEHOT.value() , which is actually an instance since it should be the actual encoder instance used to encode the feature --- src/tests/integration/test_dataframe_with_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index a9581af..7b5e9ba 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -702,7 +702,7 @@ def test_get_original_from_enc_column_raise_multicolfound_error( [ ( # Case 1: Everything specified and found "fop_original_col_0", - EncodingFunctions.ONEHOT, + EncodingFunctions.ONEHOT.value(), ("fop_derived_col_0",), ), ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns @@ -749,7 +749,7 @@ def test_get_enc_column_from_original_raise_error( [ ( # Case 1: Everything specified and found "fop_derived_col_0", - EncodingFunctions.ONEHOT, + EncodingFunctions.ONEHOT.value(), ("fop_original_col_0",), ), # Case 2: No encoder specified From 8f15a1e00a4cafd40427f9d6f64b80420803eaf1 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 16:28:36 +0200 Subject: [PATCH 08/18] In import_df_with_info_from_file function: Added a new error exception "NotShelveFileError" to handle the case where the filename is not a file created by using shelve module. Fixed isinstance typo --- src/pd_extras/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pd_extras/exceptions.py b/src/pd_extras/exceptions.py index d852b32..15032f0 100644 --- a/src/pd_extras/exceptions.py +++ b/src/pd_extras/exceptions.py @@ -30,3 +30,7 @@ class NotShelveFileError(Exception): """ pass + + +class NotShelveFileError(Exception): + pass From 6c339260ec80a110cf17061e9a85e39ba23fb346 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 16:32:14 +0200 Subject: [PATCH 09/18] Completed tests for import/export_df_with_info functions for DataFrameWithInfo instances. Added two fixtures for creating generic files with 'shelve' module and the builtin functions. Fixed tests after rebase. 
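
For reference, the shelve-based fixtures follow roughly this shape (a
sketch with assumed names, not the exact fixture code; only the stdlib
``shelve`` API is taken as given):

    import shelve

    import pytest

    @pytest.fixture
    def generic_shelve_file(tmp_path):
        # ``shelve`` pickles objects under string keys; writing a single
        # key produces the kind of file the import/export tests read back.
        file_path = tmp_path / "generic_shelve_file"
        with shelve.open(str(file_path)) as db:
            db["df_info"] = {"a": 1}
        return file_path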
--- src/tests/dataframewithinfo_util.py | 5 - .../integration/test_dataframe_with_info.py | 96 ------------------- 2 files changed, 101 deletions(-) diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index cc35b17..95e7381 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -3,11 +3,6 @@ from datetime import date import pandas as pd -import pytest -import sklearn - -from ..pd_extras.dataframe_with_info import DataFrameWithInfo, FeatureOperation -from ..pd_extras.feature_enum import OperationTypeEnum class DataFrameMock: diff --git a/src/tests/integration/test_dataframe_with_info.py b/src/tests/integration/test_dataframe_with_info.py index 7b5e9ba..e920144 100644 --- a/src/tests/integration/test_dataframe_with_info.py +++ b/src/tests/integration/test_dataframe_with_info.py @@ -697,102 +697,6 @@ def test_get_original_from_enc_column_raise_multicolfound_error( in str(err.value) ) - @pytest.mark.parametrize( - "original_column, encoder, expected_encoded_columns", - [ - ( # Case 1: Everything specified and found - "fop_original_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_derived_col_0",), - ), - ("fop_derived_col_1", None, None), # Case 2: column_name in derived_columns - ("fop_original_col_10", None, None), # Case 3: No operation associated - # Case 4: No encoder specified - ("fop_original_col_2", None, ("fop_derived_col_2", "fop_derived_col_3")), - ], - ) - def test_get_enc_column_from_original( - self, - request, - df_info_with_operations, - original_column, - encoder, - expected_encoded_columns, - ): - encoded_columns = df_info_with_operations.get_enc_column_from_original( - column_name=original_column, encoder=encoder - ) - - if expected_encoded_columns is None: - assert encoded_columns is None - else: - assert isinstance(encoded_columns, tuple) - assert len(encoded_columns) == len(expected_encoded_columns) - assert set(encoded_columns) == set(expected_encoded_columns) - - def test_get_enc_column_from_original_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - _ = df_info_with_operations.get_enc_column_from_original( - column_name="fop_original_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. 
Please provide additional information" - in str(err.value) - ) - - @pytest.mark.parametrize( - "encoded_column, encoder, expected_original_columns", - [ - ( # Case 1: Everything specified and found - "fop_derived_col_0", - EncodingFunctions.ONEHOT.value(), - ("fop_original_col_0",), - ), - # Case 2: No encoder specified - ("fop_derived_col_1", None, ("fop_original_col_0", "fop_original_col_1")), - ("fop_derived_col_10", None, None), # Case 3: No operation associated - # Case 4: Column_name in original_columns - ("fop_original_col_2", None, None), - ], - ) - def test_get_original_from_enc_column( - self, - request, - df_info_with_operations, - encoded_column, - encoder, - expected_original_columns, - ): - original_columns = df_info_with_operations.get_original_from_enc_column( - column_name=encoded_column, encoder=encoder - ) - - if expected_original_columns is None: - assert original_columns is None - else: - assert isinstance(original_columns, tuple) - assert len(original_columns) == len(expected_original_columns) - assert set(original_columns) == set(expected_original_columns) - - def test_get_original_from_enc_column_raise_error( - self, request, df_info_with_operations - ): - with pytest.raises(MultipleOperationsFoundError) as err: - - _ = df_info_with_operations.get_original_from_enc_column( - column_name="fop_derived_col_0" - ) - - assert isinstance(err.value, MultipleOperationsFoundError) - assert ( - "Multiple operations were found. Please provide additional information" - in str(err.value) - ) - class Describe_FeatureOperation: @pytest.mark.parametrize( From 80f0af081656fd4de9356f24103f3636bb57764d Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 17:52:04 +0200 Subject: [PATCH 10/18] Fixes issue #19 because now "show_columns_type" considers every value in the column instead of the first one only. 
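
The idea of the fix, as a minimal sketch (assumed helper name, not the
actual ``show_columns_type`` implementation):

    # Classify a column from the set of types of *all* its values,
    # instead of trusting only the first value.
    def column_value_types(df, col):
        return {type(value) for value in df[col]}

    # e.g. a column [1, "a", 2.0] yields {int, str, float}, so mixed-type
    # columns are no longer labelled from their first element alone.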
--- src/pd_extras/dataframe_with_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pd_extras/dataframe_with_info.py b/src/pd_extras/dataframe_with_info.py index f46c666..c67700e 100644 --- a/src/pd_extras/dataframe_with_info.py +++ b/src/pd_extras/dataframe_with_info.py @@ -784,7 +784,7 @@ def show_columns_type(self, col_list: Tuple[str] = None) -> None: - bool -> "bool_col" - str -> "string_col" - other types -> "other_col" - + Parameters ---------- col_list: Tuple[str], optional From 96cbaeb642832143c1e2bcbb692e7b89a5b8821a Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Tue, 25 Aug 2020 17:53:14 +0200 Subject: [PATCH 11/18] Refactored according to flake8 --- src/pd_extras/dataframe_with_info.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/pd_extras/dataframe_with_info.py b/src/pd_extras/dataframe_with_info.py index c67700e..fc7670c 100644 --- a/src/pd_extras/dataframe_with_info.py +++ b/src/pd_extras/dataframe_with_info.py @@ -12,11 +12,7 @@ import sklearn from joblib import Parallel, delayed -from .exceptions import ( - MultipleObjectsInFileError, - MultipleOperationsFoundError, - NotShelveFileError, -) +from .exceptions import MultipleObjectsInFileError, MultipleOperationsFoundError, NotShelveFileError from .feature_enum import EncodingFunctions, OperationTypeEnum from .settings import CATEG_COL_THRESHOLD @@ -784,7 +780,7 @@ def show_columns_type(self, col_list: Tuple[str] = None) -> None: - bool -> "bool_col" - str -> "string_col" - other types -> "other_col" - + Parameters ---------- col_list: Tuple[str], optional From 93cff6d4186d06c5f3e80d02a4d3e1fec8b1b7a6 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Wed, 26 Aug 2020 18:36:29 +0200 Subject: [PATCH 12/18] Moved temporary_data_dir fixture to conftest.py since it is a generic fixture that may be useful to multiple test scripts --- src/tests/conftest.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/tests/conftest.py diff --git a/src/tests/conftest.py b/src/tests/conftest.py new file mode 100644 index 0000000..94d86fa --- /dev/null +++ b/src/tests/conftest.py @@ -0,0 +1,38 @@ +import os +import shutil +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="module") +def temporary_data_dir(request) -> Path: + """ + Create a temporary directory for test data and delete it after test end. + + The temporary directory is created in the working directory and it is + named "temp_test_data_folder". + The fixture uses a finalizer that deletes the temporary directory where + every test data was saved. Therefore every time the user calls tests that + use this fixture (and save data inside the returned directory), at the end + of the test the finalizer deletes this directory. + + Parameters + ---------- + + Returns + ------- + Path + Path where every temporary file used by tests is saved. + """ + temp_data_dir = Path(os.getcwd()) / "temp_test_data_folder" + try: + os.mkdir(temp_data_dir) + except FileExistsError: + pass + + def remove_temp_dir_created(): + shutil.rmtree(temp_data_dir) + + request.addfinalizer(remove_temp_dir_created) + return temp_data_dir From 82a5997523c630c28f22d7acfee40395a519f1b7 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Wed, 26 Aug 2020 18:49:17 +0200 Subject: [PATCH 13/18] Fixes issue #21 by adding a "random_seed" argument to "anonymize_data" function. Added and reformatted all the comments. Modified code to drop duplicates in DataFrame (function "create_private_info_db"). 
Fixes undefined "df_sani" variable with the correct variable. --- src/pd_extras/anonymize_database.py | 150 +++++++++++++++++++--------- 1 file changed, 105 insertions(+), 45 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index 75eb3ef..c8fc2b9 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -2,6 +2,8 @@ import os import random import string +from pathlib import Path +from typing import Tuple, Union import numpy as np import pandas as pd @@ -26,13 +28,22 @@ def add_id_owner_col(private_df, cols_to_hash): """ This function uses the columns of the "private_df" database to generate an hash value and it creates an "ID_OWNER" column with those values. - To generate hash values, we add nonces (random prefix and suffix) to the column values and then we use "sha256". - See https://medium.com/luckspark/hashing-pandas-dataframe-column-with-nonce-763a8c23a833 for more info. - - :param private_df: Pandas.DataFrame with the owner's private data - :param cols_to_hash: This is a list of column names with the infos we want to hash - - :return: Pandas.DataFrame similar to "private_df" with a new "ID_OWNER" column + To generate hash values, the function adds nonces (random prefix and suffix) + to the column values and then we use "sha256". + See https://medium.com/luckspark/hashing-pandas-dataframe-column-with-nonce-763a8c23a833 + for more info. + + Parameters + ---------- + private_df: pd.DataFrame + Pandas.DataFrame with the owner's private data + cols_to_hash: Tuple[str] + This is a list of column names with the infos we want to hash + + Returns + ------- + pd.DataFrame + Pandas.DataFrame similar to ``private_df`` with a new "ID_OWNER" column """ # Turn rows into strings to be used rows_into_strings = np.sum( @@ -53,29 +64,41 @@ def hash_lambda(owner_name): return private_df -def create_private_info_db(df, private_cols_to_map): +def create_private_info_db( + df: pd.DataFrame, private_cols_to_map: Tuple[str] +) -> pd.DataFrame: """ - This function creates a Pandas.DataFrame where you will store all the owner's - private data needed to identify them. - These informations are listed in "private_cols_to_map" argument. - - :param df: Pandas.DataFrame that we will anonymize - :param private_cols_to_map: This is a list of the columns that will be stored in the - private_db that will be returned, along with the new "ID_OWNER" - :return: Pandas.DataFrame with the values of the "private_cols_to_map" and their hashed value in the column "ID_OWNER" + Create a DataFrame with private data and a unique ID. + + This function will store in a DataFrame all the owner's private data + contained in the columns ``private_cols_to_map`` needed to identify them. + The function will also add a unique owner ID (in the column "OWNER_ID") that + is hashed based on ``private_cols_to_map``. + In case there are multiple rows with the same private infos + (e.g.: multiple data from the same customer), only one of those rows + is included in the returned DataFrame. 
+ + Parameters + ---------- + df: pd.DataFrame + Pandas.DataFrame that we will anonymize + private_cols_to_map: Tuple[str] + List of the columns that will be stored in the private_db + that will be returned, along with the new "ID_OWNER" + + Returns + ------- + pd.DataFrame + Pandas.DataFrame with the values of the ``private_cols_to_map`` and + their hashed value in the column "ID_OWNER" """ # Create the private_db with the columns with private infos only private_df = df[private_cols_to_map] - # Get unique combinations of the columns you chose - private_df = ( - private_df.groupby(private_cols_to_map, as_index=False, group_keys=False) - .size() - .reset_index() - ) - - # Eliminate size column - private_df = private_df.drop(columns=[0]) + # In case there are multiple rows with the same private infos + # (e.g.: multiple data from the same customer), only one of these rows + # should be included in ``private_df`` + private_df.drop_duplicates(inplace=True) # Add the ID_OWNER column with the hash value of the row private_df = add_id_owner_col(private_df, private_cols_to_map) @@ -84,40 +107,77 @@ def create_private_info_db(df, private_cols_to_map): def anonymize_data( - df, file_name, private_cols_to_remove, private_cols_to_map, dest_path -): + df: pd.DataFrame, + file_name: str, + private_cols_to_remove: Tuple[str], + private_cols_to_map: Tuple[str], + dest_path: Union[Path, str], + random_seed: int = 42, +) -> Tuple[pd.DataFrame]: """ - This function will take the Pandas DataFrame "df" and it will return two files written inside the "dest_path": - 1. One file (called "[file_name]_anonym") will contain the database "df" where - we replaced the columns "private_cols_to_remove" with the column "ID_OWNER" - 2. Another file (called "[file_name]_private_info") will contain only the - owner infos "private_cols_to_map", which we associated an ID_OWNER to. - The ID_OWNER will be hashed using SHA256. + Separate generic from private data leaving a unique ID as map between them. - :param df: Pandas.DataFrame that we will anonymize - :param file_name: Name of the database we are working on (no ".csv" suffix). Used as prefix when saving csv output files. - :param private_cols_to_remove: Columns that will be removed from "_anonym" file - :param private_cols_to_map: Columns of the "_private_info" files - :param dest_path: The directory where we will save the two files - - :return: [file_name]_anonym : pd.DataFrame - [file_name]_private_info : pd.DataFrame + This function will take the Pandas DataFrame ``df`` and it will return two + files written inside the ``dest_path`` directory: + 1. One file (called "[file_name]_anonym") will contain the database ``df`` where + we replaced the columns ``private_cols_to_remove`` with the column "ID_OWNER" + 2. Another file (called "[file_name]_private_info") will contain only the + owner infos ``private_cols_to_map``, which we associated an ID_OWNER to. + To generate hash values for the "ID_OWNER" column values, the algorithm + adds nonces (random prefix and suffix) to the column values and then + it uses "SHA256" algorithm. + + Parameters + ---------- + df: pd.DataFrame + Pandas.DataFrame that we will anonymize + file_name: str + Name of the database we are working on (no ".csv" suffix). Used as + prefix when saving csv output files. 
+ private_cols_to_remove: Tuple[str] + Columns that will be removed from "_anonym" file + private_cols_to_map: Tuple[str] + Columns of the "_private_info" files + dest_path: Union[Path, str] + The directory where we will save the two files + random_seed: int + Integer value used as "seed" for the generation of random prefixes and + suffixes in "nonces". + + Returns + ------- + pd.DataFrame + Pandas DataFrame containing only the private infos ``private_cols_to_map``, + along with another column "ID_OWNER" that allows to map these private + informations to the data in the other DataFrame. This file is + also saved to "[``dest_path``] / [``file_name``]_private_info.csv" file. + pd.DataFrame + Pandas DataFrame containing the same infos as the DataFrame ``df``, but + the columns "private_cols_to_remove" have been replaced by "ID_OWNER" + column. + This file is also saved to "[``dest_path``] / [``file_name``]_anonym.csv" + file. """ + # Fix the random seed for the generation of random prefixes and + # suffixes in "nonces", used for creating "ID_OWNER" column. + random.seed(random_seed) + # Create the "_anonym" DataFrame which will contain the anonymized database + anonym_df = df.copy() # Fill NaN values in the columns we will map, to make DataFrame merge easier df[private_cols_to_map] = df[private_cols_to_map].fillna("----") # Create the "_private_info" db which will contain the map to owner's private data private_df = create_private_info_db(df, private_cols_to_map) - # Create the "_anonym" DataFrame which will contain the anonymized database - anonym_df = pd.DataFrame(df_sani) - - # Merge to insert the new ID_OWNER column + # Merge to insert the new ID_OWNER column corresponding to the + # private column value combinations anonym_df = anonym_df.merge(private_df) # Delete the columns with private owner's data anonym_df = anonym_df.drop(private_cols_to_remove, axis=1) # Write the two DataFrames to CSV files + dest_path = str(dest_path) + file_name = str(file_name) try: private_df.to_csv( os.path.join(dest_path, f"{file_name}_private_info.csv"), From 591b0fcce0976c4dd677f511df1928d027692954 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Wed, 26 Aug 2020 18:50:52 +0200 Subject: [PATCH 14/18] Added test for "anonymize_database.anonymize_data" function. Added DataFrame mock "df_with_private_info" for creating a df with some private columns and other data columns --- src/tests/dataframewithinfo_util.py | 40 +++++++++ .../integration/test_anonymize_database.py | 87 +++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 src/tests/integration/test_anonymize_database.py diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index 95e7381..543025c 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -1,6 +1,7 @@ import itertools import random from datetime import date +from typing import Tuple import pandas as pd @@ -381,6 +382,45 @@ def df_duplicated_columns(duplicated_cols_count: int) -> pd.DataFrame: return pd.DataFrame(df_duplicated) + @staticmethod + def df_with_private_info(private_cols: Tuple[str]): + """ + Create DataFrame with private info columns along with data columns + + The returned DataFrame mock contains (len(private_cols) + 2) columns + and 5 rows. Particularly it contains the columns listed in ``private_cols`` + with string values,and 2 data columns containing + integer values. 
+ Two of these rows have same values in ``private_cols`` columns, but different + values in the other 2 data columns (this could be simulating a DataFrame + with multiple rows related to the same customer/patient). + + Parameters + ---------- + private_cols: Tuple[str] + List of columns that will be created as private columns + + Returns + ------- + pd.DataFrame + DataFrame mock containing (len(private_cols) + 2) columns + and 5 rows. Particularly it contains the columns listed in ``private_cols`` + with generic string values,and 2 data columns containing + integer values. + + """ + df_private_info_dict = {} + sample_size = 5 + for i, col in enumerate(private_cols): + df_private_info_dict[col] = [ + f"col_{i}_value_{k}" for k in range(sample_size - 1) + ] + # Add a duplicated row (it may be associated to the same customer) + df_private_info_dict[col].append(f"col_{i}_value_{sample_size-2}") + df_private_info_dict["data_col_0"] = list(range(sample_size)) + df_private_info_dict["data_col_1"] = list(range(sample_size)) + return pd.DataFrame(df_private_info_dict) + class SeriesMock: @staticmethod diff --git a/src/tests/integration/test_anonymize_database.py b/src/tests/integration/test_anonymize_database.py new file mode 100644 index 0000000..b8e9191 --- /dev/null +++ b/src/tests/integration/test_anonymize_database.py @@ -0,0 +1,87 @@ +import pandas as pd +import pytest + +from tests.dataframewithinfo_util import DataFrameMock + +from ...pd_extras.anonymize_database import anonymize_data + + +@pytest.mark.parametrize( + "private_cols_to_remove, private_cols_to_map, " + + "expected_anonym_df, expected_private_df", + [ + ( + ["private_col_a", "private_col_b"], + ["private_col_a", "private_col_b", "private_col_c"], + pd.DataFrame( + { + "private_col_c": { + 0: "col_2_value_0", + 1: "col_2_value_1", + 2: "col_2_value_2", + 3: "col_2_value_3", + 4: "col_2_value_3", + }, + "data_col_0": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "data_col_1": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}, + "ID_OWNER": { + 0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31", + 1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73", + 2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24", + 3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a", + 4: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a", + }, + } + ), + pd.DataFrame( + { + "private_col_a": { + 0: "col_0_value_0", + 1: "col_0_value_1", + 2: "col_0_value_2", + 3: "col_0_value_3", + }, + "private_col_b": { + 0: "col_1_value_0", + 1: "col_1_value_1", + 2: "col_1_value_2", + 3: "col_1_value_3", + }, + "private_col_c": { + 0: "col_2_value_0", + 1: "col_2_value_1", + 2: "col_2_value_2", + 3: "col_2_value_3", + }, + "ID_OWNER": { + 0: "467ef2006da06554f248d74bf537a2e5a5270321c35963eace344feb32dd7b31", + 1: "42d7ba97aaf0368c3b2e66ac7bb88787480d22ff3e0694a805647cdce1ecac73", + 2: "e605c6ffcbfcb25f252e269b04b77df4a9514effe10d9885b366dfceae82aa24", + 3: "be7c8a1fc7ff3c143455fb8d2774369ff6e756d804cb1e1765aca079b1a0778a", + }, + } + ), + ) + ], +) +def test_anonymize_data( + temporary_data_dir, + private_cols_to_remove, + private_cols_to_map, + expected_anonym_df, + expected_private_df, +): + + original_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map) + + anonym_df, private_df = anonymize_data( + df=original_df, + file_name="test_original_db_anonymize", + private_cols_to_remove=private_cols_to_remove, + private_cols_to_map=private_cols_to_map, + dest_path=str(temporary_data_dir), + 
random_seed=42, + ) + + pd.testing.assert_frame_equal(anonym_df, expected_anonym_df) + pd.testing.assert_frame_equal(private_df, expected_private_df) From 1df8074dfe6a521d03240a330834b1331e63d51e Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 27 Aug 2020 19:47:13 +0200 Subject: [PATCH 15/18] Fixed tests after rebase --- src/pd_extras/exceptions.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/pd_extras/exceptions.py b/src/pd_extras/exceptions.py index 15032f0..d852b32 100644 --- a/src/pd_extras/exceptions.py +++ b/src/pd_extras/exceptions.py @@ -30,7 +30,3 @@ class NotShelveFileError(Exception): """ pass - - -class NotShelveFileError(Exception): - pass From f56c41a8d825389f977815980690338e2bd2a26e Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 27 Aug 2020 21:33:16 +0200 Subject: [PATCH 16/18] Fixed according to PR comments (minor typos in docstrings) --- src/pd_extras/anonymize_database.py | 48 ++++++++++++------- src/tests/dataframewithinfo_util.py | 4 +- .../integration/test_anonymize_database.py | 3 +- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index c8fc2b9..35e76b4 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -9,13 +9,27 @@ import pandas as pd -def add_nonce_func(string_array): +def add_nonce_func( + string_array: Union[str, int, float, np.array] +) -> Union[str, int, float, np.array]: """ - This function takes an array of strings passed as "string_array" and - attaches them nonces (random prefix and suffix), using Vectorization. + Add random prefix and suffix to an array of strings ``string_array`` + + This function takes an array of strings passed as ``string_array`` and + attaches nonces (random prefix and suffix) to each string. + It can also be used in a vectorized way + Prefix and suffix will contain 12 random characters each. - :param cols_values: This is a list of numpy arrays, i.e. the columns we add nonce to - :return: np.array of strings with nonces + Parameters + ---------- + string_array: Union[str, int, float, np.array] + This can be a number, a string or a numpy array of values + (e.g. a DataFrame column) + + Returns + ------- + np.array: + Array of strings with nonces """ return ( "".join(random.choice(string.hexdigits) for i in range(12)) @@ -24,7 +38,9 @@ def add_nonce_func(string_array): ) -def add_id_owner_col(private_df, cols_to_hash): +def add_id_owner_col( + private_df: pd.DataFrame, cols_to_hash: Tuple[str] +) -> pd.DataFrame: """ This function uses the columns of the "private_df" database to generate an hash value and it creates an "ID_OWNER" column with those values. @@ -36,14 +52,14 @@ def add_id_owner_col(private_df, cols_to_hash): Parameters ---------- private_df: pd.DataFrame - Pandas.DataFrame with the owner's private data + DataFrame with the owner's private data cols_to_hash: Tuple[str] This is a list of column names with the infos we want to hash Returns ------- pd.DataFrame - Pandas.DataFrame similar to ``private_df`` with a new "ID_OWNER" column + DataFrame similar to ``private_df`` with a new "ID_OWNER" column """ # Turn rows into strings to be used rows_into_strings = np.sum( @@ -74,14 +90,14 @@ def create_private_info_db( contained in the columns ``private_cols_to_map`` needed to identify them. The function will also add a unique owner ID (in the column "OWNER_ID") that is hashed based on ``private_cols_to_map``. 
- In case there are multiple rows with the same private infos + In case there are multiple rows with the same private info (e.g.: multiple data from the same customer), only one of those rows is included in the returned DataFrame. Parameters ---------- df: pd.DataFrame - Pandas.DataFrame that we will anonymize + DataFrame that we will anonymize private_cols_to_map: Tuple[str] List of the columns that will be stored in the private_db that will be returned, along with the new "ID_OWNER" @@ -89,13 +105,13 @@ def create_private_info_db( Returns ------- pd.DataFrame - Pandas.DataFrame with the values of the ``private_cols_to_map`` and + DataFrame with the values of the ``private_cols_to_map`` and their hashed value in the column "ID_OWNER" """ - # Create the private_db with the columns with private infos only + # Create the private_db with the columns with private info only private_df = df[private_cols_to_map] - # In case there are multiple rows with the same private infos + # In case there are multiple rows with the same private info # (e.g.: multiple data from the same customer), only one of these rows # should be included in ``private_df`` private_df.drop_duplicates(inplace=True) @@ -113,7 +129,7 @@ def anonymize_data( private_cols_to_map: Tuple[str], dest_path: Union[Path, str], random_seed: int = 42, -) -> Tuple[pd.DataFrame]: +) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Separate generic from private data leaving a unique ID as map between them. @@ -130,7 +146,7 @@ def anonymize_data( Parameters ---------- df: pd.DataFrame - Pandas.DataFrame that we will anonymize + DataFrame that we will anonymize file_name: str Name of the database we are working on (no ".csv" suffix). Used as prefix when saving csv output files. @@ -147,7 +163,7 @@ def anonymize_data( Returns ------- pd.DataFrame - Pandas DataFrame containing only the private infos ``private_cols_to_map``, + Pandas DataFrame containing only the private info ``private_cols_to_map``, along with another column "ID_OWNER" that allows to map these private informations to the data in the other DataFrame. This file is also saved to "[``dest_path``] / [``file_name``]_private_info.csv" file. diff --git a/src/tests/dataframewithinfo_util.py b/src/tests/dataframewithinfo_util.py index 543025c..428f1a1 100644 --- a/src/tests/dataframewithinfo_util.py +++ b/src/tests/dataframewithinfo_util.py @@ -389,7 +389,7 @@ def df_with_private_info(private_cols: Tuple[str]): The returned DataFrame mock contains (len(private_cols) + 2) columns and 5 rows. Particularly it contains the columns listed in ``private_cols`` - with string values,and 2 data columns containing + with string values, and 2 data columns containing integer values. Two of these rows have same values in ``private_cols`` columns, but different values in the other 2 data columns (this could be simulating a DataFrame @@ -405,7 +405,7 @@ def df_with_private_info(private_cols: Tuple[str]): pd.DataFrame DataFrame mock containing (len(private_cols) + 2) columns and 5 rows. Particularly it contains the columns listed in ``private_cols`` - with generic string values,and 2 data columns containing + with generic string values, and 2 data columns containing integer values. 
""" diff --git a/src/tests/integration/test_anonymize_database.py b/src/tests/integration/test_anonymize_database.py index b8e9191..e257f6d 100644 --- a/src/tests/integration/test_anonymize_database.py +++ b/src/tests/integration/test_anonymize_database.py @@ -8,7 +8,7 @@ @pytest.mark.parametrize( "private_cols_to_remove, private_cols_to_map, " - + "expected_anonym_df, expected_private_df", + "expected_anonym_df, expected_private_df", [ ( ["private_col_a", "private_col_b"], @@ -71,7 +71,6 @@ def test_anonymize_data( expected_anonym_df, expected_private_df, ): - original_df = DataFrameMock.df_with_private_info(private_cols=private_cols_to_map) anonym_df, private_df = anonymize_data( From 37aa71c6060f889b3b9b24f77582f629926e82de Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Thu, 27 Aug 2020 21:34:27 +0200 Subject: [PATCH 17/18] Formatted according to flake8 --- src/pd_extras/anonymize_database.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index 35e76b4..7012a94 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -14,7 +14,7 @@ def add_nonce_func( ) -> Union[str, int, float, np.array]: """ Add random prefix and suffix to an array of strings ``string_array`` - + This function takes an array of strings passed as ``string_array`` and attaches nonces (random prefix and suffix) to each string. It can also be used in a vectorized way @@ -23,9 +23,9 @@ def add_nonce_func( Parameters ---------- string_array: Union[str, int, float, np.array] - This can be a number, a string or a numpy array of values + This can be a number, a string or a numpy array of values (e.g. a DataFrame column) - + Returns ------- np.array: From 1fd2ef827fc40ac5f7e37e03f16f855246c172c7 Mon Sep 17 00:00:00 2001 From: Lorenzo Gorini Date: Fri, 28 Aug 2020 11:49:56 +0200 Subject: [PATCH 18/18] Fixed minor typos docstrings --- src/pd_extras/anonymize_database.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pd_extras/anonymize_database.py b/src/pd_extras/anonymize_database.py index 7012a94..814addc 100644 --- a/src/pd_extras/anonymize_database.py +++ b/src/pd_extras/anonymize_database.py @@ -28,7 +28,7 @@ def add_nonce_func( Returns ------- - np.array: + np.array Array of strings with nonces """ return ( @@ -163,12 +163,12 @@ def anonymize_data( Returns ------- pd.DataFrame - Pandas DataFrame containing only the private info ``private_cols_to_map``, + DataFrame containing only the private info ``private_cols_to_map``, along with another column "ID_OWNER" that allows to map these private informations to the data in the other DataFrame. This file is also saved to "[``dest_path``] / [``file_name``]_private_info.csv" file. pd.DataFrame - Pandas DataFrame containing the same infos as the DataFrame ``df``, but + DataFrame containing the same infos as the DataFrame ``df``, but the columns "private_cols_to_remove" have been replaced by "ID_OWNER" column. This file is also saved to "[``dest_path``] / [``file_name``]_anonym.csv"
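
For context on PATCH 13/18 through PATCH 18/18: the "ID_OWNER" values come
from hashing the private columns with nonces. A minimal sketch of that
scheme (simplified from ``add_nonce_func``/``add_id_owner_col``; the real
code seeds ``random`` once inside ``anonymize_data`` and vectorizes the
hashing over DataFrame rows):

    import hashlib
    import random
    import string

    def _nonce() -> str:
        # 12 random hex characters, as in add_nonce_func
        return "".join(random.choice(string.hexdigits) for _ in range(12))

    def hash_owner(row_as_string: str) -> str:
        # random prefix + row values + random suffix, then SHA-256
        salted = _nonce() + row_as_string + _nonce()
        return hashlib.sha256(salted.encode()).hexdigest()

    random.seed(42)  # the random_seed argument added in PATCH 13/18
    print(hash_owner("col_0_value_0col_1_value_0col_2_value_0"))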