adding anonymise function

datasciencecampus · May 10, 2024 · e9bc44d · e9bc44d
1 parent e18df4c
commit e9bc44d
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 5 deletions.
diff --git a/src/pprl/app/utils.py b/src/pprl/app/utils.py
@@ -125,7 +125,7 @@ def convert_dataframe_to_bf(
         feature types to be processed as appropriate.
     other_columns: list[str]
         Columns to be returned as they appear in the data in addition to
-        `bf_indices` and `bf_norms`.
+        `bf_indices`, `bf_norms` and `thresholds`.
     salt: str
         Cryptographic salt to add to tokens before hashing.
 
@@ -134,10 +134,7 @@ def convert_dataframe_to_bf(
     output: pandas.DataFrame
         Data frame of bloom-filtered data.
     """
-    if other_columns is None:
-        other_columns = []
 
-    output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"]
     NGRAMS = [1, 2, 3, 4]
     FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}}
 
@@ -156,6 +153,6 @@ def convert_dataframe_to_bf(
     )
 
     df_bloom_filter = embedder.embed(df, colspec, update_norms=True, update_thresholds=True)
-    output = df_bloom_filter[output_columns]
+    output = df_bloom_filter.anonymise(other_columns)
 
     return output
diff --git a/src/pprl/embedder/embedder.py b/src/pprl/embedder/embedder.py
@@ -157,6 +157,29 @@ def update_norms(self) -> "EmbeddedDataFrame":
 
         return self
 
+    def anonymise(self, keep: None | list = None) -> "EmbeddedDataFrame":
+        """Remove raw data from embedded dataframe.
+
+        Remove all columns from the embedded dataframe except columns listed
+        in keep and `bf_indices`, `bf_norms` and `thresholds`.
+
+        Parameters
+        ----------
+        keep: list[str]
+            Columns to be returned as they appear in the data in addition to
+            `bf_indices`, `bf_norms` and `thresholds` if they are present in
+            the data.
+        """
+
+        if keep is None:
+            keep = []
+
+        output_columns = keep + ["bf_indices", "bf_norms", "thresholds"]
+        output_columns = [column for column in output_columns if column in self.columns]
+        # remove duplicate column names
+        output_columns = list(dict.fromkeys(output_columns))
+        return self[output_columns]
+
 
 class SimilarityArray(np.ndarray):
     """Augmented NumPy array of similarity scores with extra attributes.