Skip to content

Commit

Permalink
adding anonymise function
Browse files Browse the repository at this point in the history
  • Loading branch information
Samuel Stock authored and Samuel Stock committed May 10, 2024
1 parent e18df4c commit e9bc44d
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 5 deletions.
7 changes: 2 additions & 5 deletions src/pprl/app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def convert_dataframe_to_bf(
feature types to be processed as appropriate.
other_columns: list[str]
Columns to be returned as they appear in the data in addition to
`bf_indices` and `bf_norms`.
`bf_indices`, `bf_norms` and `thresholds`.
salt: str
Cryptographic salt to add to tokens before hashing.
Expand All @@ -134,10 +134,7 @@ def convert_dataframe_to_bf(
output: pandas.DataFrame
Data frame of bloom-filtered data.
"""
if other_columns is None:
other_columns = []

output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"]
NGRAMS = [1, 2, 3, 4]
FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}}

Expand All @@ -156,6 +153,6 @@ def convert_dataframe_to_bf(
)

df_bloom_filter = embedder.embed(df, colspec, update_norms=True, update_thresholds=True)
output = df_bloom_filter[output_columns]
output = df_bloom_filter.anonymise(other_columns)

return output
23 changes: 23 additions & 0 deletions src/pprl/embedder/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,29 @@ def update_norms(self) -> "EmbeddedDataFrame":

return self

def anonymise(self, keep: None | list = None) -> "EmbeddedDataFrame":
"""Remove raw data from embedded dataframe.
Remove all columns from the embedded dataframe expect columns listed
in keep and `bf_indices`, `bf_norms` and `thresholds`.
Returns
-------
keep: list[str]
Columns to be returned as they appear in the data in addition to
`bf_indices`, `bf_norms` and `thresholds` if they are present in
the data.
"""

if keep is None:
keep = []

output_columns = keep + ["bf_indices", "bf_norms", "thresholds"]
output_columns = [column for column in output_columns if column in self.columns]
# remove duplicate column names
output_columns = list(dict.fromkeys(output_columns))
return self[output_columns]


class SimilarityArray(np.ndarray):
"""Augmented NumPy array of similarity scores with extra attributes.
Expand Down

0 comments on commit e9bc44d

Please sign in to comment.