From e9bc44d199e36ed5ae7004fbf84585b730957149 Mon Sep 17 00:00:00 2001 From: Samuel Stock Date: Fri, 10 May 2024 13:30:47 +0100 Subject: [PATCH 1/3] adding anonymise function --- src/pprl/app/utils.py | 7 ++----- src/pprl/embedder/embedder.py | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/pprl/app/utils.py b/src/pprl/app/utils.py index 83e5e43..ca79f42 100644 --- a/src/pprl/app/utils.py +++ b/src/pprl/app/utils.py @@ -125,7 +125,7 @@ def convert_dataframe_to_bf( feature types to be processed as appropriate. other_columns: list[str] Columns to be returned as they appear in the data in addition to - `bf_indices` and `bf_norms`. + `bf_indices`, `bf_norms` and `thresholds`. salt: str Cryptographic salt to add to tokens before hashing. @@ -134,10 +134,7 @@ def convert_dataframe_to_bf( output: pandas.DataFrame Data frame of bloom-filtered data. """ - if other_columns is None: - other_columns = [] - output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"] NGRAMS = [1, 2, 3, 4] FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}} @@ -156,6 +153,6 @@ def convert_dataframe_to_bf( ) df_bloom_filter = embedder.embed(df, colspec, update_norms=True, update_thresholds=True) - output = df_bloom_filter[output_columns] + output = df_bloom_filter.anonymise(other_columns) return output diff --git a/src/pprl/embedder/embedder.py b/src/pprl/embedder/embedder.py index 8f1e2fc..5ad4524 100644 --- a/src/pprl/embedder/embedder.py +++ b/src/pprl/embedder/embedder.py @@ -157,6 +157,29 @@ def update_norms(self) -> "EmbeddedDataFrame": return self + def anonymise(self, keep: None | list = None) -> "EmbeddedDataFrame": + """Remove raw data from embedded dataframe. + + Remove all columns from the embedded dataframe except columns listed + in keep and `bf_indices`, `bf_norms` and `thresholds`. 
+ + Parameters + ---------- + keep: list[str] + Columns to be returned as they appear in the data in addition to + `bf_indices`, `bf_norms` and `thresholds` if they are present in + the data. + """ + + if keep is None: + keep = [] + + output_columns = keep + ["bf_indices", "bf_norms", "thresholds"] + output_columns = [column for column in output_columns if column in self.columns] + # remove duplicate column names + output_columns = list(dict.fromkeys(output_columns)) + return self[output_columns] + class SimilarityArray(np.ndarray): """Augmented NumPy array of similarity scores with extra attributes. From 02cd28e8df2cfc10898847cb2738f19d9ed60f75 Mon Sep 17 00:00:00 2001 From: Samuel Stock Date: Mon, 13 May 2024 16:31:49 +0100 Subject: [PATCH 2/3] adding a test for the anonymise feature of the embedder --- src/pprl/embedder/embedder.py | 2 +- test/embedder/test_embedder.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/pprl/embedder/embedder.py b/src/pprl/embedder/embedder.py index 5ad4524..24771b2 100644 --- a/src/pprl/embedder/embedder.py +++ b/src/pprl/embedder/embedder.py @@ -175,7 +175,7 @@ def anonymise(self, keep: None | list = None) -> "EmbeddedDataFrame": keep = [] output_columns = keep + ["bf_indices", "bf_norms", "thresholds"] - output_columns = [column for column in output_columns if column in self.columns] + output_columns = [column for column in self.columns if column in output_columns] # remove duplicate column names output_columns = list(dict.fromkeys(output_columns)) return self[output_columns] diff --git a/test/embedder/test_embedder.py b/test/embedder/test_embedder.py index 5684829..35e9105 100644 --- a/test/embedder/test_embedder.py +++ b/test/embedder/test_embedder.py @@ -75,6 +75,33 @@ def test_update_norms(posdef_matrix): assert bf_norms1 == bf_norms2 +@given(st_posdef_matrices(bf_size=10)) +def test_anonymise(posdef_matrix): + """Tests EmbeddedDataFrame.anonymise. 
+ + Test that the columns in the keep list are returned in their + original order in addition to the bf_indices column. + """ + + nrows = len(posdef_matrix) + df = pd.DataFrame( + dict( + idx=[1] * nrows, + firstname=["Fred"] * nrows, + age=[43] * nrows, + lastname=["Hogan O'Malley"] * nrows, + bf_indices=[45] * nrows, + ) + ) + embedder_mock = mock.Mock(Embedder) + embedder_mock.scm_matrix = posdef_matrix + embedder_mock.checksum = "1234" + edf = EmbeddedDataFrame(df, embedder_mock, update_norms=False, update_thresholds=False) + + edf_anonymised = edf.anonymise(keep=["age", "lastname", "idx", "age"]) + assert list(edf_anonymised.columns) == ["idx", "age", "lastname", "bf_indices"] + + def test_embed_colspec(): """Check that only the name column in the colspec is processed.""" From e401ced00842f034086cbf047b3739664e79a296 Mon Sep 17 00:00:00 2001 From: Samuel Stock Date: Mon, 13 May 2024 16:44:34 +0100 Subject: [PATCH 3/3] replacing positive definite matrix with identity matrix --- test/embedder/test_embedder.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/test/embedder/test_embedder.py b/test/embedder/test_embedder.py index 35e9105..f00b4f3 100644 --- a/test/embedder/test_embedder.py +++ b/test/embedder/test_embedder.py @@ -75,26 +75,25 @@ def test_update_norms(posdef_matrix): assert bf_norms1 == bf_norms2 -@given(st_posdef_matrices(bf_size=10)) -def test_anonymise(posdef_matrix): +def test_anonymise(): """Tests EmbeddedDataFrame.anonymise. Test that the columns in the keep list are returned in their original order in addition to the bf_indices column. 
""" - nrows = len(posdef_matrix) + matrix = np.eye(5) df = pd.DataFrame( dict( - idx=[1] * nrows, - firstname=["Fred"] * nrows, - age=[43] * nrows, - lastname=["Hogan O'Malley"] * nrows, - bf_indices=[45] * nrows, + idx=[1], + firstname=["Fred"], + age=[43], + lastname=["Hogan O'Malley"], + bf_indices=[45], ) ) embedder_mock = mock.Mock(Embedder) - embedder_mock.scm_matrix = posdef_matrix + embedder_mock.scm_matrix = matrix embedder_mock.checksum = "1234" edf = EmbeddedDataFrame(df, embedder_mock, update_norms=False, update_thresholds=False)