From 7dd1f7a27319b25f5b51d4f6b916e0ce01977e4c Mon Sep 17 00:00:00 2001 From: Mat Weldon <35453679+matweldon@users.noreply.github.com> Date: Thu, 4 Apr 2024 12:05:23 +0100 Subject: [PATCH] 46: Final docs check through (#47) * Docs readthrough * Changed pprl to PPRL Toolkit in verknupfung tutorial --- .github/ISSUE_TEMPLATE/bug_report.md | 7 ++- .github/ISSUE_TEMPLATE/feature-idea.md | 3 +- README.md | 14 +++--- _quarto.yml | 14 ++++-- docs/tutorials/example-verknupfung.qmd | 43 +++++++++++++------ docs/tutorials/in-the-cloud.qmd | 47 ++++++++------------ docs/tutorials/index.qmd | 4 +- index.qmd | 8 ++-- pyproject.toml | 4 +- src/pprl/config.py | 4 +- src/pprl/embedder/embedder.py | 59 ++++++++++++++++++-------- src/pprl/embedder/features.py | 13 +++--- src/pprl/encryption.py | 2 +- 13 files changed, 127 insertions(+), 95 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 7fb4e98..8fc5388 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,7 +7,7 @@ assignees: '' --- -Please be aware that, as the `pprl_toolkit` is an experimental package, ONS cannot promise to resolve bugs. +Please be aware that, as pprl is an experimental package, ONS cannot promise to resolve bugs. ### Describe the bug A clear and concise description of what the bug is. @@ -22,7 +22,7 @@ Steps to reproduce the behaviour: ### Expected behaviour A clear and concise description of what you expected to happen. -### Evidence (tracebacks and screenshots +### Evidence (tracebacks and screenshots) If applicable, please add any tracebacks or screenshots to help explain your problem. ### System information @@ -30,8 +30,7 @@ Please provide the following information about your environment: - OS: [e.g. macOS] - Browser (when using the client-side app or GCP): [e.g. Chrome, Safari] - - `pprl_toolkit` version: [e.g. 0.0.1] + - pprl version: [e.g. 0.0.1] ### Additional context Add any other context about the problem here. - diff --git a/.github/ISSUE_TEMPLATE/feature-idea.md b/.github/ISSUE_TEMPLATE/feature-idea.md index 5f9bf77..5278030 100644 --- a/.github/ISSUE_TEMPLATE/feature-idea.md +++ b/.github/ISSUE_TEMPLATE/feature-idea.md @@ -7,7 +7,7 @@ assignees: '' --- -Please be aware that, as the `pprl_toolkit` is an experimental package, ONS cannot promise to implement feature ideas. +Please be aware that, as pprl is an experimental package, ONS cannot promise to implement feature ideas. ### Does your feature idea solve a problem? If this applies to your idea, please provide a clear and concise description of what the problem is. @@ -20,4 +20,3 @@ A clear and concise description of any alternative solutions or features you've ### Additional context Add any other context or screenshots about the feature request here. - diff --git a/README.md b/README.md index 34ab35b..254b4ec 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ ![ONS and DSC logos](https://github.com/datasciencecampus/awesome-campus/blob/master/ons_dsc_logo.png) -# `pprl_toolkit`: a toolkit for privacy-preserving record linkage +# PPRL Toolkit: A toolkit for Privacy-Preserving Record Linkage > "We find ourselves living in a society which is rich with data and the opportunities that comes with this. Yet, when disconnected, this data is limited in its usefulness. ... Being able to link data will be vital for enhancing our understanding of society, driving policy change for greater public good." 
Sir Ian Diamond, the National Statistician -The Privacy Preserving Record Linkage (PPRL) toolkit demonstrates the feasibility of record linkage in difficult 'eyes off' settings. It has been designed for a situation where two organisations (perhaps in different jurisdictions) want to link their datasets at record level, to enrich the information they contain, but neither party is able to send sensitive personal identifiers -- such as names, addresses or dates of birth -- to the other. Building on [previous ONS research](https://www.gov.uk/government/publications/joined-up-data-in-government-the-future-of-data-linking-methods/privacy-preserving-record-linkage-in-the-context-of-a-national-statistics-institute), the toolkit implements a well-known privacy-preserving linkage method in a new way to improve performance, and wraps it in a secure cloud architecture to demonstrate the potential of a layered approach. +The Privacy Preserving Record Linkage (PPRL) toolkit demonstrates the feasibility of record linkage in difficult 'eyes off' settings. It has been designed for a situation where two organisations (perhaps in different jurisdictions) want to link their datasets at record level, to enrich the information they contain, but neither party is able to send sensitive personal identifiers - such as names, addresses or dates of birth - to the other. Building on [previous ONS research](https://www.gov.uk/government/publications/joined-up-data-in-government-the-future-of-data-linking-methods/privacy-preserving-record-linkage-in-the-context-of-a-national-statistics-institute), the toolkit implements a well-known privacy-preserving linkage method in a new way to improve performance, and wraps it in a secure cloud architecture to demonstrate the potential of a layered approach. The toolkit has been developed by data scientists at the [Data Science Campus](https://datasciencecampus.ons.gov.uk/) of the UK Office for National Statistics. This project has benefitted from early collaborations with colleagues at NHS England. @@ -13,7 +13,9 @@ The two parts of the toolkit are: * a Python package for privacy-preserving record linkage with Bloom filters and hash embeddings, that can be used locally with no cloud set-up * instructions, scripts and resources to run record linkage in a cloud-based secure enclave. This part of the toolkit requires you to set up Google Cloud accounts with billing -We're publishing the repo as a prototype and teaching tool. Please feel free to download, adapt and experiment with it in compliance with the open-source license. You can submit issues [here](https://github.com/datasciencecampus/pprl_toolkit/issues). However, as this is an experimental repo, the development team cannot commit to maintaining the repo or responding to issues. If you'd like to collaborate with us, to put these ideas into practice for the public good, please [get in touch](https://datasciencecampus.ons.gov.uk/contact/). +We're publishing the repo as a prototype and teaching tool. Please feel free to download, adapt and experiment with it in compliance with the open-source license. The reference documentation and tutorials are published [here](https://datasciencecampus.github.io/pprl_toolkit). You can submit issues [here](https://github.com/datasciencecampus/pprl_toolkit/issues). However, as this is a prototype, the development team cannot commit to maintaining the repo indefinitely or responding to all issues. 
+ +This toolkit is not assured for use in production settings, but we believe the tools and methods demonstrated here have great potential for positive impact with further development and adaptation. If you'd like to collaborate with us, to put these ideas into practice for the public good, please [get in touch](https://datasciencecampus.ons.gov.uk/contact/). ## Installation @@ -84,7 +86,7 @@ matching. We will use the toolkit to identify these matches. > These datasets don't have the same column names or follow the same encodings, > and there are several spelling mistakes in the names of the band members. > -> Thankfully, the `pprl_toolkit` is flexible enough to handle this! +> Thankfully, the PPRL Toolkit is flexible enough to handle this! ### Creating and assigning a feature factory @@ -148,7 +150,7 @@ Lastly, we compute the matching using an adapted Hungarian algorithm with local ``` -So, all three of the records in each dataset were matched correctly. Excellent! +So, all three of the records in each dataset were matched correctly. Excellent! You can find a longer version of this tutorial [here](https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-verknupfung.html). ## Working in the cloud @@ -169,7 +171,7 @@ parties, a workload author, and a workload operator. These roles can be summaris - The workload **operator** sets up and runs the Confidential Space virtual machine, which uses the Docker image to perform the record linkage. -We have set up `pprl_toolkit` to allow any configuration of these roles among +We have set up the PPRL Toolkit to allow any configuration of these roles among users. You could do it all yourself, split the workload roles between two data owning-parties, or ask a trusted third party to maintain the workload. diff --git a/_quarto.yml b/_quarto.yml index 14530c7..f54111e 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -2,7 +2,7 @@ project: type: website website: - title: "`pprl`" + title: "**pprl**" navbar: left: - href: index.qmd @@ -15,9 +15,9 @@ website: - icon: github menu: - text: Source code - url: https://github.com/datasciencecampus/pprl + url: https://github.com/datasciencecampus/pprl_toolkit - text: Open an issue - url: https://github.com/datasciencecampus/pprl/issues + url: https://github.com/datasciencecampus/pprl_toolkit/issues sidebar: style: docked search: true @@ -75,3 +75,11 @@ quartodoc: package: pprl.app contents: - utils + - title: Server functions + desc: > + Functions for the matching workload server. Used in `scripts/server.py` + package: pprl.matching + contents: + - cloud + - local + - perform diff --git a/docs/tutorials/example-verknupfung.qmd b/docs/tutorials/example-verknupfung.qmd index 025efe9..16a3ec3 100644 --- a/docs/tutorials/example-verknupfung.qmd +++ b/docs/tutorials/example-verknupfung.qmd @@ -22,7 +22,7 @@ df1 = pd.DataFrame( { "first_name": ["Laura", "Kaspar", "Grete"], "last_name": ["Daten", "Gorman", "Knopf"], - "gender": ["f", "m", "f"], + "gender": ["F", "M", "F"], "date_of_birth": ["01/03/1977", "31/12/1975", "12/7/1981"], "instrument": ["bass", "guitar", "drums"], } @@ -37,11 +37,12 @@ df2 = pd.DataFrame( ) ``` -> [!NOTE] -> These datasets don't have the same column names or follow the same encodings, -> and there are several spelling mistakes in the names of the band members, as well as a typo in the dates. -> -> Thankfully, the `pprl_toolkit` is flexible enough to handle this! 
+::: {.callout-note} +These datasets don't have the same column names or follow the same encodings, +and there are several spelling mistakes in the names of the band members, as well as a typo in the dates. + +Thankfully, the PPRL Toolkit is flexible enough to handle this! +::: ### Creating and assigning a feature factory @@ -72,24 +73,27 @@ spec1 = dict( spec2 = dict(name="name", sex="sex", main_instrument="instrument", birth_date="dob") ``` -> [!TIP] -> The feature generation functions, `features.gen_XXX_features` have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. -> There are two ways to achieve this. Either use `functools.partial` to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the `Embedder` as `ff_args`. +::: {.callout-tip} +The feature generation functions, `features.gen_XXX_features` have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. +There are two ways to achieve this. Either use `functools.partial` to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the `Embedder` as `ff_args`. +::: ### Embedding the data With our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of -the filter and the number of hashes. By default, these are `1024` and `2`, +the filter and the number of hashes. By default, these are 1024 and 2, respectively. Once we've decided, we can create our `Embedder` instance and use it to embed our data with their column specifications. ```{python} +#| warning: false from pprl.embedder.embedder import Embedder embedder = Embedder(factory, bf_size=1024, num_hashes=2) + edf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True) edf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True) ``` @@ -103,15 +107,26 @@ three additional columns: `bf_indices`, `bf_norms` and `thresholds`. edf1.columns ``` -The `bf_indices` column contains the Bloom filters, represented compactly as a list of non-zero indices for each record. The `bf_norms` column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to `np.sqrt(len(bf_indices[i]))` for record `i`. The norm is used to scale the similarity measures so that they take values between -1 and 1. +The `bf_indices` column contains the Bloom filters, represented compactly as a list of non-zero indices for each record. + +```{python} +print(edf1.bf_indices[0]) +``` -The `thresholds` column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It's like a reserve price in an auction -- it stops a record being matched to another record when the similarity isn't high enough. In this feature, the method implemented here differs from other linkage methods, which typically only have one global threshold score for the entire dataset. +The `bf_norms` column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. 
In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to `np.sqrt(len(bf_indices[i]))` for record `i`. The norm is used to scale the similarity measures so that they take values between -1 and 1. + +The `thresholds` column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It's like a reserve price in an auction -- it stops a record being matched to another record when the similarity isn't high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset. + +```{python} +print(edf1.loc[:,["bf_norms","thresholds"]]) +print(edf2.loc[:,["bf_norms","thresholds"]]) +``` ### The processed features -Let's take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how `pprl_toolkit` puts them into a format where they can be compared. +Let's take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared. First, we'll look at date of birth: @@ -129,7 +144,7 @@ print(edf1.first_name_features[0] + edf1.last_name_features[0]) print(edf2.name_features[0]) ``` -The two datasets store the names differently, but this doesn't matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams, 3-grams and 4-grams. +The two datasets store the names differently, but this doesn't matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams. The sex processing function just converts different formats to lowercase and takes the first letter. This will often be enough: diff --git a/docs/tutorials/in-the-cloud.qmd b/docs/tutorials/in-the-cloud.qmd index e5dd538..70876f9 100644 --- a/docs/tutorials/in-the-cloud.qmd +++ b/docs/tutorials/in-the-cloud.qmd @@ -4,7 +4,7 @@ description: > Get you and your collaborators performing linkage in the cloud --- -This tutorial provides an overview of how to use `pprl_toolkit` on +This tutorial provides an overview of how to use the PPRL Toolkit on Google Cloud Platform (GCP). We go over how to assemble and assign roles in a linkage team, how to set up everybody's projects, and end with executing the linkage itself. @@ -34,14 +34,14 @@ model that allows one of the data-owning parties to author the workload while the other is the operator. ::: {.callout-tip} -In fact, `pprl_toolkit` is set up to allow any configuration of these roles +In fact, the PPRL Toolkit is set up to allow any configuration of these roles among up to four people. ::: In any case, you must decide who will be doing what from the outset. Each role comes with different responsibilities, but all roles require a GCP account and access to the `gcloud` command-line tool. 
Additionally, everyone in the linkage -project will need to install `pprl_toolkit`. +project will need to install the PPRL Toolkit. ### Data-owning party @@ -89,34 +89,23 @@ unique. This will ensure that bucket names are also globally unique. Our aim is to create a globally unique name (and thus ID) for each project. ::: -For example, say the US Census Bureau and UK Office for National Statistics -(ONS) are looking to link some data on ex-patriated residents with PPRL. Then -they might use `us-cb` and `uk-ons` as their party names, which are succinct +For example, say a UK bank and a US bank are looking to link some data on international +transactions to fit a machine learning model to predict fraud. Then +they might use `us-eaglebank` and `uk-royalbank` as their party names, which are succinct and descriptive. However, they are generic and rule out future PPRL projects with the same names. -As a remedy, they could make a hash of their project description to create an +As a remedy, they could make a short hash of their project description to create an identifier: ```bash -$ echo -n "pprl us-cb uk-ons ex-pats-analysis" | sha256sum -d59a50241dc78c3f926b565937b99614b7bb7c84e44fb780440718cb2b0ddc1b - +$ echo -n "pprl us-eaglebank uk-royalbank fraud apr 2024" | sha256sum | cut -c 1-7 +4fb6720 ``` -This is very long. You might only want to use the first few characters of this -hash. Note that Google Cloud bucket names also can't be more than 63 characters -long without dots. - -You can trim it down like so: - -```bash -$ echo -n "pprl us-cb uk-ons ex-pats-analysis" | sha256sum | cut -c 1-7 -d59a502 -``` - -So, our names would be: `uk-ons-d59a502`, `us-cb-d59a502`. If they had a +So, our project names would be: `uk-royalbank-4fb6720`, `us-eaglebank-4fb6720`. If they had a third-party linkage administrator (authoring and operating the workload), they -would have a project called something like `admin-d59a502`. +would have a project called something like `admin-4fb6720`. ## Setting up your projects @@ -169,15 +158,15 @@ The workload operator requires three IAM roles: | Storage Admin | `roles/storage.admin` | Managing a shared bucket | -## Configuring `pprl_toolkit` +## Configuring the PPRL Toolkit Now your linkage team has its projects made up, you need to configure -`pprl_toolkit`. This configuration tells the package where to look and what to +the PPRL Toolkit. This configuration tells the package where to look and what to call things; we do this with a single environment file containing a short collection of key-value pairs. We have provided an example environment file in `.env.example`. Copy or rename -that file to `.env` in the root of the `pprl_toolkit` installation. Then, fill +that file to `.env` in the root of the PPRL Toolkit installation. Then, fill in your project details as necessary. For our example above, let's say the ONS will be the workload author and the US @@ -185,16 +174,16 @@ Census Bureau will be the workload operator. 
The environment file would look something like this: ```bash -PARTY_1_PROJECT=us-cb-d59a502 +PARTY_1_PROJECT=uk-royalbank-4fb6720 PARTY_1_KEY_VERSION=1 -PARTY_2_PROJECT=uk-ons-d59a502 +PARTY_2_PROJECT=us-eaglebank-4fb6720 PARTY_2_KEY_VERSION=1 -WORKLOAD_AUTHOR_PROJECT=uk-ons-d59a502 +WORKLOAD_AUTHOR_PROJECT=uk-royalbank-4fb6720 WORKLOAD_AUTHOR_PROJECT_REGION=europe-west2 -WORKLOAD_OPERATOR_PROJECT=us-cb-d59a502 +WORKLOAD_OPERATOR_PROJECT=us-eaglebank-4fb6720 WORKLOAD_OPERATOR_PROJECT_ZONE=us-east4-a ``` diff --git a/docs/tutorials/index.qmd b/docs/tutorials/index.qmd index 0fa58c3..ccd5c55 100644 --- a/docs/tutorials/index.qmd +++ b/docs/tutorials/index.qmd @@ -9,8 +9,8 @@ listing: filter-ui: false --- -These tutorials walk you through some of the essential workflows for `pprl`. -The purpose of these documents is for you to learn how to use the `pprl` +These tutorials walk you through some of the essential workflows for pprl. +The purpose of these documents is for you to learn how to use the pprl package for your own linkage projects.
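
As a companion to the shell pipeline in the cloud tutorial above, the same project-naming hash can be computed in Python. This is a minimal illustrative sketch, not part of the toolkit itself; it assumes the description string and party names from the worked example and simply mirrors the `sha256sum | cut -c 1-7` pipeline with `hashlib`:

```python
import hashlib

# Project description agreed by the linkage team (illustrative value taken from
# the tutorial above). `sha256sum` hashes exactly the bytes that `echo -n` emits,
# so no trailing newline is added here either.
description = "pprl us-eaglebank uk-royalbank fraud apr 2024"

# Keep the first seven hex characters of the digest, mirroring `cut -c 1-7`.
suffix = hashlib.sha256(description.encode("utf-8")).hexdigest()[:7]

# Append the shared suffix to each party's short name to get succinct, globally
# unique project names for the environment file shown above.
for party in ("uk-royalbank", "us-eaglebank"):
    print(f"{party}-{suffix}")
```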
diff --git a/index.qmd b/index.qmd index 6be93bf..22e8e58 100644 --- a/index.qmd +++ b/index.qmd @@ -1,5 +1,5 @@ --- -title: Welcome to the `pprl` documentation! +title: Welcome to the **pprl** documentation! toc: false sidebar: false about: @@ -12,13 +12,13 @@ about: ## What is this and why does it exist? -This package, `pprl`, implements a method for performing -`p`rivacy-`p`reserving `r`ecord `l`inkage. This linkage can be done +This package, **pprl**, implements a method for performing +Privacy Preserving Record Linkage. This linkage can be done locally or through Google Cloud Platform. ## Where do I go now? -If you're looking to get stuck in with `pprl`, head over to our +If you're looking to get stuck in with pprl, head over to our [tutorials](docs/tutorials/index.qmd). For more focused, technical details of how this all works, see our diff --git a/pyproject.toml b/pyproject.toml index 931c3df..6adb035 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ authors = [ {name = "Henry Wilde"}, {name = "Data Science Campus", email = "datacampus@ons.gov.uk"}, ] -description = "Privacy-preserving record linkage via bloom filter embedding" +description = "Privacy-preserving record linkage via Bloom filter embedding" readme = "README.md" requires-python = ">=3.10" license = {text = "MIT License"} @@ -33,7 +33,7 @@ dependencies = [ ] [project.urls] -homepage = "https://github.com/datasciencecampus/pprl" +homepage = "https://github.com/datasciencecampus/pprl_toolkit" [project.optional-dependencies] lint = ["ruff==0.3.0", "mypy"] diff --git a/src/pprl/config.py b/src/pprl/config.py index e26323f..df66775 100644 --- a/src/pprl/config.py +++ b/src/pprl/config.py @@ -11,7 +11,7 @@ def _find_directory(kind: str, what: str | None = None) -> Path: """ - Find a directory in the root of the `pprl` installation. + Find a directory in the root of the pprl installation. Parameters ---------- @@ -43,7 +43,7 @@ def load_environment(path: None | str = None) -> dict[str, None | str]: ---------- path : str, optional Location of the configuration file to load. If not specified, - try to load the configuration file from the root of the `pprl` + try to load the configuration file from the root of the pprl installation called `.env`. Returns diff --git a/src/pprl/embedder/embedder.py b/src/pprl/embedder/embedder.py index 22fd68a..8f1e2fc 100644 --- a/src/pprl/embedder/embedder.py +++ b/src/pprl/embedder/embedder.py @@ -93,12 +93,14 @@ def to_bloom_matrix(self) -> np.ndarray: The matrix has a row for each row in the EDF. The number of columns is equal to `self.embedder.bf_size + self.embedder.offset`. + Each row in the matrix is a Bloom filter expressed as a binary vector, with + the ones corresponding to hashed features. This representation is used in the `Embedder.compare()` method. Returns ------- X: np.ndarray - Binary array of size `(len(self), self.bf_size + 1)`. + Binary array of size `(len(self), self.embedder.bf_size + self.embedder.offset)`. """ assert self.embedder_checksum == self.embedder.checksum, "Checksum mismatch" @@ -137,10 +139,12 @@ def _calculate_norm(self, bf_indices: list[int]) -> float: return np.sqrt(np.sum(self.embedder.scm_matrix[np.ix_(bf_indices, bf_indices)])) def update_norms(self) -> "EmbeddedDataFrame": - """Generate vector norms (wrt. `self.embedder`) for each row. + """Generate vector norms for each row. - The vector norm is used to scale the (Soft) Cosine similarity - scores. + Create or update the `bf_norms` column in the EDF. 
This method calculates, + for each Bloom filter, its Euclidean norm when the filter is expressed as a + binary vector, and saves it to the EDF. The norm is used to scale the + (Soft) Cosine similarity scores. Attributes ---------- @@ -163,8 +167,7 @@ class SimilarityArray(np.ndarray): Original array of similarity score data. thresholds: tuple, optional 2-tuple of similarity score thresholds for each axis. These - thresholds can be used as an outside option when generating a - matching. + thresholds are used when generating a matching. embedder_checksum: str, optional Hexadecimal string digest of a `pprl.embedder.Embedder` object. @@ -203,9 +206,9 @@ def match( Given an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the - `SimilarityArray` has thresholds, masking is used to ensure they - are respected. An `abs_cutoff` (global minimum similarity score) - can also be supplied. + `SimilarityArray` has thresholds, masking is used to ensure that prospective + matches whose similarity score is below the thresholds are not returned. + An `abs_cutoff` (global minimum similarity score) can also be supplied. Parameters ---------- @@ -350,9 +353,14 @@ def __init__( self.checksum = self._compute_checksum() def _initmatrix(self) -> np.ndarray: + """Initialise matrices as identity matrices of dimension `bf_size` + `offset`.""" return np.eye((self.bf_size + self.offset), dtype=np.float32) def _compute_checksum(self) -> str: + """Compute a checksum on important attributes of the Embedder instance. + + To check for functional equality of two instances + """ res = hashlib.md5() # bytes from feature_factory @@ -378,6 +386,10 @@ def embed( ) -> EmbeddedDataFrame: """Encode data columns into features from Bloom embedding. + Given a pandas DataFrame and a column specification, convert columns into + string features, and then embed the features into Bloom filters. The method + returns an instance of `EmbeddedDataFrame`, which is an augmented pandas DataFrame. + Parameters ---------- df : pd.DataFrame @@ -494,18 +506,29 @@ def compare( def _joint_freq_matrix( self, - x: list[list] | pd.Series, - y: list[list] | pd.Series, + bf_indices1: list[list] | pd.Series, + bf_indices2: list[list] | pd.Series, prob: bool = False, ) -> np.ndarray: - assert len(x) == len(y), "x and y lengths must match" - N = len(x) + """Calculate the symmetrised joint frequency matrix on the Bloom filters. + + Given two EDFs' bf_indices, returns a square matrix of size `self.bf_size` where + each entry (i,j) is the frequency of observing a feature hashed into slot `i` in + one dataset, and a feature hashed into slot `j` in the other dataset, at the same + row number. The frequency matrix is then symmetrised because the order of the two + datasets doesn't matter. `prob`, if True, converts frequencies to probabilities by dividing + by N, not usually needed because we're using a logged ratio of two matrices so the + division by N cancels out. 
+ """ + assert len(bf_indices1) == len(bf_indices2), "x and y lengths must match" + N = len(bf_indices1) bfsize = self.bf_size + self.offset coordinates = ([], []) - # Loop through the cross-product of every index in x[n] and every index in y[n] - # for n in 1:len(x) - for i, j in it.chain.from_iterable(map(it.product, x, y)): + # Loop through the cross-product of every index in bf_indices1[n] + # and every index in bf_indices2[n] + # for n in 1:len(bf_indices1) + for i, j in it.chain.from_iterable(map(it.product, bf_indices1, bf_indices2)): coordinates[0].append(i) coordinates[1].append(j) @@ -534,8 +557,8 @@ def train( with its constituent matrices, `freq_matr_matched` and `freq_matr_unmatched`. - Provide two datasets of pre-matched data. If `update=True`, the - training is cumulative, so that `train()` can be called more + Provide two datasets of pre-matched data, with matching records aligned. + If `update=True`, the training is cumulative, so that `train()` can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training. diff --git a/src/pprl/embedder/features.py b/src/pprl/embedder/features.py index d123df2..1192ff8 100644 --- a/src/pprl/embedder/features.py +++ b/src/pprl/embedder/features.py @@ -11,18 +11,16 @@ def split_string_underscore(string: str) -> list[str]: """Split and underwrap a string at typical punctuation marks. Currently, we split at any combination of spaces, dashes, dots, - commas, or underscores. For example: + commas, or underscores. - ``` + Examples + -------- >>> strings = ("dave william johnson", "Francesca__Hogan-O'Malley") >>> for string in strings: ... print(split_string_underscore(string)) - ... ["_dave_", "_william_", "_johnson_"] ["_Francesca_", "_Hogan_", "_O'Malley_"] - ``` - Parameters ---------- string: str @@ -71,16 +69,15 @@ def gen_skip_grams(split_tokens: list) -> Generator[str, None, None]: """Generate skip 2-grams from a set of tokens. This function is a generator that contains a series of skip 2-grams. - For example: - ``` + Examples + -------- >>> string = "dave james" >>> tokens = split_string_underscore(string) >>> skips = list(gen_skip_grams(tokens)) >>> print(skips) ["_a", "dv", "ae", "v_", "_a", "jm", "ae", "ms", "e_"] - ``` Parameters ---------- diff --git a/src/pprl/encryption.py b/src/pprl/encryption.py index d45c3f9..aaa6eb5 100644 --- a/src/pprl/encryption.py +++ b/src/pprl/encryption.py @@ -171,7 +171,7 @@ def decrypt_dek( """ Decrypt a data encryption key using an asymmetric key held on KMS. - Owing to the nature of the encryption key set-up of `pprl` this + Owing to the nature of the encryption key set-up of pprl this function is only really to be used in the GCP Confidential Space set up by the linkage administrator.
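
Finally, the thresholded matching that the `SimilarityArray.match` docstring above describes can be pictured with a small standalone sketch. This is an illustration of the general idea only (per-record thresholds acting like reserve prices, plus a global `abs_cutoff`, on top of a Hungarian assignment), using SciPy's `linear_sum_assignment` and made-up toy scores rather than the toolkit's own adapted implementation:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Toy similarity scores: rows are records from one dataset, columns from the other.
scores = np.array([
    [0.92, 0.10, 0.05],
    [0.15, 0.40, 0.30],
    [0.08, 0.20, 0.88],
])

# Per-record thresholds (the "reserve prices") and a global minimum score.
row_thresholds = np.array([0.50, 0.50, 0.50])
col_thresholds = np.array([0.50, 0.45, 0.50])
abs_cutoff = 0.30

# A pair is only a candidate if it clears both records' thresholds and the cutoff.
valid = (
    (scores >= row_thresholds[:, None])
    & (scores >= col_thresholds[None, :])
    & (scores >= abs_cutoff)
)

# Run the Hungarian algorithm on a masked copy, then drop assignments that only
# exist because the solver is forced to pair every row with some column.
masked = np.where(valid, scores, -1.0)
rows, cols = linear_sum_assignment(masked, maximize=True)
matches = [
    (int(i), int(j), float(scores[i, j]))
    for i, j in zip(rows, cols)
    if valid[i, j]
]

print(matches)  # rows 0 and 2 find partners; row 1 stays unmatched
```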