From de06c8932d2018c75bb47ceb7a77ce66c758f20f Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner
Date: Mon, 13 May 2024 15:51:34 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                               |  2 +-
 docs/reference/embedder.html            | 63 ++++++++++++++++++-------
 docs/reference/utils.html               |  2 +-
 docs/tutorials/example-febrl.html       | 24 +++++-----
 docs/tutorials/example-verknupfung.html | 26 +++++-----
 docs/tutorials/index.html               |  8 ++--
 docs/tutorials/run-through.html         | 50 ++++++++++----------
 search.json                             | 16 +++----
 sitemap.xml                             | 32 ++++++-------
 9 files changed, 127 insertions(+), 96 deletions(-)

diff --git a/.nojekyll b/.nojekyll
index cb3b51d..e51f4c4 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-7064a115
\ No newline at end of file
+ce820a19
\ No newline at end of file

diff --git a/docs/reference/embedder.html b/docs/reference/embedder.html
index f0af37b..4d6658c 100644
--- a/docs/reference/embedder.html
+++ b/docs/reference/embedder.html
@@ -417,26 +417,57 @@

Methods

Name                 Description
anonymise            Remove raw data from embedded dataframe.
to_bloom_matrix      Convert Bloom filter indices into a binary matrix.
update_norms         Generate vector norms for each row.
update_thresholds    Generate matching thresholds for each row of the data.
anonymise

embedder.embedder.EmbeddedDataFrame.anonymise(keep=None)

Remove raw data from embedded dataframe.

Remove all columns from the embedded dataframe except the columns listed in keep, plus the bf_indices, bf_norms and thresholds columns.
Returns

Type        Description
list[str]   Columns to be returned as they appear in the data, in addition to bf_indices, bf_norms and thresholds, if they are present in the data.
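As a quick illustration, a minimal usage sketch (the edf variable and its id column are assumed for illustration and are not part of this page):

edf.anonymise(keep=["id"])
# keeps id plus bf_indices, bf_norms and thresholds; all other raw columns are dropped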
to_bloom_matrix

embedder.embedder.EmbeddedDataFrame.to_bloom_matrix()

Convert Bloom filter indices into a binary matrix.

The matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.

Returns

Type             Description
numpy.ndarray    Binary array of size (len(self), self.embedder.bf_size + self.embedder.offset).
diff --git a/docs/reference/utils.html b/docs/reference/utils.html
index 07a7dff..7671f11 100644

diff --git a/docs/tutorials/example-febrl.html b/docs/tutorials/example-febrl.html
index c30a2d5..2c8ba8c 100644
--- a/docs/tutorials/example-febrl.html
+++ b/docs/tutorials/example-febrl.html
@@ -343,7 +343,7 @@

Linking the FEBRL datasets

This tutorial shows how the package can be used locally to match the FEBRL datasets, included as example datasets in the recordlinkage package.

import os
 import time
 from functools import partial
@@ -359,7 +359,7 @@ 

Linking the FEBRL datasets

Load the data

The data we are using comprise 5000 records across two datasets, with no duplicates; each record has a valid match in the other dataset.

After loading the data, we can parse the true matched ID number from the indices.

feb4a, feb4b = load_febrl4()
 
 feb4a["true_id"] = (
@@ -382,7 +382,7 @@ 

Create a feature
  • Pass a dictionary of dictionaries of keyword arguments as an optional ff_args parameter (e.g. ff_args = {"dob": {"dayfirst": False, "yearfirst": True}})
  • Use functools.partial(), as we have below.
    feature_factory = dict(
         name=feat.gen_name_features,
         dob=partial(feat.gen_dateofbirth_features, dayfirst=False, yearfirst=True),
    @@ -396,7 +396,7 @@ 

    Create a feature

    Initialise the embedder instance

    This instance embeds each feature twice into a Bloom filter of length 1024.

    embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
    @@ -418,7 +418,7 @@

    Embed the datasets

    For example, to ensure suburb doesn’t collide with state (if they happened to be the same), gen_misc_features() would encode each of their tokens as suburb<token> and state<token>, respectively. If you want to map different columns into the same feature, such as address below, you can set the label explicitly when passing the function to the embedder.

    colspec = dict(
         given_name="name",
         surname="name",
    @@ -436,7 +436,7 @@ 

    Embed the datasets

    edf2 = embedder.embed(feb4b, colspec=colspec)

    Store the embedded datasets and their embedder to file.

    edf1.to_json("party1_data.json")
     edf2.to_json("party2_data.json")
     embedder.to_pickle("embedder.pkl")
    @@ -445,7 +445,7 @@

    Embed the datasets

    Calculate similarity

    Compute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.

    start = time.time()
     edf1.update_thresholds()
     edf2.update_thresholds()
    @@ -453,22 +453,22 @@ 

print(f"Updating thresholds took {end - start:.2f} seconds")

-Updating thresholds took 8.40 seconds
+Updating thresholds took 8.35 seconds

    Compute the matrix of similarity scores.

    similarity_scores = embedder.compare(edf1,edf2)

    Compute a match

    Use the similarity scores to compute a match, using the Hungarian algorithm. First, we compute the match with the row thresholds.

    matching = similarity_scores.match(require_thresholds=True)

    Using the true IDs, evaluate the precision and recall of the match.

    def get_results(edf1, edf2, matching):
         """Get the results for a given matching."""
     
    @@ -492,7 +492,7 @@ 

    Compute a match

    Then, we compute the match without using the row thresholds, calculating the same performance metrics:

    matching = similarity_scores.match(require_thresholds=False)
     _ = get_results(edf1, edf2, matching)
diff --git a/docs/tutorials/example-verknupfung.html b/docs/tutorials/example-verknupfung.html
index 4e571c0..0606dd3 100644
--- a/docs/tutorials/example-verknupfung.html
+++ b/docs/tutorials/example-verknupfung.html
@@ -341,7 +341,7 @@

    Exploring a simple linkage example

    Loading the data

    First, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. We will use the toolkit to identify these matches.

    import pandas as pd
     
     df1 = pd.DataFrame(
    @@ -381,7 +381,7 @@ 

    Loading the data

    Creating and assigning a feature factory

    The next step is to decide how to process each of the columns in our datasets.

    To do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.

    from pprl.embedder import features
     from functools import partial
     
    @@ -419,7 +419,7 @@ 

Creating and assigning a feature factory

    Embedding the data

    With our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.

    Once we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.

    from pprl.embedder.embedder import Embedder
     
     embedder = Embedder(factory, bf_size=1024, num_hashes=2)
    @@ -428,7 +428,7 @@ 

    Embedding the data

    edf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)

    If we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. Then there are three additional columns: bf_indices, bf_norms and thresholds.

    edf1.columns
    Index(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',
    @@ -439,15 +439,15 @@ 

    Embedding the data

    The bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.

    print(edf1.bf_indices[0])
-[2, 262, 646, 903, 9, 526, 15, 272, 654, 146, 531, 532, 17, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 823, 440, 56, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 972, 590, 465, 593, 211, 468, 82, 851, 338, 600, 84, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]
+[2, 262, 903, 646, 9, 526, 654, 272, 15, 146, 17, 532, 531, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 56, 823, 440, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 590, 972, 465, 82, 211, 468, 84, 338, 851, 600, 593, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]

    The bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.
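Because the untrained norm is just the Euclidean norm of a binary vector, it can be sanity-checked by hand (a sketch assuming the edf1 built above):

import numpy as np
# the number of set bits is the squared Euclidean norm of the binary vector
print(np.sqrt(len(edf1.bf_indices[0])))  # should equal edf1.bf_norms[0]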

    The thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.

    print(edf1.loc[:,["bf_norms","thresholds"]])
     print(edf2.loc[:,["bf_norms","thresholds"]])
    @@ -467,7 +467,7 @@

    Embedding the data

    The processed features

    Let’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.

    First, we’ll look at date of birth:

    print(edf1.date_of_birth_features[0])
     print(edf2.birth_date_features[0])
    @@ -477,7 +477,7 @@

The processed features

    Python can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.

    Then we’ll look at name:

    print(edf1.first_name_features[0] + edf1.last_name_features[0])
     print(edf2.name_features[0])
    @@ -487,7 +487,7 @@

The processed features

    The two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.
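As a rough illustration of what such shingles look like, here is a hypothetical padded n-gram helper (not the package's own implementation):

def ngrams(string, n):
    # pad with underscores so word boundaries appear in the shingles
    padded = f"_{string.lower()}_"
    return [padded[i:i + n] for i in range(len(padded) - n + 1)]

print(ngrams("Tull", 2) + ngrams("Tull", 3))
# ['_t', 'tu', 'ul', 'll', 'l_', '_tu', 'tul', 'ull', 'll_']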

    The sex processing function just converts different formats to lowercase and takes the first letter. This will often be enough:

    print(edf1.gender_features[0])
     print(edf2.sex_features[0])
    @@ -496,7 +496,7 @@

The processed features

    Finally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label="instrument")) processed the data:

    print(edf1.instrument_features[0])
     print(edf2.main_instrument_features[0])
    @@ -509,7 +509,7 @@

The processed features

    Performing the linkage

    We can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.

    similarities = embedder.compare(edf1, edf2)
     similarities
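To see that equivalence concretely, a single entry of the array can be recomputed by hand (a sketch assuming edf1, edf2 and similarities from the cells above):

import numpy as np

a, b = set(edf1.bf_indices[0]), set(edf2.bf_indices[0])
# the dot product of two binary vectors is the size of their intersection
manual = len(a & b) / (np.sqrt(len(a)) * np.sqrt(len(b)))
print(manual)  # should agree with similarities[0, 0] for an untrained embedder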
    @@ -519,7 +519,7 @@

Performing the linkage

    This SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.

    matching = similarities.match()
     matching
    diff --git a/docs/tutorials/index.html b/docs/tutorials/index.html index d14e966..e533e03 100644 --- a/docs/tutorials/index.html +++ b/docs/tutorials/index.html @@ -384,7 +384,7 @@

Tutorials

diff --git a/docs/tutorials/run-through.html b/docs/tutorials/run-through.html
index 756526b..73db0b8 100644
--- a/docs/tutorials/run-through.html
+++ b/docs/tutorials/run-through.html
@@ -346,7 +346,7 @@

    Embedder API run-through

  • the config module, which includes our package configuration (such as the location of data directories)
  • some classes from the main embedder module
    import os
     import numpy as np
     import pandas as pd
    @@ -357,7 +357,7 @@ 

    Embedder API run-through

    Data set-up

    For this demo we’ll create a really minimal pair of datasets. Notice that they don’t have to have the same structure or field names.

    df1 = pd.DataFrame(
         dict(
             id=[1,2,3],
    @@ -381,7 +381,7 @@ 

    Data set-up

    Features are extracted as different kinds of string objects from each field, ready to be hash embedded into the Bloom filters. We need to specify the feature extraction functions we’ll need.

    In this case we’ll need one extractor for names, one for dates of birth, and one for sex/gender records. We create a dict with the functions we need. We create another dict to store any keyword arguments we want to pass in to each function (in this case we use all the default arguments so the keyword argument dictionaries are empty):

    feature_factory = dict(
         name=feat.gen_name_features,
         dob=feat.gen_dateofbirth_features,
    @@ -395,7 +395,7 @@ 

    Data set-up

    Embedding

    Now we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.

    embedder = Embedder(feature_factory,
                         ff_args,
                         bf_size = 2**10,
    @@ -403,7 +403,7 @@ 

    Embedding

    )

    Now we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. Any columns not mapped will not contribute to the embedding.

    edf1 = embedder.embed(
         df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex", county="misc")
     )
    @@ -435,14 +435,14 @@ 

    Embedding

 2  [day<04>, month<10>, year<1995>]  [sex<f>]  [county<county durham>]

                                         all_features  \
-0  [ll, nr, ll_, _t, ull, _tu, _he, he, tu, hen, ...
-1  [all, ll, ro, n_, ow, sa, ly_, bro, month<01>,...
-2  [ina, ey, _in, re, wr, aw, law, la, na_, ey_, ...
+0  [_he, he, _t, ll, tul, ry_, l_, tu, ll_, y_, e...
+1  [_br, wn_, ro, ll, al, ly, row, _b, y_, _sa, o...
+2  [sex<f>, county<county durham>, na_, re, y_, a...

                                           bf_indices  bf_norms
 0  [644, 773, 135, 776, 265, 778, 271, 402, 404, ...  6.244998
 1  [129, 258, 130, 776, 523, 525, 398, 271, 671, ...  7.141428
-2  [647, 394, 269, 13, 15, 532, 667, 155, 413, 28...  7.000000
+2  [647, 394, 269, 13, 15, 532, 667, 28, 413, 155...  7.000000

    personid   full_name date_of_birth sex   county  \
 0         4  Harry Tull      2/1/2001   M  Rutland
 1         5  Sali Brown      2/1/2001   M    Powys
@@ -459,12 +459,12 @@

    Embedding

 2  [day<04>, month<11>, year<1995>]  [sex<f>]  [county<durham>]

                                         all_features  \
-0  [ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count...
-1  [county<powys>, ro, li_, n_, ow, sa, bro, ali,...
-2  [ina, ie, aur, e_, _in, uri, la, na_, county<d...
+0  [_t, ll, tul, ry_, l_, county<rutland>, ar, tu...
+1  [_br, wn_, i_, ro, li_, al, ali, row, _b, wn, ...
+2  [uri, sex<f>, month<11>, na_, ur, ie, a_, au, ...

                                           bf_indices  bf_norms
-0  [640, 130, 644, 135, 776, 10, 778, 271, 402, 5...  6.855655
+0  [640, 130, 644, 135, 776, 778, 10, 271, 402, 5...  6.855655
 1  [130, 523, 525, 398, 271, 152, 671, 803, 806, ...  7.000000
 2  [646, 647, 394, 269, 15, 272, 531, 532, 665, 6...  6.928203
    @@ -478,7 +478,7 @@

    Training

    Computing the similarity scores and the matching

    Now we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.

    First, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.

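The rendered cell is elided by this hunk; going by the EmbeddedDataFrame API documented above, it presumably amounts to something like the following sketch (assumed, not shown in the diff):

# assumed shape of the elided cell: refresh norms and thresholds on both EDFs
edf1.update_norms()
edf1.update_thresholds()
edf2.update_norms()
edf2.update_thresholds()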
    @@ -515,8 +515,8 @@

    similarities = embedder.compare(edf1,edf2)
     
     print(similarities)
    @@ -572,7 +572,7 @@

    matching = similarities.match(abs_cutoff=0.5)
     
     print(matching)
    @@ -585,13 +585,13 @@

    Serialisation and file I/O

    That’s how to do the workflow in one session. However, this demo follows a multi-stage workflow, so we need to be able to pass objects around. There are a couple of methods that enable file I/O and serialisation.

    First, the Embedder object itself needs to be written to file and loaded. The idea is to train it, share it to the data owning parties, and also to the matching server. For this purpose, it’s possible to pickle the entire Embedder object.

    embedder.to_pickle("embedder.pkl")
     
     embedder_copy = Embedder.from_pickle("embedder.pkl")

    The copy has the same functionality as the original:

    similarities = embedder_copy.compare(edf1,edf2)
     
     print(similarities)
    @@ -602,7 +602,7 @@

Serialisation and file I/O

NB: This won’t work if the two datasets were embedded with different Embedder instances, even if those instances are identical. The compare() method checks for the same embedder object memory reference, so it won’t work if one dataset was embedded with the original and the other with the copy. The way to fix this is to re-initialise the EmbeddedDataFrame with the new Embedder object.

    edf2_copy = EmbeddedDataFrame(edf2, embedder_copy)

    In this case, be careful that the Embedder is compatible with the Bloom filter vectors in the EDF (i.e. uses the same parameters and feature factories), because while you can refresh the norms and thresholds, you can’t refresh the ‘bf_indices’ without reembedding the data frame.
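For example, the derived columns can be refreshed against the new embedder like so (a sketch using the methods documented in the reference pages):

edf2_copy.update_norms()       # recompute bf_norms from bf_indices
edf2_copy.update_thresholds()  # recompute the row-wise matching thresholds
# bf_indices itself can only be regenerated by re-running embedder.embed()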

    @@ -610,7 +610,7 @@

Serialisation and file I/O

    Serialising the data

    The EDF objects are just a thin wrapper around pandas.DataFrame instances, so you can serialise to JSON using the normal methods.

    edf1.to_json("edf1.json")
     
     edf1_copy = pd.read_json("edf1.json")
    @@ -624,7 +624,7 @@ 

Serialising the data

    The bf_indices, bf_norms and thresholds columns will be preserved. However, this demotes the data frames back to normal pandas.DataFrame instances and loses the link to an Embedder instance.

    To fix this, just re-initialise them:

    edf1_copy = EmbeddedDataFrame(edf1_copy, embedder_copy)
    diff --git a/search.json b/search.json index ab28c72..d08e7ee 100644 --- a/search.json +++ b/search.json @@ -158,7 +158,7 @@ "href": "docs/reference/embedder.html", "title": "embedder", "section": "", - "text": "embedder.embedder\nClasses and functions for handling embedding objects.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to False.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to True.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. 
This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. 
The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine Similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and both EDFs don’t have a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. False by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled are specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reformed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled bytes string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled string comes back. 
Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn Embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is one.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 0.01.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to dataset jumbler. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. 
Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple of indexes containing row and column indices of matched pairs eg. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. This method does not guarantee no duplicates.\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nnearest_pos_semi_definite\nCalculate nearest positive semi-definite version of a matrix.\n\n\n\n\n\nembedder.embedder.nearest_pos_semi_definite(X, eps=0.0)\nCalculate nearest positive semi-definite version of a matrix.\nThis function achieves this by setting all negative eigenvalues of the matrix to zero, or a small positive value to give a positive definite matrix.\nGraciously taken from this StackOverflow post\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\nnumpy.numpy.ndarray\nMatrix-like array.\nrequired\n\n\neps\nfloat\nUse a small positive constant to give a positive definite matrix. Default is 0 to give a positive semi-definite matrix.\n0.0\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nA positive (semi-)definite matrix.", + "text": "embedder.embedder\nClasses and functions for handling embedding objects.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to False.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to True.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. 
It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nanonymise\nRemove raw data from embedded dataframe.\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.anonymise(keep=None)\nRemove raw data from embedded dataframe.\nRemove all columns from the embedded dataframe expect columns listed in keep and bf_indices, bf_norms and thresholds.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds if they are present in the data.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. 
Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine Similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and both EDFs don’t have a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. 
The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. False by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled are specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reformed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled bytes string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled string comes back. Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn Embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is one.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 0.01.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to dataset jumbler. 
Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple of indexes containing row and column indices of matched pairs eg. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. This method does not guarantee no duplicates.\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nnearest_pos_semi_definite\nCalculate nearest positive semi-definite version of a matrix.\n\n\n\n\n\nembedder.embedder.nearest_pos_semi_definite(X, eps=0.0)\nCalculate nearest positive semi-definite version of a matrix.\nThis function achieves this by setting all negative eigenvalues of the matrix to zero, or a small positive value to give a positive definite matrix.\nGraciously taken from this StackOverflow post\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\nnumpy.numpy.ndarray\nMatrix-like array.\nrequired\n\n\neps\nfloat\nUse a small positive constant to give a positive definite matrix. 
Default is 0 to give a positive semi-definite matrix.\n0.0\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nA positive (semi-)definite matrix.", "crumbs": [ "About", "Docs", @@ -171,7 +171,7 @@ "href": "docs/reference/embedder.html#classes", "title": "embedder", "section": "", - "text": "Name\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to False.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to True.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. 
This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. 
The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine Similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and both EDFs don’t have a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. False by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled are specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reformed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled bytes string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled string comes back. 
Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is 0.01.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 1.0.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to the dataset jumbler. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. 
Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple containing the row and column indices of matched pairs, e.g. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. In that case, the matching is not guaranteed to be free of duplicates.",
+    "text": "Name\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to True.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to False.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method, it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. 
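A rough sketch of that second route, assuming df is an existing pandas DataFrame that already carries a list-valued bf_indices column and embedder is a compatible Embedder instance (both hypothetical names):

```python
# A sketch of wrapping an existing data frame directly, assuming `df`
# already has a list-valued `bf_indices` column and `embedder` is a
# compatible Embedder instance.
from pprl.embedder.embedder import EmbeddedDataFrame

# Recompute the derived columns on creation so that they agree with
# this embedder rather than whichever one produced `bf_indices`
edf = EmbeddedDataFrame(df, embedder, update_norms=True, update_thresholds=True)
```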
If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nanonymise\nRemove raw data from embedded dataframe.\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.anonymise(keep=None)\nRemove raw data from embedded dataframe.\nRemove all columns from the embedded dataframe except columns listed in keep, plus bf_indices, bf_norms and thresholds.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds if they are present in the data.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. 
Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have a thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have a thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and either EDF is missing a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. True by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. 
Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled is specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reconstructed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled byte-string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled byte-string is returned. Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is 0.01.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 1.0.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to the dataset jumbler. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. 
These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple containing the row and column indices of matched pairs, e.g. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. In that case, the matching is not guaranteed to be free of duplicates.",
    "crumbs": [
      "About",
      "Docs",
@@ -223,7 +223,7 @@
    "href": "docs/tutorials/example-verknupfung.html",
    "title": "Exploring a simple linkage example",
    "section": "",
-    "text": "The Python package implements the Bloom filter linkage method (Schnell et al., 2009), and can also implement pretrained Hash embeddings (Miranda et al., 2022), if a suitable large, pre-matched corpus of data is available.\nLet us consider a small example where we want to link two excerpts of data on bands. In this scenario, we are looking at some toy data on the members of a fictional, German rock trio called “Verknüpfung”. In this example we will see how to use untrained Bloom filters to match data.\n\nLoading the data\nFirst, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. 
We will use the toolkit to identify these matches.\n\nimport pandas as pd\n\ndf1 = pd.DataFrame(\n {\n \"first_name\": [\"Laura\", \"Kaspar\", \"Grete\"],\n \"last_name\": [\"Daten\", \"Gorman\", \"Knopf\"],\n \"gender\": [\"F\", \"M\", \"F\"],\n \"date_of_birth\": [\"01/03/1977\", \"31/12/1975\", \"12/7/1981\"],\n \"instrument\": [\"bass\", \"guitar\", \"drums\"],\n }\n)\ndf2 = pd.DataFrame(\n {\n \"name\": [\"Laura Datten\", \"Greta Knopf\", \"Casper Goreman\"],\n \"sex\": [\"female\", \"female\", \"male\"],\n \"main_instrument\": [\"bass guitar\", \"percussion\", \"electric guitar\"],\n \"birth_date\": [\"1977-03-23\", \"1981-07-12\", \"1975-12-31\"],\n }\n)\n\n\n\n\n\n\n\nNote\n\n\n\nThese datasets don’t have the same column names or follow the same encodings, and there are several spelling mistakes in the names of the band members, as well as a typo in the dates.\nThankfully, the PPRL Toolkit is flexible enough to handle this!\n\n\n\n\nCreating and assigning a feature factory\nThe next step is to decide how to process each of the columns in our datasets.\nTo do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.\n\nfrom pprl.embedder import features\nfrom functools import partial\n\nfactory = dict(\n name=features.gen_name_features,\n sex=features.gen_sex_features,\n misc=features.gen_misc_features,\n dob=features.gen_dateofbirth_features,\n instrument=partial(features.gen_misc_shingled_features, label=\"instrument\")\n)\nspec1 = dict(\n first_name=\"name\",\n last_name=\"name\",\n gender=\"sex\",\n instrument=\"instrument\",\n date_of_birth=\"dob\",\n)\nspec2 = dict(name=\"name\", sex=\"sex\", main_instrument=\"instrument\", birth_date=\"dob\")\n\n\n\n\n\n\n\nTip\n\n\n\nThe feature generation functions, features.gen_XXX_features have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. There are two ways to achieve this. Either use functools.partial to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the Embedder as ff_args.\n\n\n\n\nEmbedding the data\nWith our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.\nOnce we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.\n\nfrom pprl.embedder.embedder import Embedder\n\nembedder = Embedder(factory, bf_size=1024, num_hashes=2)\n\nedf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True)\nedf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)\n\nIf we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. 
Then there are three additional columns: bf_indices, bf_norms and thresholds.\n\nedf1.columns\n\nIndex(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',\n 'first_name_features', 'last_name_features', 'gender_features',\n 'instrument_features', 'date_of_birth_features', 'all_features',\n 'bf_indices', 'bf_norms', 'thresholds'],\n dtype='object')\n\n\nThe bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.\n\nprint(edf1.bf_indices[0])\n\n[2, 262, 646, 903, 9, 526, 15, 272, 654, 146, 531, 532, 17, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 823, 440, 56, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 972, 590, 465, 593, 211, 468, 82, 851, 338, 600, 84, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]\n\n\nThe bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.\nThe thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.\n\nprint(edf1.loc[:,[\"bf_norms\",\"thresholds\"]])\nprint(edf2.loc[:,[\"bf_norms\",\"thresholds\"]])\n\n bf_norms thresholds\n0 8.246211 0.114332\n1 9.055386 0.143159\n2 8.485281 0.143159\n bf_norms thresholds\n0 9.695360 0.294345\n1 9.380832 0.157014\n2 10.862781 0.294345\n\n\n\n\n\nThe processed features\nLet’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.\nFirst, we’ll look at date of birth:\n\nprint(edf1.date_of_birth_features[0])\nprint(edf2.birth_date_features[0])\n\n['day<01>', 'month<03>', 'year<1977>']\n['day<23>', 'month<03>', 'year<1977>']\n\n\nPython can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.\nThen we’ll look at name:\n\nprint(edf1.first_name_features[0] + edf1.last_name_features[0])\nprint(edf2.name_features[0])\n\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_', '_d', 'da', 'at', 'te', 'en', 'n_', '_da', 'dat', 'ate', 'ten', 'en_']\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_d', 'da', 'at', 'tt', 'te', 'en', 'n_', '_la', 'lau', 'aur', 'ura', 'ra_', '_da', 'dat', 'att', 'tte', 'ten', 'en_']\n\n\nThe two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.\nThe sex processing function just converts different formats to lowercase and takes the first letter. 
This will often be enough:\n\nprint(edf1.gender_features[0])\nprint(edf2.sex_features[0])\n\n['sex<f>']\n['sex<f>']\n\n\nFinally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label=\"instrument\")) processed the data:\n\nprint(edf1.instrument_features[0])\nprint(edf2.main_instrument_features[0])\n\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>']\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_g>', 'instrument<gu>', 'instrument<ui>', 'instrument<it>', 'instrument<ta>', 'instrument<ar>', 'instrument<r_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>', 'instrument<_gu>', 'instrument<gui>', 'instrument<uit>', 'instrument<ita>', 'instrument<tar>', 'instrument<ar_>']\n\n\nSetting the label argument was important to ensure that the shingles match (and are hashed to the same slots) because the default behaviour of the function is to use the column name as a label: since the two columns have different names, the default wouldn’t have allowed the features to match to each other.\n\n\nPerforming the linkage\nWe can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.\n\nsimilarities = embedder.compare(edf1, edf2)\nsimilarities\n\nSimilarityArray([[0.80050047, 0.10341754, 0.10047246],\n [0.34170424, 0.16480856, 0.63029481],\n [0.12155416, 0.54020787, 0.11933984]])\n\n\nThis SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.\n\nmatching = similarities.match()\nmatching\n\n(array([0, 1, 2]), array([0, 2, 1]))\n\n\nSo, all three of the records in each dataset were matched correctly. Excellent!", + "text": "The Python package implements the Bloom filter linkage method (Schnell et al., 2009), and can also implement pretrained Hash embeddings (Miranda et al., 2022), if a suitable large, pre-matched corpus of data is available.\nLet us consider a small example where we want to link two excerpts of data on bands. In this scenario, we are looking at some toy data on the members of a fictional, German rock trio called “Verknüpfung”. In this example we will see how to use untrained Bloom filters to match data.\n\nLoading the data\nFirst, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. 
We will use the toolkit to identify these matches.\n\nimport pandas as pd\n\ndf1 = pd.DataFrame(\n {\n \"first_name\": [\"Laura\", \"Kaspar\", \"Grete\"],\n \"last_name\": [\"Daten\", \"Gorman\", \"Knopf\"],\n \"gender\": [\"F\", \"M\", \"F\"],\n \"date_of_birth\": [\"01/03/1977\", \"31/12/1975\", \"12/7/1981\"],\n \"instrument\": [\"bass\", \"guitar\", \"drums\"],\n }\n)\ndf2 = pd.DataFrame(\n {\n \"name\": [\"Laura Datten\", \"Greta Knopf\", \"Casper Goreman\"],\n \"sex\": [\"female\", \"female\", \"male\"],\n \"main_instrument\": [\"bass guitar\", \"percussion\", \"electric guitar\"],\n \"birth_date\": [\"1977-03-23\", \"1981-07-12\", \"1975-12-31\"],\n }\n)\n\n\n\n\n\n\n\nNote\n\n\n\nThese datasets don’t have the same column names or follow the same encodings, and there are several spelling mistakes in the names of the band members, as well as a typo in the dates.\nThankfully, the PPRL Toolkit is flexible enough to handle this!\n\n\n\n\nCreating and assigning a feature factory\nThe next step is to decide how to process each of the columns in our datasets.\nTo do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.\n\nfrom pprl.embedder import features\nfrom functools import partial\n\nfactory = dict(\n name=features.gen_name_features,\n sex=features.gen_sex_features,\n misc=features.gen_misc_features,\n dob=features.gen_dateofbirth_features,\n instrument=partial(features.gen_misc_shingled_features, label=\"instrument\")\n)\nspec1 = dict(\n first_name=\"name\",\n last_name=\"name\",\n gender=\"sex\",\n instrument=\"instrument\",\n date_of_birth=\"dob\",\n)\nspec2 = dict(name=\"name\", sex=\"sex\", main_instrument=\"instrument\", birth_date=\"dob\")\n\n\n\n\n\n\n\nTip\n\n\n\nThe feature generation functions, features.gen_XXX_features have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. There are two ways to achieve this. Either use functools.partial to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the Embedder as ff_args.\n\n\n\n\nEmbedding the data\nWith our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.\nOnce we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.\n\nfrom pprl.embedder.embedder import Embedder\n\nembedder = Embedder(factory, bf_size=1024, num_hashes=2)\n\nedf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True)\nedf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)\n\nIf we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. 
Then there are three additional columns: bf_indices, bf_norms and thresholds.\n\nedf1.columns\n\nIndex(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',\n 'first_name_features', 'last_name_features', 'gender_features',\n 'instrument_features', 'date_of_birth_features', 'all_features',\n 'bf_indices', 'bf_norms', 'thresholds'],\n dtype='object')\n\n\nThe bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.\n\nprint(edf1.bf_indices[0])\n\n[2, 262, 903, 646, 9, 526, 654, 272, 15, 146, 17, 532, 531, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 56, 823, 440, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 590, 972, 465, 82, 211, 468, 84, 338, 851, 600, 593, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]\n\n\nThe bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.\nThe thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.\n\nprint(edf1.loc[:,[\"bf_norms\",\"thresholds\"]])\nprint(edf2.loc[:,[\"bf_norms\",\"thresholds\"]])\n\n bf_norms thresholds\n0 8.246211 0.114332\n1 9.055386 0.143159\n2 8.485281 0.143159\n bf_norms thresholds\n0 9.695360 0.294345\n1 9.380832 0.157014\n2 10.862781 0.294345\n\n\n\n\n\nThe processed features\nLet’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.\nFirst, we’ll look at date of birth:\n\nprint(edf1.date_of_birth_features[0])\nprint(edf2.birth_date_features[0])\n\n['day<01>', 'month<03>', 'year<1977>']\n['day<23>', 'month<03>', 'year<1977>']\n\n\nPython can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.\nThen we’ll look at name:\n\nprint(edf1.first_name_features[0] + edf1.last_name_features[0])\nprint(edf2.name_features[0])\n\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_', '_d', 'da', 'at', 'te', 'en', 'n_', '_da', 'dat', 'ate', 'ten', 'en_']\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_d', 'da', 'at', 'tt', 'te', 'en', 'n_', '_la', 'lau', 'aur', 'ura', 'ra_', '_da', 'dat', 'att', 'tte', 'ten', 'en_']\n\n\nThe two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.\nThe sex processing function just converts different formats to lowercase and takes the first letter. 
This will often be enough:\n\nprint(edf1.gender_features[0])\nprint(edf2.sex_features[0])\n\n['sex<f>']\n['sex<f>']\n\n\nFinally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label=\"instrument\")) processed the data:\n\nprint(edf1.instrument_features[0])\nprint(edf2.main_instrument_features[0])\n\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>']\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_g>', 'instrument<gu>', 'instrument<ui>', 'instrument<it>', 'instrument<ta>', 'instrument<ar>', 'instrument<r_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>', 'instrument<_gu>', 'instrument<gui>', 'instrument<uit>', 'instrument<ita>', 'instrument<tar>', 'instrument<ar_>']\n\n\nSetting the label argument was important to ensure that the shingles match (and are hashed to the same slots) because the default behaviour of the function is to use the column name as a label: since the two columns have different names, the default wouldn’t have allowed the features to match to each other.\n\n\nPerforming the linkage\nWe can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.\n\nsimilarities = embedder.compare(edf1, edf2)\nsimilarities\n\nSimilarityArray([[0.80050047, 0.10341754, 0.10047246],\n [0.34170424, 0.16480856, 0.63029481],\n [0.12155416, 0.54020787, 0.11933984]])\n\n\nThis SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.\n\nmatching = similarities.match()\nmatching\n\n(array([0, 1, 2]), array([0, 2, 1]))\n\n\nSo, all three of the records in each dataset were matched correctly. Excellent!", "crumbs": [ "About", "Docs", @@ -366,7 +366,7 @@ "href": "docs/tutorials/run-through.html#embedding", "title": "Embedder API run-through", "section": "Embedding", - "text": "Embedding\nNow we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.\n\nembedder = Embedder(feature_factory,\n ff_args,\n bf_size = 2**10,\n num_hashes=2,\n )\n\nNow we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. 
Any columns not mapped will not contribute to the embedding.\n\nedf1 = embedder.embed(\n df1, colspec=dict(forename=\"name\", surname=\"name\", dob=\"dob\", gender=\"sex\", county=\"misc\")\n)\nedf2 = embedder.embed(\n df2, colspec=dict(full_name=\"name\", date_of_birth=\"dob\", sex=\"sex\", county=\"misc\")\n)\n\nprint(edf1)\nprint(edf2)\n\n id forename surname dob gender county \\\n0 1 Henry Tull male \n1 2 Sally Brown 2/1/2001 Male NaN \n2 3 Ina Lawrey 4/10/1995 Female County Durham \n\n forename_features \\\n0 [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_] \n1 [_s, sa, al, ll, ly, y_, _sa, sal, all, lly, ly_] \n2 [_i, in, na, a_, _in, ina, na_] \n\n surname_features \\\n0 [_t, tu, ul, ll, l_, _tu, tul, ull, ll_] \n1 [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_] \n2 [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr... \n\n dob_features gender_features county_features \\\n0 [] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<10>, year<1995>] [sex<f>] [county<county durham>] \n\n all_features \\\n0 [ll, nr, ll_, _t, ull, _tu, _he, he, tu, hen, ... \n1 [all, ll, ro, n_, ow, sa, ly_, bro, month<01>,... \n2 [ina, ey, _in, re, wr, aw, law, la, na_, ey_, ... \n\n bf_indices bf_norms \n0 [644, 773, 135, 776, 265, 778, 271, 402, 404, ... 6.244998 \n1 [129, 258, 130, 776, 523, 525, 398, 271, 671, ... 7.141428 \n2 [647, 394, 269, 13, 15, 532, 667, 155, 413, 28... 7.000000 \n personid full_name date_of_birth sex county \\\n0 4 Harry Tull 2/1/2001 M Rutland \n1 5 Sali Brown 2/1/2001 M Powys \n2 6 Ina Laurie 4/11/1995 F Durham \n\n full_name_features \\\n0 [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _... \n1 [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _... \n2 [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _... \n\n date_of_birth_features sex_features county_features \\\n0 [day<02>, month<01>, year<2001>] [sex<m>] [county<rutland>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] [county<powys>] \n2 [day<04>, month<11>, year<1995>] [sex<f>] [county<durham>] \n\n all_features \\\n0 [ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count... \n1 [county<powys>, ro, li_, n_, ow, sa, bro, ali,... \n2 [ina, ie, aur, e_, _in, uri, la, na_, county<d... \n\n bf_indices bf_norms \n0 [640, 130, 644, 135, 776, 10, 778, 271, 402, 5... 6.855655 \n1 [130, 523, 525, 398, 271, 152, 671, 803, 806, ... 7.000000 \n2 [646, 647, 394, 269, 15, 272, 531, 532, 665, 6... 6.928203", + "text": "Embedding\nNow we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.\n\nembedder = Embedder(feature_factory,\n ff_args,\n bf_size = 2**10,\n num_hashes=2,\n )\n\nNow we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. 
Any columns not mapped will not contribute to the embedding.\n\nedf1 = embedder.embed(\n df1, colspec=dict(forename=\"name\", surname=\"name\", dob=\"dob\", gender=\"sex\", county=\"misc\")\n)\nedf2 = embedder.embed(\n df2, colspec=dict(full_name=\"name\", date_of_birth=\"dob\", sex=\"sex\", county=\"misc\")\n)\n\nprint(edf1)\nprint(edf2)\n\n id forename surname dob gender county \\\n0 1 Henry Tull male \n1 2 Sally Brown 2/1/2001 Male NaN \n2 3 Ina Lawrey 4/10/1995 Female County Durham \n\n forename_features \\\n0 [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_] \n1 [_s, sa, al, ll, ly, y_, _sa, sal, all, lly, ly_] \n2 [_i, in, na, a_, _in, ina, na_] \n\n surname_features \\\n0 [_t, tu, ul, ll, l_, _tu, tul, ull, ll_] \n1 [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_] \n2 [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr... \n\n dob_features gender_features county_features \\\n0 [] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<10>, year<1995>] [sex<f>] [county<county durham>] \n\n all_features \\\n0 [_he, he, _t, ll, tul, ry_, l_, tu, ll_, y_, e... \n1 [_br, wn_, ro, ll, al, ly, row, _b, y_, _sa, o... \n2 [sex<f>, county<county durham>, na_, re, y_, a... \n\n bf_indices bf_norms \n0 [644, 773, 135, 776, 265, 778, 271, 402, 404, ... 6.244998 \n1 [129, 258, 130, 776, 523, 525, 398, 271, 671, ... 7.141428 \n2 [647, 394, 269, 13, 15, 532, 667, 28, 413, 155... 7.000000 \n personid full_name date_of_birth sex county \\\n0 4 Harry Tull 2/1/2001 M Rutland \n1 5 Sali Brown 2/1/2001 M Powys \n2 6 Ina Laurie 4/11/1995 F Durham \n\n full_name_features \\\n0 [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _... \n1 [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _... \n2 [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _... \n\n date_of_birth_features sex_features county_features \\\n0 [day<02>, month<01>, year<2001>] [sex<m>] [county<rutland>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] [county<powys>] \n2 [day<04>, month<11>, year<1995>] [sex<f>] [county<durham>] \n\n all_features \\\n0 [_t, ll, tul, ry_, l_, county<rutland>, ar, tu... \n1 [_br, wn_, i_, ro, li_, al, ali, row, _b, wn, ... \n2 [uri, sex<f>, month<11>, na_, ur, ie, a_, au, ... \n\n bf_indices bf_norms \n0 [640, 130, 644, 135, 776, 778, 10, 271, 402, 5... 6.855655 \n1 [130, 523, 525, 398, 271, 152, 671, 803, 806, ... 7.000000 \n2 [646, 647, 394, 269, 15, 272, 531, 532, 665, 6... 6.928203", "crumbs": [ "About", "Docs", @@ -392,7 +392,7 @@ "href": "docs/tutorials/run-through.html#computing-the-similarity-scores-and-the-matching", "title": "Embedder API run-through", "section": "Computing the similarity scores and the matching", - "text": "Computing the similarity scores and the matching\nNow we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.\nFirst, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). 
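A short sketch of this step, using the EmbeddedDataFrame methods documented in the embedder reference, and assuming edf1 and edf2 are the embedded data frames from the embedding step above:

```python
# Compute the Bloom filter norms (used to scale the Cosine similarity)
# and the row-wise matching thresholds for both embedded data frames.
edf1.update_norms()
edf2.update_norms()

edf1.update_thresholds()
edf2.update_thresholds()
```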
Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.\n\n\n\n\n\n\n\n\n\n\npersonid\nfull_name\ndate_of_birth\nsex\ncounty\nfull_name_features\ndate_of_birth_features\nsex_features\ncounty_features\nall_features\nbf_indices\nbf_norms\nthresholds\n\n\n\n\n0\n4\nHarry Tull\n2/1/2001\nM\nRutland\n[_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<rutland>]\n[ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count...\n[640, 130, 644, 135, 776, 10, 778, 271, 402, 5...\n6.855655\n0.187541\n\n\n1\n5\nSali Brown\n2/1/2001\nM\nPowys\n[_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<powys>]\n[county<powys>, ro, li_, n_, ow, sa, bro, ali,...\n[130, 523, 525, 398, 271, 152, 671, 803, 806, ...\n7.000000\n0.187541\n\n\n2\n6\nIna Laurie\n4/11/1995\nF\nDurham\n[_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...\n[day<04>, month<11>, year<1995>]\n[sex<f>]\n[county<durham>]\n[ina, ie, aur, e_, _in, uri, la, na_, county<d...\n[646, 647, 394, 269, 15, 272, 531, 532, 665, 6...\n6.928203\n0.082479\n\n\n\n\n\n\n\n\nNB: there’s also a flag to compute these at the same time as the embedding, but it doesn’t by default because, depending on the workflow, you may wish to compute the norms and thresholds at different times (e.g. on the server).\nNow you can compute the similarities:\n\nsimilarities = embedder.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.60728442 0.09150181 0. ]\n [0.2859526 0.78015612 0.08084521]\n [0.08335143 0.10204083 0.57735028]]\n\n\nFinally, you can compute the matching:\n\nmatching = similarities.match(abs_cutoff=0.5)\n\nprint(matching)\n\n(array([0, 1, 2]), array([0, 1, 2]))", + "text": "Computing the similarity scores and the matching\nNow we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.\nFirst, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.\n\n\n\n\n\n\n\n\n\n\npersonid\nfull_name\ndate_of_birth\nsex\ncounty\nfull_name_features\ndate_of_birth_features\nsex_features\ncounty_features\nall_features\nbf_indices\nbf_norms\nthresholds\n\n\n\n\n0\n4\nHarry Tull\n2/1/2001\nM\nRutland\n[_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<rutland>]\n[_t, ll, tul, ry_, l_, county<rutland>, ar, tu...\n[640, 130, 644, 135, 776, 778, 10, 271, 402, 5...\n6.855655\n0.187541\n\n\n1\n5\nSali Brown\n2/1/2001\nM\nPowys\n[_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<powys>]\n[_br, wn_, i_, ro, li_, al, ali, row, _b, wn, ...\n[130, 523, 525, 398, 271, 152, 671, 803, 806, ...\n7.000000\n0.187541\n\n\n2\n6\nIna Laurie\n4/11/1995\nF\nDurham\n[_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...\n[day<04>, month<11>, year<1995>]\n[sex<f>]\n[county<durham>]\n[uri, sex<f>, month<11>, na_, ur, ie, a_, au, ...\n[646, 647, 394, 269, 15, 272, 531, 532, 665, 6...\n6.928203\n0.082479\n\n\n\n\n\n\n\n\nNB: there’s also a flag to compute these at the same time as the embedding, but it doesn’t by default because, depending on the workflow, you may wish to compute the norms and thresholds at different times (e.g. 
on the server).\nNow you can compute the similarities:\n\nsimilarities = embedder.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.60728442 0.09150181 0. ]\n [0.2859526 0.78015612 0.08084521]\n [0.08335143 0.10204083 0.57735028]]\n\n\nFinally, you can compute the matching:\n\nmatching = similarities.match(abs_cutoff=0.5)\n\nprint(matching)\n\n(array([0, 1, 2]), array([0, 1, 2]))", "crumbs": [ "About", "Docs", @@ -496,7 +496,7 @@ "href": "docs/tutorials/example-febrl.html#calculate-similarity", "title": "Linking the FEBRL datasets", "section": "Calculate similarity", - "text": "Calculate similarity\nCompute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.\n\nstart = time.time()\nedf1.update_thresholds()\nedf2.update_thresholds()\nend = time.time()\n\nprint(f\"Updating thresholds took {end - start:.2f} seconds\")\n\nUpdating thresholds took 8.40 seconds\n\n\nCompute the matrix of similarity scores.\n\nsimilarity_scores = embedder.compare(edf1,edf2)", + "text": "Calculate similarity\nCompute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.\n\nstart = time.time()\nedf1.update_thresholds()\nedf2.update_thresholds()\nend = time.time()\n\nprint(f\"Updating thresholds took {end - start:.2f} seconds\")\n\nUpdating thresholds took 8.35 seconds\n\n\nCompute the matrix of similarity scores.\n\nsimilarity_scores = embedder.compare(edf1,edf2)", "crumbs": [ "About", "Docs", @@ -638,7 +638,7 @@ "href": "docs/reference/utils.html", "title": "utils", "section": "", - "text": "app.utils\nUtility functions for the party-side app.\n\n\n\n\n\nName\nDescription\n\n\n\n\nassign_columns\nAssign columns from a form to collections.\n\n\ncheck_is_csv\nDetermine whether a file has the csv extension.\n\n\nconvert_dataframe_to_bf\nConvert a dataframe of features to a bloom filter.\n\n\ndownload_files\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\napp.utils.assign_columns(form, feature_funcs)\nAssign columns from a form to collections.\nAll columns belong to one of three collections: columns to drop, raw columns to keep, or a column feature factory specification.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nform\ndict\nForm from our column chooser page.\nrequired\n\n\nfeature_funcs\ndict\nMapping between column types and feature functions.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of columns to drop.\n\n\nlist[str]\nList of columns to keep in their raw format.\n\n\ndict[str, func]\nMapping between column names and feature functions.\n\n\n\n\n\n\n\napp.utils.check_is_csv(path)\nDetermine whether a file has the csv extension.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nPath to the file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbool\nWhether the file name follows the pattern {name}.csv or not.\n\n\n\n\n\n\n\napp.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')\nConvert a dataframe of features to a bloom filter.\nConvert the columns to features based on the colspec. 
The features are then combined and converted to Bloom filter indices with the Bloom filter norm also calculated.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame of features.\nrequired\n\n\ncolspec\ndict\nDictionary designating columns in the data frame as particular feature types to be processed as appropriate.\nrequired\n\n\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices and bf_norms.\nNone\n\n\nsalt\nstr\nCryptographic salt to add to tokens before hashing.\n''\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.DataFrame\nData frame of bloom-filtered data.\n\n\n\n\n\n\n\napp.utils.download_files(dataframe, embedder, party, archive='archive')\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndataframe\npprl.embedder.embedder.EmbeddedDataFrame\nData frame to be downloaded.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nEmbedder used to embed dataframe.\nrequired\n\n\nparty\nstr\nName of the party.\nrequired\n\n\narchive\nstr\nName of the archive. Default is \"archive\".\n'archive'\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nflask.flask.Response\nResponse containing a ZIP archive with the data frame and its embedder.", + "text": "app.utils\nUtility functions for the party-side app.\n\n\n\n\n\nName\nDescription\n\n\n\n\nassign_columns\nAssign columns from a form to collections.\n\n\ncheck_is_csv\nDetermine whether a file has the csv extension.\n\n\nconvert_dataframe_to_bf\nConvert a dataframe of features to a bloom filter.\n\n\ndownload_files\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\napp.utils.assign_columns(form, feature_funcs)\nAssign columns from a form to collections.\nAll columns belong to one of three collections: columns to drop, raw columns to keep, or a column feature factory specification.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nform\ndict\nForm from our column chooser page.\nrequired\n\n\nfeature_funcs\ndict\nMapping between column types and feature functions.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of columns to drop.\n\n\nlist[str]\nList of columns to keep in their raw format.\n\n\ndict[str, func]\nMapping between column names and feature functions.\n\n\n\n\n\n\n\napp.utils.check_is_csv(path)\nDetermine whether a file has the csv extension.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nPath to the file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbool\nWhether the file name follows the pattern {name}.csv or not.\n\n\n\n\n\n\n\napp.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')\nConvert a dataframe of features to a bloom filter.\nConvert the columns to features based on the colspec. 
The features are then combined and converted to Bloom filter indices with the Bloom filter norm also calculated.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame of features.\nrequired\n\n\ncolspec\ndict\nDictionary designating columns in the data frame as particular feature types to be processed as appropriate.\nrequired\n\n\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds.\nNone\n\n\nsalt\nstr\nCryptographic salt to add to tokens before hashing.\n''\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.DataFrame\nData frame of bloom-filtered data.\n\n\n\n\n\n\n\napp.utils.download_files(dataframe, embedder, party, archive='archive')\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndataframe\npprl.embedder.embedder.EmbeddedDataFrame\nData frame to be downloaded.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nEmbedder used to embed dataframe.\nrequired\n\n\nparty\nstr\nName of the party.\nrequired\n\n\narchive\nstr\nName of the archive. Default is \"archive\".\n'archive'\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nflask.flask.Response\nResponse containing a ZIP archive with the data frame and its embedder.",
     "crumbs": [ "About", "Docs", […] ]
   },
   {
     "href": "docs/reference/utils.html#functions",
     "title": "utils",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nassign_columns\nAssign columns from a form to collections.\n\n\ncheck_is_csv\nDetermine whether a file has the csv extension.\n\n\nconvert_dataframe_to_bf\nConvert a dataframe of features to a bloom filter.\n\n\ndownload_files\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\napp.utils.assign_columns(form, feature_funcs)\nAssign columns from a form to collections.\nAll columns belong to one of three collections: columns to drop, raw columns to keep, or a column feature factory specification.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nform\ndict\nForm from our column chooser page.\nrequired\n\n\nfeature_funcs\ndict\nMapping between column types and feature functions.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of columns to drop.\n\n\nlist[str]\nList of columns to keep in their raw format.\n\n\ndict[str, func]\nMapping between column names and feature functions.\n\n\n\n\n\n\n\napp.utils.check_is_csv(path)\nDetermine whether a file has the csv extension.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nPath to the file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbool\nWhether the file name follows the pattern {name}.csv or not.\n\n\n\n\n\n\n\napp.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')\nConvert a dataframe of features to a bloom filter.\nConvert the columns to features based on the colspec. The features are then combined and converted to Bloom filter indices with the Bloom filter norm also calculated.\n[…]\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices and bf_norms.\nNone\n[…]",
+    "text": "Name\nDescription\n[…]\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds.\nNone\n[…]",
     "crumbs": [ "About", "Docs", […] ]
diff --git a/sitemap.xml b/sitemap.xml
index b322d28..3b9237c 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,66 +2,66 @@
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/index.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/index.html</loc>
-    <lastmod>2024-05-08T14:03:57.385Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.694Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/config.html</loc>
-    <lastmod>2024-05-08T14:03:57.505Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.814Z</lastmod>
   </url>
   <url>
    <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/cloud.html</loc>
-    <lastmod>2024-05-08T14:03:57.537Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.846Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/embedder.html</loc>
-    <lastmod>2024-05-08T14:03:57.453Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.766Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/encryption.html</loc>
-    <lastmod>2024-05-08T14:03:57.501Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.814Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-verknupfung.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/in-the-cloud.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/run-through.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-febrl.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/index.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/local.html</loc>
-    <lastmod>2024-05-08T14:03:57.541Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.850Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/bloom_filters.html</loc>
-    <lastmod>2024-05-08T14:03:57.405Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.714Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/features.html</loc>
-    <lastmod>2024-05-08T14:03:57.485Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.798Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/perform.html</loc>
-    <lastmod>2024-05-08T14:03:57.553Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.862Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/utils.html</loc>
-    <lastmod>2024-05-08T14:03:57.521Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.830Z</lastmod>
   </url>
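The search-index change above records the substantive update in this build: the `other_columns` documentation for `app.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')` now lists `thresholds` alongside `bf_indices` and `bf_norms` in the returned data. A minimal sketch of a call, going only by the signature and parameter table indexed above; the column names and the `"name"`/`"dob"` feature-type labels in `colspec` are illustrative assumptions, not values taken from the package:

```python
import pandas as pd

from app.utils import convert_dataframe_to_bf  # module path as documented above

# Hypothetical records; the column names are made up for illustration.
records = pd.DataFrame(
    {
        "forename": ["Ann", "Ben"],
        "birth_date": ["1970-01-01", "1980-02-02"],
        "record_id": [101, 202],
    }
)

# The colspec designates each column as a particular feature type to be
# processed as appropriate; the "name" and "dob" labels are assumed here.
colspec = {"forename": "name", "birth_date": "dob"}

# Keep the raw record_id alongside the Bloom filter columns, salting the
# tokens before hashing.
embedded = convert_dataframe_to_bf(
    records, colspec, other_columns=["record_id"], salt="example-salt"
)

# Per the updated description, the result should carry bf_indices, bf_norms
# and thresholds in addition to record_id.
print(embedded.columns.tolist())
```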

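The same index text documents the smaller `app.utils.check_is_csv(path)` helper, which reports whether a file name follows the `{name}.csv` pattern. A usage sketch under that documented behaviour only; the file names are hypothetical:

```python
from app.utils import check_is_csv  # helper documented in the index text above

# Hypothetical file names; only the final .csv extension should matter.
for path in ["records.csv", "records.txt", "archive.zip"]:
    print(path, check_is_csv(path))  # expect True for records.csv only
```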