From de06c8932d2018c75bb47ceb7a77ce66c758f20f Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner
Date: Mon, 13 May 2024 15:51:34 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                               |  2 +-
 docs/reference/embedder.html            | 63 ++++++++++++++++++-------
 docs/reference/utils.html               |  2 +-
 docs/tutorials/example-febrl.html       | 24 +++++-----
 docs/tutorials/example-verknupfung.html | 26 +++++-----
 docs/tutorials/index.html               |  8 ++--
 docs/tutorials/run-through.html         | 50 ++++++++++----------
 search.json                             | 16 +++----
 sitemap.xml                             | 32 ++++++-------
 9 files changed, 127 insertions(+), 96 deletions(-)

diff --git a/.nojekyll b/.nojekyll
index cb3b51d..e51f4c4 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-7064a115
\ No newline at end of file
+ce820a19
\ No newline at end of file

diff --git a/docs/reference/embedder.html b/docs/reference/embedder.html
index f0af37b..4d6658c 100644
--- a/docs/reference/embedder.html
+++ b/docs/reference/embedder.html
@@ -417,26 +417,57 @@

Methods

Name                 Description
anonymise            Remove raw data from embedded dataframe.
to_bloom_matrix      Convert Bloom filter indices into a binary matrix.
update_norms         Generate vector norms for each row.
update_thresholds    Generate matching thresholds for each row of the data.
anonymise

embedder.embedder.EmbeddedDataFrame.anonymise(keep=None)

Remove raw data from embedded dataframe.

Remove all columns from the embedded dataframe except the columns listed in keep, plus the bf_indices, bf_norms and thresholds columns.
Returns

Type        Description
list[str]   Columns to be returned as they appear in the data, in addition to bf_indices, bf_norms and thresholds, if they are present in the data.
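As a quick illustration, a minimal usage sketch (the edf variable and its id column are assumed for illustration and are not part of this page):

edf.anonymise(keep=["id"])
# keeps id plus bf_indices, bf_norms and thresholds; all other raw columns are dropped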
to_bloom_matrix

embedder.embedder.EmbeddedDataFrame.to_bloom_matrix()

Convert Bloom filter indices into a binary matrix.

The matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.

Returns

Type             Description
numpy.ndarray    Binary array of size (len(self), self.embedder.bf_size + self.embedder.offset).
diff --git a/docs/reference/utils.html b/docs/reference/utils.html
index 07a7dff..7671f11 100644

diff --git a/docs/tutorials/example-febrl.html b/docs/tutorials/example-febrl.html
index c30a2d5..2c8ba8c 100644
--- a/docs/tutorials/example-febrl.html
+++ b/docs/tutorials/example-febrl.html
@@ -343,7 +343,7 @@

Linking the FEBRL datasets

This tutorial shows how the package can be used locally to match the FEBRL datasets, included as example datasets in the recordlinkage package.

import os
 import time
 from functools import partial
@@ -359,7 +359,7 @@ 

Linking the FEBRL datasets

Load the data

The data we are using comprise 5000 records across two datasets, with no duplicates; each record has a valid match in the other dataset.

After loading the data, we can parse the true matched ID number from the indices.

feb4a, feb4b = load_febrl4()
 
 feb4a["true_id"] = (
@@ -382,7 +382,7 @@ 

Create a feature
  • Pass a dictionary of dictionaries of keyword arguments as an optional ff_args parameter (e.g. ff_args = {"dob": {"dayfirst": False, "yearfirst": True}})
  • Use functools.partial(), as we have below.
    feature_factory = dict(
         name=feat.gen_name_features,
         dob=partial(feat.gen_dateofbirth_features, dayfirst=False, yearfirst=True),
    @@ -396,7 +396,7 @@ 

    Create a feature

    Initialise the embedder instance

    This instance embeds each feature twice into a Bloom filter of length 1024.

    embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
    @@ -418,7 +418,7 @@

    Embed the datasets

    For example, to ensure suburb doesn’t collide with state (if they happened to be the same), gen_misc_features() would encode each of their tokens as suburb<token> and state<token>, respectively. If you want to map different columns into the same feature, such as address below, you can set the label explicitly when passing the function to the embedder.

    colspec = dict(
         given_name="name",
         surname="name",
    @@ -436,7 +436,7 @@ 

    Embed the datasets

    edf2 = embedder.embed(feb4b, colspec=colspec)

    Store the embedded datasets and their embedder to file.

    edf1.to_json("party1_data.json")
     edf2.to_json("party2_data.json")
     embedder.to_pickle("embedder.pkl")
    @@ -445,7 +445,7 @@

    Embed the datasets

    Calculate similarity

    Compute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.

    start = time.time()
     edf1.update_thresholds()
     edf2.update_thresholds()
    @@ -453,22 +453,22 @@ 

print(f"Updating thresholds took {end - start:.2f} seconds")

-Updating thresholds took 8.40 seconds
+Updating thresholds took 8.35 seconds

    Compute the matrix of similarity scores.

    similarity_scores = embedder.compare(edf1,edf2)

    Compute a match

    Use the similarity scores to compute a match, using the Hungarian algorithm. First, we compute the match with the row thresholds.

    matching = similarity_scores.match(require_thresholds=True)

    Using the true IDs, evaluate the precision and recall of the match.

    def get_results(edf1, edf2, matching):
         """Get the results for a given matching."""
     
    @@ -492,7 +492,7 @@ 

    Compute a match

    Then, we compute the match without using the row thresholds, calculating the same performance metrics:

    matching = similarity_scores.match(require_thresholds=False)
     _ = get_results(edf1, edf2, matching)
diff --git a/docs/tutorials/example-verknupfung.html b/docs/tutorials/example-verknupfung.html
index 4e571c0..0606dd3 100644
--- a/docs/tutorials/example-verknupfung.html
+++ b/docs/tutorials/example-verknupfung.html
@@ -341,7 +341,7 @@

    Exploring a simple linkage example

    Loading the data

    First, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. We will use the toolkit to identify these matches.

    import pandas as pd
     
     df1 = pd.DataFrame(
    @@ -381,7 +381,7 @@ 

    Loading the data

    Creating and assigning a feature factory

    The next step is to decide how to process each of the columns in our datasets.

    To do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.

    from pprl.embedder import features
     from functools import partial
     
    @@ -419,7 +419,7 @@ 

Creating and assigning a feature factory

    Embedding the data

    With our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.

    Once we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.

    from pprl.embedder.embedder import Embedder
     
     embedder = Embedder(factory, bf_size=1024, num_hashes=2)
    @@ -428,7 +428,7 @@ 

    Embedding the data

    edf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)

    If we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. Then there are three additional columns: bf_indices, bf_norms and thresholds.

    edf1.columns
    Index(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',
    @@ -439,15 +439,15 @@ 

    Embedding the data

    The bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.

    print(edf1.bf_indices[0])
-[2, 262, 646, 903, 9, 526, 15, 272, 654, 146, 531, 532, 17, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 823, 440, 56, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 972, 590, 465, 593, 211, 468, 82, 851, 338, 600, 84, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]
+[2, 262, 903, 646, 9, 526, 654, 272, 15, 146, 17, 532, 531, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 56, 823, 440, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 590, 972, 465, 82, 211, 468, 84, 338, 851, 600, 593, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]

    The bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.
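Because the untrained norm is just the Euclidean norm of a binary vector, it can be sanity-checked by hand (a sketch assuming the edf1 built above):

import numpy as np
# the number of set bits is the squared Euclidean norm of the binary vector
print(np.sqrt(len(edf1.bf_indices[0])))  # should equal edf1.bf_norms[0]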

    The thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.

    print(edf1.loc[:,["bf_norms","thresholds"]])
     print(edf2.loc[:,["bf_norms","thresholds"]])
    @@ -467,7 +467,7 @@

    Embedding the data

    The processed features

    Let’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.

    First, we’ll look at date of birth:

    print(edf1.date_of_birth_features[0])
     print(edf2.birth_date_features[0])
    @@ -477,7 +477,7 @@

The processed features

    Python can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.

    Then we’ll look at name:

    print(edf1.first_name_features[0] + edf1.last_name_features[0])
     print(edf2.name_features[0])
    @@ -487,7 +487,7 @@

The processed features

    The two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.
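As a rough illustration of what such shingles look like, here is a hypothetical padded n-gram helper (not the package's own implementation):

def ngrams(string, n):
    # pad with underscores so word boundaries appear in the shingles
    padded = f"_{string.lower()}_"
    return [padded[i:i + n] for i in range(len(padded) - n + 1)]

print(ngrams("Tull", 2) + ngrams("Tull", 3))
# ['_t', 'tu', 'ul', 'll', 'l_', '_tu', 'tul', 'ull', 'll_']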

    The sex processing function just converts different formats to lowercase and takes the first letter. This will often be enough:

    print(edf1.gender_features[0])
     print(edf2.sex_features[0])
    @@ -496,7 +496,7 @@

The processed features

    Finally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label="instrument")) processed the data:

    print(edf1.instrument_features[0])
     print(edf2.main_instrument_features[0])
    @@ -509,7 +509,7 @@

The processed features

    Performing the linkage

    We can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.

    similarities = embedder.compare(edf1, edf2)
     similarities
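To see that equivalence concretely, a single entry of the array can be recomputed by hand (a sketch assuming edf1, edf2 and similarities from the cells above):

import numpy as np

a, b = set(edf1.bf_indices[0]), set(edf2.bf_indices[0])
# the dot product of two binary vectors is the size of their intersection
manual = len(a & b) / (np.sqrt(len(a)) * np.sqrt(len(b)))
print(manual)  # should agree with similarities[0, 0] for an untrained embedder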
    @@ -519,7 +519,7 @@

Performing the linkage

    This SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.

    matching = similarities.match()
     matching
    diff --git a/docs/tutorials/index.html b/docs/tutorials/index.html index d14e966..e533e03 100644 --- a/docs/tutorials/index.html +++ b/docs/tutorials/index.html @@ -384,7 +384,7 @@

Tutorials

diff --git a/docs/tutorials/run-through.html b/docs/tutorials/run-through.html
index 756526b..73db0b8 100644
--- a/docs/tutorials/run-through.html
+++ b/docs/tutorials/run-through.html
@@ -346,7 +346,7 @@

    Embedder API run-through

  • the config module, which includes our package configuration (such as the location of data directories)
  • some classes from the main embedder module
    import os
     import numpy as np
     import pandas as pd
    @@ -357,7 +357,7 @@ 

    Embedder API run-through

    Data set-up

    For this demo we’ll create a really minimal pair of datasets. Notice that they don’t have to have the same structure or field names.

    df1 = pd.DataFrame(
         dict(
             id=[1,2,3],
    @@ -381,7 +381,7 @@ 

    Data set-up

    Features are extracted as different kinds of string objects from each field, ready to be hash embedded into the Bloom filters. We need to specify the feature extraction functions we’ll need.

    In this case we’ll need one extractor for names, one for dates of birth, and one for sex/gender records. We create a dict with the functions we need. We create another dict to store any keyword arguments we want to pass in to each function (in this case we use all the default arguments so the keyword argument dictionaries are empty):

    feature_factory = dict(
         name=feat.gen_name_features,
         dob=feat.gen_dateofbirth_features,
    @@ -395,7 +395,7 @@ 

    Data set-up

    Embedding

    Now we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.

    embedder = Embedder(feature_factory,
                         ff_args,
                         bf_size = 2**10,
    @@ -403,7 +403,7 @@ 

    Embedding

    )

    Now we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. Any columns not mapped will not contribute to the embedding.

    edf1 = embedder.embed(
         df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex", county="misc")
     )
    @@ -435,14 +435,14 @@ 

    Embedding

 2  [day<04>, month<10>, year<1995>]  [sex<f>]  [county<county durham>]

                                         all_features  \
-0  [ll, nr, ll_, _t, ull, _tu, _he, he, tu, hen, ...
-1  [all, ll, ro, n_, ow, sa, ly_, bro, month<01>,...
-2  [ina, ey, _in, re, wr, aw, law, la, na_, ey_, ...
+0  [_he, he, _t, ll, tul, ry_, l_, tu, ll_, y_, e...
+1  [_br, wn_, ro, ll, al, ly, row, _b, y_, _sa, o...
+2  [sex<f>, county<county durham>, na_, re, y_, a...

                                           bf_indices  bf_norms
 0  [644, 773, 135, 776, 265, 778, 271, 402, 404, ...  6.244998
 1  [129, 258, 130, 776, 523, 525, 398, 271, 671, ...  7.141428
-2  [647, 394, 269, 13, 15, 532, 667, 155, 413, 28...  7.000000
+2  [647, 394, 269, 13, 15, 532, 667, 28, 413, 155...  7.000000

    personid   full_name date_of_birth sex   county  \
 0         4  Harry Tull      2/1/2001   M  Rutland
 1         5  Sali Brown      2/1/2001   M    Powys
@@ -459,12 +459,12 @@

    Embedding

 2  [day<04>, month<11>, year<1995>]  [sex<f>]  [county<durham>]

                                         all_features  \
-0  [ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count...
-1  [county<powys>, ro, li_, n_, ow, sa, bro, ali,...
-2  [ina, ie, aur, e_, _in, uri, la, na_, county<d...
+0  [_t, ll, tul, ry_, l_, county<rutland>, ar, tu...
+1  [_br, wn_, i_, ro, li_, al, ali, row, _b, wn, ...
+2  [uri, sex<f>, month<11>, na_, ur, ie, a_, au, ...

                                           bf_indices  bf_norms
-0  [640, 130, 644, 135, 776, 10, 778, 271, 402, 5...  6.855655
+0  [640, 130, 644, 135, 776, 778, 10, 271, 402, 5...  6.855655
 1  [130, 523, 525, 398, 271, 152, 671, 803, 806, ...  7.000000
 2  [646, 647, 394, 269, 15, 272, 531, 532, 665, 6...  6.928203
    @@ -478,7 +478,7 @@

    Training

    Computing the similarity scores and the matching

    Now we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.

    First, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.

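The rendered cell is elided by this hunk; going by the EmbeddedDataFrame API documented above, it presumably amounts to something like the following sketch (assumed, not shown in the diff):

# assumed shape of the elided cell: refresh norms and thresholds on both EDFs
edf1.update_norms()
edf1.update_thresholds()
edf2.update_norms()
edf2.update_thresholds()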
    @@ -515,8 +515,8 @@

    similarities = embedder.compare(edf1,edf2)
     
     print(similarities)
    @@ -572,7 +572,7 @@

    matching = similarities.match(abs_cutoff=0.5)
     
     print(matching)
    @@ -585,13 +585,13 @@

    Serialisation and file I/O

    That’s how to do the workflow in one session. However, this demo follows a multi-stage workflow, so we need to be able to pass objects around. There are a couple of methods that enable file I/O and serialisation.

    First, the Embedder object itself needs to be written to file and loaded. The idea is to train it, share it to the data owning parties, and also to the matching server. For this purpose, it’s possible to pickle the entire Embedder object.

    embedder.to_pickle("embedder.pkl")
     
     embedder_copy = Embedder.from_pickle("embedder.pkl")

    The copy has the same functionality as the original:

    similarities = embedder_copy.compare(edf1,edf2)
     
     print(similarities)
    @@ -602,7 +602,7 @@

Serialisation and file I/O

NB: This won’t work if the two datasets were embedded with different Embedder instances, even if those instances are identical. The compare() method checks for the same embedder object memory reference, so it won’t work if one dataset was embedded with the original and the other with the copy. The way to fix this is to re-initialise the EmbeddedDataFrame with the new Embedder object.

    edf2_copy = EmbeddedDataFrame(edf2, embedder_copy)

    In this case, be careful that the Embedder is compatible with the Bloom filter vectors in the EDF (i.e. uses the same parameters and feature factories), because while you can refresh the norms and thresholds, you can’t refresh the ‘bf_indices’ without reembedding the data frame.
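For example, the derived columns can be refreshed against the new embedder like so (a sketch using the methods documented in the reference pages):

edf2_copy.update_norms()       # recompute bf_norms from bf_indices
edf2_copy.update_thresholds()  # recompute the row-wise matching thresholds
# bf_indices itself can only be regenerated by re-running embedder.embed()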

    @@ -610,7 +610,7 @@

Serialisation and file I/O

    Serialising the data

    The EDF objects are just a thin wrapper around pandas.DataFrame instances, so you can serialise to JSON using the normal methods.

    edf1.to_json("edf1.json")
     
     edf1_copy = pd.read_json("edf1.json")
    @@ -624,7 +624,7 @@ 

Serialising the data

    The bf_indices, bf_norms and thresholds columns will be preserved. However, this demotes the data frames back to normal pandas.DataFrame instances and loses the link to an Embedder instance.

    To fix this, just re-initialise them:

    edf1_copy = EmbeddedDataFrame(edf1_copy, embedder_copy)
    diff --git a/search.json b/search.json index ab28c72..d08e7ee 100644 --- a/search.json +++ b/search.json @@ -158,7 +158,7 @@ "href": "docs/reference/embedder.html", "title": "embedder", "section": "", - "text": "embedder.embedder\nClasses and functions for handling embedding objects.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to False.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to True.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. 
This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. 
The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine Similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and both EDFs don’t have a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. False by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled are specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reformed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled bytes string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled string comes back. 
Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn Embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is one.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 0.01.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to dataset jumbler. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. 
Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple of indexes containing row and column indices of matched pairs eg. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. This method does not guarantee no duplicates.\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nnearest_pos_semi_definite\nCalculate nearest positive semi-definite version of a matrix.\n\n\n\n\n\nembedder.embedder.nearest_pos_semi_definite(X, eps=0.0)\nCalculate nearest positive semi-definite version of a matrix.\nThis function achieves this by setting all negative eigenvalues of the matrix to zero, or a small positive value to give a positive definite matrix.\nGraciously taken from this StackOverflow post\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\nnumpy.numpy.ndarray\nMatrix-like array.\nrequired\n\n\neps\nfloat\nUse a small positive constant to give a positive definite matrix. Default is 0 to give a positive semi-definite matrix.\n0.0\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nA positive (semi-)definite matrix.", + "text": "embedder.embedder\nClasses and functions for handling embedding objects.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to False.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to True.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. 
It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nanonymise\nRemove raw data from embedded dataframe.\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.anonymise(keep=None)\nRemove raw data from embedded dataframe.\nRemove all columns from the embedded dataframe expect columns listed in keep and bf_indices, bf_norms and thresholds.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds if they are present in the data.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. 
Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine Similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and both EDFs don’t have a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. 
The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. False by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled are specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reformed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled bytes string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled string comes back. Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn Embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is one.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 0.01.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to dataset jumbler. 
Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple of indexes containing row and column indices of matched pairs eg. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. This method does not guarantee no duplicates.\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nnearest_pos_semi_definite\nCalculate nearest positive semi-definite version of a matrix.\n\n\n\n\n\nembedder.embedder.nearest_pos_semi_definite(X, eps=0.0)\nCalculate nearest positive semi-definite version of a matrix.\nThis function achieves this by setting all negative eigenvalues of the matrix to zero, or a small positive value to give a positive definite matrix.\nGraciously taken from this StackOverflow post\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nX\nnumpy.numpy.ndarray\nMatrix-like array.\nrequired\n\n\neps\nfloat\nUse a small positive constant to give a positive definite matrix. 
Default is 0 to give a positive semi-definite matrix.\n0.0\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nA positive (semi-)definite matrix.", "crumbs": [ "About", "Docs", @@ -171,7 +171,7 @@ "href": "docs/reference/embedder.html#classes", "title": "embedder", "section": "", - "text": "Name\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to False.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to True.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. 
This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. 
The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine Similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and both EDFs don’t have a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. False by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled are specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reformed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled bytes string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled string comes back. 
Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is 0.01.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 1.0.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to the dataset jumbler. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. 
Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple containing the row and column indices of matched pairs, e.g. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. In that case, the matching is not guaranteed to be free of duplicates.",
+    "text": "Name\nDescription\n\n\n\n\nEmbeddedDataFrame\nA data frame with a reference to an Embedder object.\n\n\nEmbedder\nClass for embedding a dataset.\n\n\nSimilarityArray\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame(self, data, embedder, update_norms=True, update_thresholds=False, *args, **kwargs)\nA data frame with a reference to an Embedder object.\nAn EmbeddedDataFrame (EDF) instance wraps together a pandas.DataFrame with a reference to a pprl.embedder.Embedder object. An EDF also has a mandatory bf_indices column, describing the Bloom filter indices used for linkage.\nThe EDF instance can also calculate bf_norms and thresholds columns which are used in the Embedder.compare() method to compute pprl.embedder.SimilarityArray instances.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndata\nnumpy.numpy.ndarray | typing.Iterable | dict | pandas.pandas.DataFrame\nData to which to attach the embedder. Must include a bf_indices column with list data type.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nA compatible embedder object for the Bloom filter columns in data.\nrequired\n\n\nupdate_norms\nbool\nWhether to update the Bloom filter norms on creation. Defaults to True.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to update the similarity thresholds on creation. Defaults to False.\nFalse\n\n\n*args\n\nAdditional positional arguments to pass to pandas.DataFrame along with data.\n()\n\n\n**kwargs\n\nAdditional keyword arguments to pass to pandas.DataFrame along with data.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nembedder_checksum\nstr\nHexadecimal string digest from self.embedder.\n\n\n\n\n\n\nAn EDF instance is usually created from an existing Embedder object by calling the embedder.embed() method. It can also be initialised using an embedder and a pandas.DataFrame that already has a bf_indices column via EmbeddedDataFrame(df, embedder).\nIf using the second method, it is up to the user to ensure that the Embedder instance is compatible with the bf_indices column (as well as bf_norms and thresholds, if present) in the data frame. 
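A rough sketch of that second route, assuming df is an existing pandas DataFrame that already carries a list-valued bf_indices column and embedder is a compatible Embedder instance (both hypothetical names):

```python
# A sketch of wrapping an existing data frame directly, assuming `df`
# already has a list-valued `bf_indices` column and `embedder` is a
# compatible Embedder instance.
from pprl.embedder.embedder import EmbeddedDataFrame

# Recompute the derived columns on creation so that they agree with
# this embedder rather than whichever one produced `bf_indices`
edf = EmbeddedDataFrame(df, embedder, update_norms=True, update_thresholds=True)
```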
If in doubt, call edf.update_norms() and edf.update_thresholds() to refresh them.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nanonymise\nRemove raw data from embedded dataframe.\n\n\nto_bloom_matrix\nConvert Bloom filter indices into a binary matrix.\n\n\nupdate_norms\nGenerate vector norms for each row.\n\n\nupdate_thresholds\nGenerate matching thresholds for each row of the data.\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.anonymise(keep=None)\nRemove raw data from embedded dataframe.\nRemove all columns from the embedded dataframe except columns listed in keep, plus bf_indices, bf_norms and thresholds.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds if they are present in the data.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.to_bloom_matrix()\nConvert Bloom filter indices into a binary matrix.\nThe matrix has a row for each row in the EDF. The number of columns is equal to self.embedder.bf_size + self.embedder.offset. Each row in the matrix is a Bloom filter expressed as a binary vector, with the ones corresponding to hashed features. This representation is used in the Embedder.compare() method.\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nnumpy.numpy.ndarray\nBinary array of size (len(self), self.embedder.bf_size + self.embedder.offset).\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_norms()\nGenerate vector norms for each row.\nCreate or update the bf_norms column in the EDF. This method calculates, for each Bloom filter, its Euclidean norm when the filter is expressed as a binary vector, and saves it to the EDF. The norm is used to scale the (Soft) Cosine similarity scores.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.bf_norms\nlist\nColumn of vector norms for each row in the EDF.\n\n\n\n\n\n\n\nembedder.embedder.EmbeddedDataFrame.update_thresholds()\nGenerate matching thresholds for each row of the data.\nThe threshold is the minimum similarity score that will be matched. It is found by getting the pairwise similarities between each row and the other rows in the same EDF, and taking the maximum of these.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndata.thresholds\nnumpy.numpy.ndarray\nColumn for maximum similarity of each row within the EDF.\n\n\n\n\n\n\n\n\n\nembedder.embedder.Embedder(self, feature_factory, ff_args=None, bf_size=1024, num_hashes=2, offset=0, salt=None)\nClass for embedding a dataset.\nEach instance of the Embedder class represents an embedding space on personal data features. An Embedder instance is defined by three things:\n\nA set of Bloom filter parameters\nA set of feature factory functions\nAn embedding matrix that corresponds to the above\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfeature_factory\ndict\nMapping from dataset columns to feature generation functions.\nrequired\n\n\nff_args\ndict[str, dict] | None\nMapping from dataset columns to keyword arguments for their respective feature generation functions.\nNone\n\n\nbf_size\nint\nSize of the Bloom filter. Default is 1024.\n1024\n\n\nnum_hashes\nint\nNumber of hashes to perform. Default is two.\n2\n\n\noffset\nint\nOffset for Bloom filter to enable masking. Default is zero.\n0\n\n\nsalt\nstr | None\nCryptographic salt added to tokens from the data before hashing.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix. 
Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_matched\nnumpy.numpy.ndarray\nMatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nfreq_matr_unmatched\nnumpy.numpy.ndarray\nUnmatched frequency matrix for computing scm_matrix. Initialised as an identity matrix of size bf_size + offset.\n\n\nchecksum\nstr\nHexadecimal string digest of the feature factory, SCM matrix, and other embedding parameters. Used to check an embedder is compatible with an EmbeddedDataFrame.\n\n\n\n\n\n\nWhen an instance is initialised in code, the embedding matrix is initialised as an identity matrix; the matrix can then be trained using a pair of datasets with known match status and the trained Embedder instance pickled to file. The pre-trained Embedder instance can then be reinitialised from the pickle file.\nBoth the untrained and trained instances provide embed() and compare() methods. Comparing datasets using an untrained Embedder instance is equivalent to calculating Cosine similarities on ordinary Bloom filters. Comparing datasets using a pre-trained Embedder calculates the Soft Cosine Measure between Bloom filters. The Soft Cosine Measure embedding matrix is trained using an experimental method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompare\nCalculate a SimilarityArray on two EDFs.\n\n\nembed\nEncode data columns into features from Bloom embedding.\n\n\nfrom_pickle\nInitialise Embedder instance from pickle file.\n\n\nto_pickle\nSave Embedder instance to pickle file.\n\n\ntrain\nFit Soft Cosine Measure matrix to two matched datasets.\n\n\n\n\n\nembedder.embedder.Embedder.compare(edf1, edf2, require_thresholds=True)\nCalculate a SimilarityArray on two EDFs.\nGiven two EDFs, calculate all pairwise Soft Cosine similarities between rows.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with N rows. Must have a thresholds column unless require_thresholds=False.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn EDF instance with M rows. Must have a thresholds column unless require_thresholds=False.\nrequired\n\n\nrequire_thresholds\nbool\nIf True (default), the comparison will fail if thresholds are not present. Must be explicitly set to False to allow comparison without thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.SimilarityArray\nAn N by M array containing the similarity matrix of pairwise Soft Cosine similarities between rows of edf1 and edf2.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf require_thresholds is True and either EDF is missing a thresholds column.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.embed(df, colspec, update_norms=True, update_thresholds=False)\nEncode data columns into features from Bloom embedding.\nGiven a pandas DataFrame and a column specification, convert columns into string features, and then embed the features into Bloom filters. The method returns an instance of EmbeddedDataFrame, which is an augmented pandas DataFrame.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame to be embedded.\nrequired\n\n\ncolspec\ndict\nDictionary mapping columns in df to feature factory functions.\nrequired\n\n\nupdate_norms\nbool\nWhether to calculate vector norms for SCM and add to EDF. True by default.\nTrue\n\n\nupdate_thresholds\nbool\nWhether to calculate similarity thresholds and add to EDF. 
Used as an outside option in matching. False by default.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded data frame with its embedder.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.from_pickle(path=None, pickled=None)\nInitialise Embedder instance from pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path from which to load the pickled embedder.\nNone\n\n\npickled\nbytes\nByte-string containing the pickled embedder.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nValueError\nIf not exactly one of path and pickled is specified.\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npprl.embedder.embedder.Embedder\nThe reconstructed instance of the Embedder class.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.to_pickle(path=None)\nSave Embedder instance to pickle file.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nFile path at which to save the pickled embedder. If not specified, the pickled byte-string is returned.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbytes or None\nIf path is not specified, the pickled byte-string is returned. Otherwise, nothing is returned.\n\n\n\n\n\n\n\nembedder.embedder.Embedder.train(edf1, edf2, update=True, learning_rate=1.0, eps=0.01, random_state=None)\nFit Soft Cosine Measure matrix to two matched datasets.\nThis function updates the scm_matrix attribute in-place along with its constituent matrices, freq_matr_matched and freq_matr_unmatched.\nProvide two datasets of pre-matched data, with matching records aligned. If update=True, the training is cumulative, so that train() can be called more than once, updating the same matrices each time by adding new frequency tables. Otherwise, all three matrices are reinitialised prior to training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nedf1\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset.\nrequired\n\n\nedf2\npprl.embedder.embedder.EmbeddedDataFrame\nAn embedded dataset of known matches in the same order as edf1.\nrequired\n\n\nupdate\nbool\nWhether to update the existing SCM matrix, or overwrite it. Defaults to True.\nTrue\n\n\neps\nfloat\nSmall non-negative constant to avoid -Inf in log of frequencies. Default is 0.01.\n0.01\n\n\nlearning_rate\nfloat\nScaling factor to dampen matrix updates. Must be in the interval (0, 1]. Default is 1.0.\n1.0\n\n\nrandom_state\nNone | numpy.numpy.random.numpy.random.RandomState\nRandom state to pass to the dataset jumbler. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nscm_matrix\nnumpy.numpy.ndarray\nSoft Cosine Measure matrix that is fitted cumulatively or afresh.\n\n\n\n\n\n\n\n\n\nembedder.embedder.SimilarityArray()\nAugmented NumPy array of similarity scores with extra attributes.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ninput_array\n\nOriginal array of similarity score data.\nrequired\n\n\nthresholds\n\n2-tuple of similarity score thresholds for each axis. 
These thresholds are used when generating a matching.\nrequired\n\n\nembedder_checksum\n\nHexadecimal string digest of a pprl.embedder.Embedder object.\nrequired\n\n\n\n\n\n\nSimilarityArray objects are usually initialised from an instance of pprl.embedder.Embedder via the embedder.compare() method.\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nCompute a matching.\n\n\n\n\n\nembedder.embedder.SimilarityArray.match(abs_cutoff=0, rel_cutoff=0, hungarian=True, require_thresholds=True)\nCompute a matching.\nGiven an array of similarity scores, compute a matching of its elements, using the Hungarian algorithm by default. If the SimilarityArray has thresholds, masking is used to ensure that prospective matches whose similarity score is below the thresholds are not returned. An abs_cutoff (global minimum similarity score) can also be supplied.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nabs_cutoff\nint or float\nA lower cutoff for the similarity score. No pairs with similarity below the absolute cutoff will be matched. By default, this is 0.\n0\n\n\nrel_cutoff\nint or float\nA margin above the row/column-specific threshold. Raises all thresholds by a constant. By default, this is 0.\n0\n\n\nhungarian\nbool\nWhether to compute the unique matching using the Hungarian algorithm, filtered using thresholds and abs_cutoff. Default is True. If False, just return all pairs above the threshold.\nTrue\n\n\nrequire_thresholds\nbool\nIf True (default), the matching will fail if thresholds is not present and valid. Must be explicitly set to False to allow matching without similarity thresholds.\nTrue\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\ntuple[list[int], list[int]]\n2-tuple containing the row and column indices of matched pairs, e.g. ([0, 1, ...], [0, 1, ...]).\n\n\n\n\n\n\nIf hungarian=False, the matching returns all pairs with similarity score above the abs_cutoff, respecting thresholds if present. In that case, the matching is not guaranteed to be free of duplicates.",
    "crumbs": [
      "About",
      "Docs",
@@ -223,7 +223,7 @@
    "href": "docs/tutorials/example-verknupfung.html",
    "title": "Exploring a simple linkage example",
    "section": "",
-    "text": "The Python package implements the Bloom filter linkage method (Schnell et al., 2009), and can also implement pretrained Hash embeddings (Miranda et al., 2022), if a suitable large, pre-matched corpus of data is available.\nLet us consider a small example where we want to link two excerpts of data on bands. In this scenario, we are looking at some toy data on the members of a fictional, German rock trio called “Verknüpfung”. In this example we will see how to use untrained Bloom filters to match data.\n\nLoading the data\nFirst, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. 
We will use the toolkit to identify these matches.\n\nimport pandas as pd\n\ndf1 = pd.DataFrame(\n {\n \"first_name\": [\"Laura\", \"Kaspar\", \"Grete\"],\n \"last_name\": [\"Daten\", \"Gorman\", \"Knopf\"],\n \"gender\": [\"F\", \"M\", \"F\"],\n \"date_of_birth\": [\"01/03/1977\", \"31/12/1975\", \"12/7/1981\"],\n \"instrument\": [\"bass\", \"guitar\", \"drums\"],\n }\n)\ndf2 = pd.DataFrame(\n {\n \"name\": [\"Laura Datten\", \"Greta Knopf\", \"Casper Goreman\"],\n \"sex\": [\"female\", \"female\", \"male\"],\n \"main_instrument\": [\"bass guitar\", \"percussion\", \"electric guitar\"],\n \"birth_date\": [\"1977-03-23\", \"1981-07-12\", \"1975-12-31\"],\n }\n)\n\n\n\n\n\n\n\nNote\n\n\n\nThese datasets don’t have the same column names or follow the same encodings, and there are several spelling mistakes in the names of the band members, as well as a typo in the dates.\nThankfully, the PPRL Toolkit is flexible enough to handle this!\n\n\n\n\nCreating and assigning a feature factory\nThe next step is to decide how to process each of the columns in our datasets.\nTo do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.\n\nfrom pprl.embedder import features\nfrom functools import partial\n\nfactory = dict(\n name=features.gen_name_features,\n sex=features.gen_sex_features,\n misc=features.gen_misc_features,\n dob=features.gen_dateofbirth_features,\n instrument=partial(features.gen_misc_shingled_features, label=\"instrument\")\n)\nspec1 = dict(\n first_name=\"name\",\n last_name=\"name\",\n gender=\"sex\",\n instrument=\"instrument\",\n date_of_birth=\"dob\",\n)\nspec2 = dict(name=\"name\", sex=\"sex\", main_instrument=\"instrument\", birth_date=\"dob\")\n\n\n\n\n\n\n\nTip\n\n\n\nThe feature generation functions, features.gen_XXX_features have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. There are two ways to achieve this. Either use functools.partial to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the Embedder as ff_args.\n\n\n\n\nEmbedding the data\nWith our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.\nOnce we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.\n\nfrom pprl.embedder.embedder import Embedder\n\nembedder = Embedder(factory, bf_size=1024, num_hashes=2)\n\nedf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True)\nedf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)\n\nIf we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. 
Then there are three additional columns: bf_indices, bf_norms and thresholds.\n\nedf1.columns\n\nIndex(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',\n 'first_name_features', 'last_name_features', 'gender_features',\n 'instrument_features', 'date_of_birth_features', 'all_features',\n 'bf_indices', 'bf_norms', 'thresholds'],\n dtype='object')\n\n\nThe bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.\n\nprint(edf1.bf_indices[0])\n\n[2, 262, 646, 903, 9, 526, 15, 272, 654, 146, 531, 532, 17, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 823, 440, 56, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 972, 590, 465, 593, 211, 468, 82, 851, 338, 600, 84, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]\n\n\nThe bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.\nThe thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.\n\nprint(edf1.loc[:,[\"bf_norms\",\"thresholds\"]])\nprint(edf2.loc[:,[\"bf_norms\",\"thresholds\"]])\n\n bf_norms thresholds\n0 8.246211 0.114332\n1 9.055386 0.143159\n2 8.485281 0.143159\n bf_norms thresholds\n0 9.695360 0.294345\n1 9.380832 0.157014\n2 10.862781 0.294345\n\n\n\n\n\nThe processed features\nLet’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.\nFirst, we’ll look at date of birth:\n\nprint(edf1.date_of_birth_features[0])\nprint(edf2.birth_date_features[0])\n\n['day<01>', 'month<03>', 'year<1977>']\n['day<23>', 'month<03>', 'year<1977>']\n\n\nPython can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.\nThen we’ll look at name:\n\nprint(edf1.first_name_features[0] + edf1.last_name_features[0])\nprint(edf2.name_features[0])\n\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_', '_d', 'da', 'at', 'te', 'en', 'n_', '_da', 'dat', 'ate', 'ten', 'en_']\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_d', 'da', 'at', 'tt', 'te', 'en', 'n_', '_la', 'lau', 'aur', 'ura', 'ra_', '_da', 'dat', 'att', 'tte', 'ten', 'en_']\n\n\nThe two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.\nThe sex processing function just converts different formats to lowercase and takes the first letter. 
This will often be enough:\n\nprint(edf1.gender_features[0])\nprint(edf2.sex_features[0])\n\n['sex<f>']\n['sex<f>']\n\n\nFinally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label=\"instrument\")) processed the data:\n\nprint(edf1.instrument_features[0])\nprint(edf2.main_instrument_features[0])\n\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>']\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_g>', 'instrument<gu>', 'instrument<ui>', 'instrument<it>', 'instrument<ta>', 'instrument<ar>', 'instrument<r_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>', 'instrument<_gu>', 'instrument<gui>', 'instrument<uit>', 'instrument<ita>', 'instrument<tar>', 'instrument<ar_>']\n\n\nSetting the label argument was important to ensure that the shingles match (and are hashed to the same slots) because the default behaviour of the function is to use the column name as a label: since the two columns have different names, the default wouldn’t have allowed the features to match to each other.\n\n\nPerforming the linkage\nWe can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.\n\nsimilarities = embedder.compare(edf1, edf2)\nsimilarities\n\nSimilarityArray([[0.80050047, 0.10341754, 0.10047246],\n [0.34170424, 0.16480856, 0.63029481],\n [0.12155416, 0.54020787, 0.11933984]])\n\n\nThis SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.\n\nmatching = similarities.match()\nmatching\n\n(array([0, 1, 2]), array([0, 2, 1]))\n\n\nSo, all three of the records in each dataset were matched correctly. Excellent!", + "text": "The Python package implements the Bloom filter linkage method (Schnell et al., 2009), and can also implement pretrained Hash embeddings (Miranda et al., 2022), if a suitable large, pre-matched corpus of data is available.\nLet us consider a small example where we want to link two excerpts of data on bands. In this scenario, we are looking at some toy data on the members of a fictional, German rock trio called “Verknüpfung”. In this example we will see how to use untrained Bloom filters to match data.\n\nLoading the data\nFirst, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. 
We will use the toolkit to identify these matches.\n\nimport pandas as pd\n\ndf1 = pd.DataFrame(\n {\n \"first_name\": [\"Laura\", \"Kaspar\", \"Grete\"],\n \"last_name\": [\"Daten\", \"Gorman\", \"Knopf\"],\n \"gender\": [\"F\", \"M\", \"F\"],\n \"date_of_birth\": [\"01/03/1977\", \"31/12/1975\", \"12/7/1981\"],\n \"instrument\": [\"bass\", \"guitar\", \"drums\"],\n }\n)\ndf2 = pd.DataFrame(\n {\n \"name\": [\"Laura Datten\", \"Greta Knopf\", \"Casper Goreman\"],\n \"sex\": [\"female\", \"female\", \"male\"],\n \"main_instrument\": [\"bass guitar\", \"percussion\", \"electric guitar\"],\n \"birth_date\": [\"1977-03-23\", \"1981-07-12\", \"1975-12-31\"],\n }\n)\n\n\n\n\n\n\n\nNote\n\n\n\nThese datasets don’t have the same column names or follow the same encodings, and there are several spelling mistakes in the names of the band members, as well as a typo in the dates.\nThankfully, the PPRL Toolkit is flexible enough to handle this!\n\n\n\n\nCreating and assigning a feature factory\nThe next step is to decide how to process each of the columns in our datasets.\nTo do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.\n\nfrom pprl.embedder import features\nfrom functools import partial\n\nfactory = dict(\n name=features.gen_name_features,\n sex=features.gen_sex_features,\n misc=features.gen_misc_features,\n dob=features.gen_dateofbirth_features,\n instrument=partial(features.gen_misc_shingled_features, label=\"instrument\")\n)\nspec1 = dict(\n first_name=\"name\",\n last_name=\"name\",\n gender=\"sex\",\n instrument=\"instrument\",\n date_of_birth=\"dob\",\n)\nspec2 = dict(name=\"name\", sex=\"sex\", main_instrument=\"instrument\", birth_date=\"dob\")\n\n\n\n\n\n\n\nTip\n\n\n\nThe feature generation functions, features.gen_XXX_features have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. There are two ways to achieve this. Either use functools.partial to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the Embedder as ff_args.\n\n\n\n\nEmbedding the data\nWith our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.\nOnce we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.\n\nfrom pprl.embedder.embedder import Embedder\n\nembedder = Embedder(factory, bf_size=1024, num_hashes=2)\n\nedf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True)\nedf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)\n\nIf we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. 
Then there are three additional columns: bf_indices, bf_norms and thresholds.\n\nedf1.columns\n\nIndex(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',\n 'first_name_features', 'last_name_features', 'gender_features',\n 'instrument_features', 'date_of_birth_features', 'all_features',\n 'bf_indices', 'bf_norms', 'thresholds'],\n dtype='object')\n\n\nThe bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.\n\nprint(edf1.bf_indices[0])\n\n[2, 262, 903, 646, 9, 526, 654, 272, 15, 146, 17, 532, 531, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 56, 823, 440, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 590, 972, 465, 82, 211, 468, 84, 338, 851, 600, 593, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]\n\n\nThe bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.\nThe thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.\n\nprint(edf1.loc[:,[\"bf_norms\",\"thresholds\"]])\nprint(edf2.loc[:,[\"bf_norms\",\"thresholds\"]])\n\n bf_norms thresholds\n0 8.246211 0.114332\n1 9.055386 0.143159\n2 8.485281 0.143159\n bf_norms thresholds\n0 9.695360 0.294345\n1 9.380832 0.157014\n2 10.862781 0.294345\n\n\n\n\n\nThe processed features\nLet’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.\nFirst, we’ll look at date of birth:\n\nprint(edf1.date_of_birth_features[0])\nprint(edf2.birth_date_features[0])\n\n['day<01>', 'month<03>', 'year<1977>']\n['day<23>', 'month<03>', 'year<1977>']\n\n\nPython can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.\nThen we’ll look at name:\n\nprint(edf1.first_name_features[0] + edf1.last_name_features[0])\nprint(edf2.name_features[0])\n\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_', '_d', 'da', 'at', 'te', 'en', 'n_', '_da', 'dat', 'ate', 'ten', 'en_']\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_d', 'da', 'at', 'tt', 'te', 'en', 'n_', '_la', 'lau', 'aur', 'ura', 'ra_', '_da', 'dat', 'att', 'tte', 'ten', 'en_']\n\n\nThe two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.\nThe sex processing function just converts different formats to lowercase and takes the first letter. 
This will often be enough:\n\nprint(edf1.gender_features[0])\nprint(edf2.sex_features[0])\n\n['sex<f>']\n['sex<f>']\n\n\nFinally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label=\"instrument\")) processed the data:\n\nprint(edf1.instrument_features[0])\nprint(edf2.main_instrument_features[0])\n\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>']\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_g>', 'instrument<gu>', 'instrument<ui>', 'instrument<it>', 'instrument<ta>', 'instrument<ar>', 'instrument<r_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>', 'instrument<_gu>', 'instrument<gui>', 'instrument<uit>', 'instrument<ita>', 'instrument<tar>', 'instrument<ar_>']\n\n\nSetting the label argument was important to ensure that the shingles match (and are hashed to the same slots) because the default behaviour of the function is to use the column name as a label: since the two columns have different names, the default wouldn’t have allowed the features to match to each other.\n\n\nPerforming the linkage\nWe can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.\n\nsimilarities = embedder.compare(edf1, edf2)\nsimilarities\n\nSimilarityArray([[0.80050047, 0.10341754, 0.10047246],\n [0.34170424, 0.16480856, 0.63029481],\n [0.12155416, 0.54020787, 0.11933984]])\n\n\nThis SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.\n\nmatching = similarities.match()\nmatching\n\n(array([0, 1, 2]), array([0, 2, 1]))\n\n\nSo, all three of the records in each dataset were matched correctly. Excellent!", "crumbs": [ "About", "Docs", @@ -366,7 +366,7 @@ "href": "docs/tutorials/run-through.html#embedding", "title": "Embedder API run-through", "section": "Embedding", - "text": "Embedding\nNow we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.\n\nembedder = Embedder(feature_factory,\n ff_args,\n bf_size = 2**10,\n num_hashes=2,\n )\n\nNow we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. 
Any columns not mapped will not contribute to the embedding.\n\nedf1 = embedder.embed(\n df1, colspec=dict(forename=\"name\", surname=\"name\", dob=\"dob\", gender=\"sex\", county=\"misc\")\n)\nedf2 = embedder.embed(\n df2, colspec=dict(full_name=\"name\", date_of_birth=\"dob\", sex=\"sex\", county=\"misc\")\n)\n\nprint(edf1)\nprint(edf2)\n\n id forename surname dob gender county \\\n0 1 Henry Tull male \n1 2 Sally Brown 2/1/2001 Male NaN \n2 3 Ina Lawrey 4/10/1995 Female County Durham \n\n forename_features \\\n0 [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_] \n1 [_s, sa, al, ll, ly, y_, _sa, sal, all, lly, ly_] \n2 [_i, in, na, a_, _in, ina, na_] \n\n surname_features \\\n0 [_t, tu, ul, ll, l_, _tu, tul, ull, ll_] \n1 [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_] \n2 [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr... \n\n dob_features gender_features county_features \\\n0 [] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<10>, year<1995>] [sex<f>] [county<county durham>] \n\n all_features \\\n0 [ll, nr, ll_, _t, ull, _tu, _he, he, tu, hen, ... \n1 [all, ll, ro, n_, ow, sa, ly_, bro, month<01>,... \n2 [ina, ey, _in, re, wr, aw, law, la, na_, ey_, ... \n\n bf_indices bf_norms \n0 [644, 773, 135, 776, 265, 778, 271, 402, 404, ... 6.244998 \n1 [129, 258, 130, 776, 523, 525, 398, 271, 671, ... 7.141428 \n2 [647, 394, 269, 13, 15, 532, 667, 155, 413, 28... 7.000000 \n personid full_name date_of_birth sex county \\\n0 4 Harry Tull 2/1/2001 M Rutland \n1 5 Sali Brown 2/1/2001 M Powys \n2 6 Ina Laurie 4/11/1995 F Durham \n\n full_name_features \\\n0 [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _... \n1 [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _... \n2 [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _... \n\n date_of_birth_features sex_features county_features \\\n0 [day<02>, month<01>, year<2001>] [sex<m>] [county<rutland>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] [county<powys>] \n2 [day<04>, month<11>, year<1995>] [sex<f>] [county<durham>] \n\n all_features \\\n0 [ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count... \n1 [county<powys>, ro, li_, n_, ow, sa, bro, ali,... \n2 [ina, ie, aur, e_, _in, uri, la, na_, county<d... \n\n bf_indices bf_norms \n0 [640, 130, 644, 135, 776, 10, 778, 271, 402, 5... 6.855655 \n1 [130, 523, 525, 398, 271, 152, 671, 803, 806, ... 7.000000 \n2 [646, 647, 394, 269, 15, 272, 531, 532, 665, 6... 6.928203", + "text": "Embedding\nNow we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.\n\nembedder = Embedder(feature_factory,\n ff_args,\n bf_size = 2**10,\n num_hashes=2,\n )\n\nNow we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. 
Any columns not mapped will not contribute to the embedding.\n\nedf1 = embedder.embed(\n df1, colspec=dict(forename=\"name\", surname=\"name\", dob=\"dob\", gender=\"sex\", county=\"misc\")\n)\nedf2 = embedder.embed(\n df2, colspec=dict(full_name=\"name\", date_of_birth=\"dob\", sex=\"sex\", county=\"misc\")\n)\n\nprint(edf1)\nprint(edf2)\n\n id forename surname dob gender county \\\n0 1 Henry Tull male \n1 2 Sally Brown 2/1/2001 Male NaN \n2 3 Ina Lawrey 4/10/1995 Female County Durham \n\n forename_features \\\n0 [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_] \n1 [_s, sa, al, ll, ly, y_, _sa, sal, all, lly, ly_] \n2 [_i, in, na, a_, _in, ina, na_] \n\n surname_features \\\n0 [_t, tu, ul, ll, l_, _tu, tul, ull, ll_] \n1 [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_] \n2 [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr... \n\n dob_features gender_features county_features \\\n0 [] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<10>, year<1995>] [sex<f>] [county<county durham>] \n\n all_features \\\n0 [_he, he, _t, ll, tul, ry_, l_, tu, ll_, y_, e... \n1 [_br, wn_, ro, ll, al, ly, row, _b, y_, _sa, o... \n2 [sex<f>, county<county durham>, na_, re, y_, a... \n\n bf_indices bf_norms \n0 [644, 773, 135, 776, 265, 778, 271, 402, 404, ... 6.244998 \n1 [129, 258, 130, 776, 523, 525, 398, 271, 671, ... 7.141428 \n2 [647, 394, 269, 13, 15, 532, 667, 28, 413, 155... 7.000000 \n personid full_name date_of_birth sex county \\\n0 4 Harry Tull 2/1/2001 M Rutland \n1 5 Sali Brown 2/1/2001 M Powys \n2 6 Ina Laurie 4/11/1995 F Durham \n\n full_name_features \\\n0 [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _... \n1 [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _... \n2 [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _... \n\n date_of_birth_features sex_features county_features \\\n0 [day<02>, month<01>, year<2001>] [sex<m>] [county<rutland>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] [county<powys>] \n2 [day<04>, month<11>, year<1995>] [sex<f>] [county<durham>] \n\n all_features \\\n0 [_t, ll, tul, ry_, l_, county<rutland>, ar, tu... \n1 [_br, wn_, i_, ro, li_, al, ali, row, _b, wn, ... \n2 [uri, sex<f>, month<11>, na_, ur, ie, a_, au, ... \n\n bf_indices bf_norms \n0 [640, 130, 644, 135, 776, 778, 10, 271, 402, 5... 6.855655 \n1 [130, 523, 525, 398, 271, 152, 671, 803, 806, ... 7.000000 \n2 [646, 647, 394, 269, 15, 272, 531, 532, 665, 6... 6.928203", "crumbs": [ "About", "Docs", @@ -392,7 +392,7 @@ "href": "docs/tutorials/run-through.html#computing-the-similarity-scores-and-the-matching", "title": "Embedder API run-through", "section": "Computing the similarity scores and the matching", - "text": "Computing the similarity scores and the matching\nNow we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.\nFirst, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). 
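A short sketch of this step, using the EmbeddedDataFrame methods documented in the embedder reference, and assuming edf1 and edf2 are the embedded data frames from the embedding step above:

```python
# Compute the Bloom filter norms (used to scale the Cosine similarity)
# and the row-wise matching thresholds for both embedded data frames.
edf1.update_norms()
edf2.update_norms()

edf1.update_thresholds()
edf2.update_thresholds()
```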
Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.\n\n\n\n\n\n\n\n\n\n\npersonid\nfull_name\ndate_of_birth\nsex\ncounty\nfull_name_features\ndate_of_birth_features\nsex_features\ncounty_features\nall_features\nbf_indices\nbf_norms\nthresholds\n\n\n\n\n0\n4\nHarry Tull\n2/1/2001\nM\nRutland\n[_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<rutland>]\n[ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count...\n[640, 130, 644, 135, 776, 10, 778, 271, 402, 5...\n6.855655\n0.187541\n\n\n1\n5\nSali Brown\n2/1/2001\nM\nPowys\n[_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<powys>]\n[county<powys>, ro, li_, n_, ow, sa, bro, ali,...\n[130, 523, 525, 398, 271, 152, 671, 803, 806, ...\n7.000000\n0.187541\n\n\n2\n6\nIna Laurie\n4/11/1995\nF\nDurham\n[_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...\n[day<04>, month<11>, year<1995>]\n[sex<f>]\n[county<durham>]\n[ina, ie, aur, e_, _in, uri, la, na_, county<d...\n[646, 647, 394, 269, 15, 272, 531, 532, 665, 6...\n6.928203\n0.082479\n\n\n\n\n\n\n\n\nNB: there’s also a flag to compute these at the same time as the embedding, but it doesn’t by default because, depending on the workflow, you may wish to compute the norms and thresholds at different times (e.g. on the server).\nNow you can compute the similarities:\n\nsimilarities = embedder.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.60728442 0.09150181 0. ]\n [0.2859526 0.78015612 0.08084521]\n [0.08335143 0.10204083 0.57735028]]\n\n\nFinally, you can compute the matching:\n\nmatching = similarities.match(abs_cutoff=0.5)\n\nprint(matching)\n\n(array([0, 1, 2]), array([0, 1, 2]))", + "text": "Computing the similarity scores and the matching\nNow we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.\nFirst, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.\n\n\n\n\n\n\n\n\n\n\npersonid\nfull_name\ndate_of_birth\nsex\ncounty\nfull_name_features\ndate_of_birth_features\nsex_features\ncounty_features\nall_features\nbf_indices\nbf_norms\nthresholds\n\n\n\n\n0\n4\nHarry Tull\n2/1/2001\nM\nRutland\n[_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<rutland>]\n[_t, ll, tul, ry_, l_, county<rutland>, ar, tu...\n[640, 130, 644, 135, 776, 778, 10, 271, 402, 5...\n6.855655\n0.187541\n\n\n1\n5\nSali Brown\n2/1/2001\nM\nPowys\n[_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<powys>]\n[_br, wn_, i_, ro, li_, al, ali, row, _b, wn, ...\n[130, 523, 525, 398, 271, 152, 671, 803, 806, ...\n7.000000\n0.187541\n\n\n2\n6\nIna Laurie\n4/11/1995\nF\nDurham\n[_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...\n[day<04>, month<11>, year<1995>]\n[sex<f>]\n[county<durham>]\n[uri, sex<f>, month<11>, na_, ur, ie, a_, au, ...\n[646, 647, 394, 269, 15, 272, 531, 532, 665, 6...\n6.928203\n0.082479\n\n\n\n\n\n\n\n\nNB: there’s also a flag to compute these at the same time as the embedding, but it doesn’t by default because, depending on the workflow, you may wish to compute the norms and thresholds at different times (e.g. 
on the server).\nNow you can compute the similarities:\n\nsimilarities = embedder.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.60728442 0.09150181 0. ]\n [0.2859526 0.78015612 0.08084521]\n [0.08335143 0.10204083 0.57735028]]\n\n\nFinally, you can compute the matching:\n\nmatching = similarities.match(abs_cutoff=0.5)\n\nprint(matching)\n\n(array([0, 1, 2]), array([0, 1, 2]))", "crumbs": [ "About", "Docs", @@ -496,7 +496,7 @@ "href": "docs/tutorials/example-febrl.html#calculate-similarity", "title": "Linking the FEBRL datasets", "section": "Calculate similarity", - "text": "Calculate similarity\nCompute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.\n\nstart = time.time()\nedf1.update_thresholds()\nedf2.update_thresholds()\nend = time.time()\n\nprint(f\"Updating thresholds took {end - start:.2f} seconds\")\n\nUpdating thresholds took 8.40 seconds\n\n\nCompute the matrix of similarity scores.\n\nsimilarity_scores = embedder.compare(edf1,edf2)", + "text": "Calculate similarity\nCompute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.\n\nstart = time.time()\nedf1.update_thresholds()\nedf2.update_thresholds()\nend = time.time()\n\nprint(f\"Updating thresholds took {end - start:.2f} seconds\")\n\nUpdating thresholds took 8.35 seconds\n\n\nCompute the matrix of similarity scores.\n\nsimilarity_scores = embedder.compare(edf1,edf2)", "crumbs": [ "About", "Docs", @@ -638,7 +638,7 @@ "href": "docs/reference/utils.html", "title": "utils", "section": "", - "text": "app.utils\nUtility functions for the party-side app.\n\n\n\n\n\nName\nDescription\n\n\n\n\nassign_columns\nAssign columns from a form to collections.\n\n\ncheck_is_csv\nDetermine whether a file has the csv extension.\n\n\nconvert_dataframe_to_bf\nConvert a dataframe of features to a bloom filter.\n\n\ndownload_files\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\napp.utils.assign_columns(form, feature_funcs)\nAssign columns from a form to collections.\nAll columns belong to one of three collections: columns to drop, raw columns to keep, or a column feature factory specification.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nform\ndict\nForm from our column chooser page.\nrequired\n\n\nfeature_funcs\ndict\nMapping between column types and feature functions.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of columns to drop.\n\n\nlist[str]\nList of columns to keep in their raw format.\n\n\ndict[str, func]\nMapping between column names and feature functions.\n\n\n\n\n\n\n\napp.utils.check_is_csv(path)\nDetermine whether a file has the csv extension.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nPath to the file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbool\nWhether the file name follows the pattern {name}.csv or not.\n\n\n\n\n\n\n\napp.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')\nConvert a dataframe of features to a bloom filter.\nConvert the columns to features based on the colspec. 
The features are then combined and converted to Bloom filter indices with the Bloom filter norm also calculated.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame of features.\nrequired\n\n\ncolspec\ndict\nDictionary designating columns in the data frame as particular feature types to be processed as appropriate.\nrequired\n\n\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices and bf_norms.\nNone\n\n\nsalt\nstr\nCryptographic salt to add to tokens before hashing.\n''\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.DataFrame\nData frame of bloom-filtered data.\n\n\n\n\n\n\n\napp.utils.download_files(dataframe, embedder, party, archive='archive')\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndataframe\npprl.embedder.embedder.EmbeddedDataFrame\nData frame to be downloaded.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nEmbedder used to embed dataframe.\nrequired\n\n\nparty\nstr\nName of the party.\nrequired\n\n\narchive\nstr\nName of the archive. Default is \"archive\".\n'archive'\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nflask.flask.Response\nResponse containing a ZIP archive with the data frame and its embedder.", + "text": "app.utils\nUtility functions for the party-side app.\n\n\n\n\n\nName\nDescription\n\n\n\n\nassign_columns\nAssign columns from a form to collections.\n\n\ncheck_is_csv\nDetermine whether a file has the csv extension.\n\n\nconvert_dataframe_to_bf\nConvert a dataframe of features to a bloom filter.\n\n\ndownload_files\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\napp.utils.assign_columns(form, feature_funcs)\nAssign columns from a form to collections.\nAll columns belong to one of three collections: columns to drop, raw columns to keep, or a column feature factory specification.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nform\ndict\nForm from our column chooser page.\nrequired\n\n\nfeature_funcs\ndict\nMapping between column types and feature functions.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of columns to drop.\n\n\nlist[str]\nList of columns to keep in their raw format.\n\n\ndict[str, func]\nMapping between column names and feature functions.\n\n\n\n\n\n\n\napp.utils.check_is_csv(path)\nDetermine whether a file has the csv extension.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nPath to the file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbool\nWhether the file name follows the pattern {name}.csv or not.\n\n\n\n\n\n\n\napp.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')\nConvert a dataframe of features to a bloom filter.\nConvert the columns to features based on the colspec. 
The features are then combined and converted to Bloom filter indices with the Bloom filter norm also calculated.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndf\npandas.pandas.DataFrame\nData frame of features.\nrequired\n\n\ncolspec\ndict\nDictionary designating columns in the data frame as particular feature types to be processed as appropriate.\nrequired\n\n\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds.\nNone\n\n\nsalt\nstr\nCryptographic salt to add to tokens before hashing.\n''\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.DataFrame\nData frame of bloom-filtered data.\n\n\n\n\n\n\n\napp.utils.download_files(dataframe, embedder, party, archive='archive')\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndataframe\npprl.embedder.embedder.EmbeddedDataFrame\nData frame to be downloaded.\nrequired\n\n\nembedder\npprl.embedder.embedder.Embedder\nEmbedder used to embed dataframe.\nrequired\n\n\nparty\nstr\nName of the party.\nrequired\n\n\narchive\nstr\nName of the archive. Default is \"archive\".\n'archive'\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nflask.flask.Response\nResponse containing a ZIP archive with the data frame and its embedder.",
     "crumbs": [ "About", "Docs", […] ]
   },
   {
     "href": "docs/reference/utils.html#functions",
     "title": "utils",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nassign_columns\nAssign columns from a form to collections.\n\n\ncheck_is_csv\nDetermine whether a file has the csv extension.\n\n\nconvert_dataframe_to_bf\nConvert a dataframe of features to a bloom filter.\n\n\ndownload_files\nSerialize, compress, and send a data frame with its embedder.\n\n\n\n\n\napp.utils.assign_columns(form, feature_funcs)\nAssign columns from a form to collections.\nAll columns belong to one of three collections: columns to drop, raw columns to keep, or a column feature factory specification.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nform\ndict\nForm from our column chooser page.\nrequired\n\n\nfeature_funcs\ndict\nMapping between column types and feature functions.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of columns to drop.\n\n\nlist[str]\nList of columns to keep in their raw format.\n\n\ndict[str, func]\nMapping between column names and feature functions.\n\n\n\n\n\n\n\napp.utils.check_is_csv(path)\nDetermine whether a file has the csv extension.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nstr\nPath to the file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nbool\nWhether the file name follows the pattern {name}.csv or not.\n\n\n\n\n\n\n\napp.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')\nConvert a dataframe of features to a bloom filter.\nConvert the columns to features based on the colspec. The features are then combined and converted to Bloom filter indices with the Bloom filter norm also calculated.\n[…]\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices and bf_norms.\nNone\n[…]",
+    "text": "Name\nDescription\n[…]\nother_columns\nNone | list\nColumns to be returned as they appear in the data in addition to bf_indices, bf_norms and thresholds.\nNone\n[…]",
     "crumbs": [ "About", "Docs", […] ]
diff --git a/sitemap.xml b/sitemap.xml
index b322d28..3b9237c 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,66 +2,66 @@
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/index.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/index.html</loc>
-    <lastmod>2024-05-08T14:03:57.385Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.694Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/config.html</loc>
-    <lastmod>2024-05-08T14:03:57.505Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.814Z</lastmod>
   </url>
   <url>
    <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/cloud.html</loc>
-    <lastmod>2024-05-08T14:03:57.537Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.846Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/embedder.html</loc>
-    <lastmod>2024-05-08T14:03:57.453Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.766Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/encryption.html</loc>
-    <lastmod>2024-05-08T14:03:57.501Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.814Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-verknupfung.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/in-the-cloud.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/run-through.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-febrl.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/index.html</loc>
-    <lastmod>2024-05-08T14:03:10.597Z</lastmod>
+    <lastmod>2024-05-13T15:50:05.709Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/local.html</loc>
-    <lastmod>2024-05-08T14:03:57.541Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.850Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/bloom_filters.html</loc>
-    <lastmod>2024-05-08T14:03:57.405Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.714Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/features.html</loc>
-    <lastmod>2024-05-08T14:03:57.485Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.798Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/perform.html</loc>
-    <lastmod>2024-05-08T14:03:57.553Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.862Z</lastmod>
   </url>
   <url>
     <loc>https://datasciencecampus.github.io/pprl_toolkit/docs/reference/utils.html</loc>
-    <lastmod>2024-05-08T14:03:57.521Z</lastmod>
+    <lastmod>2024-05-13T15:50:58.830Z</lastmod>
   </url>
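The search-index change above records the substantive update in this build: the `other_columns` documentation for `app.utils.convert_dataframe_to_bf(df, colspec, other_columns=None, salt='')` now lists `thresholds` alongside `bf_indices` and `bf_norms` in the returned data. A minimal sketch of a call, going only by the signature and parameter table indexed above; the column names and the `"name"`/`"dob"` feature-type labels in `colspec` are illustrative assumptions, not values taken from the package:

```python
import pandas as pd

from app.utils import convert_dataframe_to_bf  # module path as documented above

# Hypothetical records; the column names are made up for illustration.
records = pd.DataFrame(
    {
        "forename": ["Ann", "Ben"],
        "birth_date": ["1970-01-01", "1980-02-02"],
        "record_id": [101, 202],
    }
)

# The colspec designates each column as a particular feature type to be
# processed as appropriate; the "name" and "dob" labels are assumed here.
colspec = {"forename": "name", "birth_date": "dob"}

# Keep the raw record_id alongside the Bloom filter columns, salting the
# tokens before hashing.
embedded = convert_dataframe_to_bf(
    records, colspec, other_columns=["record_id"], salt="example-salt"
)

# Per the updated description, the result should carry bf_indices, bf_norms
# and thresholds in addition to record_id.
print(embedded.columns.tolist())
```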

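The same index text documents the smaller `app.utils.check_is_csv(path)` helper, which reports whether a file name follows the `{name}.csv` pattern. A usage sketch under that documented behaviour only; the file names are hypothetical:

```python
from app.utils import check_is_csv  # helper documented in the index text above

# Hypothetical file names; only the final .csv extension should matter.
for path in ["records.csv", "records.txt", "archive.zip"]:
    print(path, check_is_csv(path))  # expect True for records.csv only
```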