From 8968887d32dc9b60901bafae31c212523db08bcb Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner
Date: Wed, 8 May 2024 14:04:33 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                               |   2 +-
 docs/reference/features.html            |  10 +-
 docs/tutorials/example-febrl.html       |  26 ++--
 docs/tutorials/example-verknupfung.html |  26 ++--
 docs/tutorials/index.html               |   8 +-
 docs/tutorials/run-through.html         | 157 +++++++++++++-----------
 search.json                             |  20 +--
 sitemap.xml                             |  32 ++---
 8 files changed, 146 insertions(+), 135 deletions(-)

diff --git a/.nojekyll b/.nojekyll
index 3f6c135..cb3b51d 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-6b37c638
\ No newline at end of file
+7064a115
\ No newline at end of file

diff --git a/docs/reference/features.html b/docs/reference/features.html
index c108714..ecd8ab1 100644
--- a/docs/reference/features.html
+++ b/docs/reference/features.html
@@ -378,17 +378,17 @@

Functions

gen_dateofbirth_features

-

embedder.features.gen_dateofbirth_features(dob, dayfirst=True, yearfirst=False, default=['day<01>', 'month<01>', 'year<2050>'])

+

embedder.features.gen_dateofbirth_features(dob, dayfirst=True, yearfirst=False, default=[])

Generate labelled date features from a series of dates of birth.

Features take the form ["day<dd>", "month<mm>", "year<YYYY>"]. Note that this feature generator can be used for any sort of date data, not just dates of birth.
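For illustration, a minimal sketch of calling the generator (assuming pprl is installed and imported as in the tutorials; the printed values are indicative only):

import pandas as pd

from pprl.embedder import features

dob = pd.Series(["01/03/1977", "", "1975-12-31"])
feats = features.gen_dateofbirth_features(dob, dayfirst=True)
print(feats[0])  # expected form: ['day<01>', 'month<03>', 'year<1977>']
print(feats[1])  # with default=[], a blank date should yield an empty feature list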

Parameters

+++---
@@ -421,7 +421,7 @@

Parameters

 default
 list[str]
 Default date to fill in missing data in feature (list) form. Default is the feature form of 2050-01-01.
-['day<01>', 'month<01>', 'year<2050>']
+[]
diff --git a/docs/tutorials/example-febrl.html b/docs/tutorials/example-febrl.html
index 0f83fce..c30a2d5 100644
--- a/docs/tutorials/example-febrl.html
+++ b/docs/tutorials/example-febrl.html
@@ -343,7 +343,7 @@

Linking the FEBRL datasets

This tutorial shows how the package can be used locally to match the FEBRL datasets, included as example datasets in the recordlinkage package.

-
+
import os
 import time
 from functools import partial
@@ -359,7 +359,7 @@ 

Linking the FEBRL datasets

Load the data

The data comprise two datasets of 5,000 records each, with no duplicates; every record has a valid match in the other dataset.

After loading the data, we can parse the true matched ID number from the indices.

-
+
feb4a, feb4b = load_febrl4()
 
 feb4a["true_id"] = (
@@ -382,7 +382,7 @@ 

Create a feature
  • Pass a dictionary of dictionaries of keyword arguments as an optional ff_args parameter (e.g. ff_args = {"dob": {"dayfirst": False, "yearfirst": True}})
  • Use functools.partial(), as we have below.
  • -
    +
    feature_factory = dict(
         name=feat.gen_name_features,
         dob=partial(feat.gen_dateofbirth_features, dayfirst=False, yearfirst=True),
    @@ -396,7 +396,7 @@ 

    Create a feature

    Initialise the embedder instance

    This instance embeds each feature twice into a Bloom filter of length 1024.

    -
    +
    embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
    @@ -418,7 +418,7 @@

    Embed the datasets

    For example, to ensure suburb doesn’t collide with state (if they happened to be the same), gen_misc_features() would encode each of their tokens as suburb<token> and state<token>, respectively. If you want to map different columns into the same feature, such as address below, you can set the label explicitly when passing the function to the embedder.

    -
    +
    colspec = dict(
         given_name="name",
         surname="name",
    @@ -436,7 +436,7 @@ 

    Embed the datasets

    edf2 = embedder.embed(feb4b, colspec=colspec)

    Store the embedded datasets and their embedder to file.

    -
    +
    edf1.to_json("party1_data.json")
     edf2.to_json("party2_data.json")
     embedder.to_pickle("embedder.pkl")
    @@ -445,7 +445,7 @@

    Embed the datasets

    Calculate similarity

    Compute the row thresholds to provide a lower bound on matching similarity scores for each row. This operation is the most computationally intensive part of the whole process.

    -
    +
    start = time.time()
     edf1.update_thresholds()
     edf2.update_thresholds()
    @@ -453,22 +453,22 @@ 

Calculate similarity

print(f"Updating thresholds took {end - start:.2f} seconds")

    -
    Updating thresholds took 8.35 seconds
    +
    Updating thresholds took 8.40 seconds

    Compute the matrix of similarity scores.

    -
    +
    similarity_scores = embedder.compare(edf1,edf2)

    Compute a match

    Use the similarity scores to compute a match, using the Hungarian algorithm. First, we compute the match with the row thresholds.

    -
    +
    matching = similarity_scores.match(require_thresholds=True)

    Using the true IDs, evaluate the precision and recall of the match.

    -
    +
    def get_results(edf1, edf2, matching):
         """Get the results for a given matching."""
     
    @@ -488,11 +488,11 @@ 

    Compute a match

    _ = get_results(edf1, edf2, matching)
    -
    True pos: 4973 | False pos: 0 | Precision: 100.0% | Recall: 99.5%
    +
    True pos: 4969 | False pos: 0 | Precision: 100.0% | Recall: 99.4%

    Then, we compute the match without using the row thresholds, calculating the same performance metrics:

    -
    +
    matching = similarity_scores.match(require_thresholds=False)
     _ = get_results(edf1, edf2, matching)
diff --git a/docs/tutorials/example-verknupfung.html b/docs/tutorials/example-verknupfung.html
index 681bff2..4e571c0 100644
--- a/docs/tutorials/example-verknupfung.html
+++ b/docs/tutorials/example-verknupfung.html
@@ -341,7 +341,7 @@

    Exploring a simple linkage example

    Loading the data

    First, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. We will use the toolkit to identify these matches.

    -
    +
    import pandas as pd
     
     df1 = pd.DataFrame(
    @@ -381,7 +381,7 @@ 

    Loading the data

    Creating and assigning a feature factory

    The next step is to decide how to process each of the columns in our datasets.

    To do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.

    -
    +
    from pprl.embedder import features
     from functools import partial
     
    @@ -419,7 +419,7 @@ 

    C

    Embedding the data

    With our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.

    Once we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.

    -
    +
    from pprl.embedder.embedder import Embedder
     
     embedder = Embedder(factory, bf_size=1024, num_hashes=2)
    @@ -428,7 +428,7 @@ 

    Embedding the data

    edf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)

    If we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. Then there are three additional columns: bf_indices, bf_norms and thresholds.

    -
    +
    edf1.columns
    Index(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',
    @@ -439,15 +439,15 @@ 

    Embedding the data

    The bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.

    -
    +
    print(edf1.bf_indices[0])
    -
    [2, 646, 903, 262, 9, 654, 15, 272, 17, 146, 526, 532, 531, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 440, 56, 823, 60, 61, 318, 319, 320, 577, 444, 836, 583, 332, 972, 590, 77, 593, 338, 465, 468, 84, 82, 851, 600, 211, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]
    +
    [2, 262, 646, 903, 9, 526, 15, 272, 654, 146, 531, 532, 17, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 823, 440, 56, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 972, 590, 465, 593, 211, 468, 82, 851, 338, 600, 84, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]

    The bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.
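As a quick check of that relationship, we can compare each stored norm against the number of non-zero indices (a minimal sketch reusing the edf1 built above):

import numpy as np

# With an untrained (identity) SCM matrix, each stored norm should equal the
# Euclidean norm of the binary Bloom vector, i.e. the square root of the
# number of non-zero indices for that record.
for i in range(len(edf1)):
    assert np.isclose(edf1.bf_norms[i], np.sqrt(len(edf1.bf_indices[i])))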

    The thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.

    -
    +
    print(edf1.loc[:,["bf_norms","thresholds"]])
     print(edf2.loc[:,["bf_norms","thresholds"]])
    @@ -467,7 +467,7 @@

    Embedding the data

    The processed features

    Let’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.

    First, we’ll look at date of birth:

    -
    +
    print(edf1.date_of_birth_features[0])
     print(edf2.birth_date_features[0])
    @@ -477,7 +477,7 @@

    The processed featu

    Python can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.
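A rough sketch of the underlying parsing with pandas directly (dayfirst only matters for the ambiguous day/month form; ISO dates parse as written):

import pandas as pd

print(pd.to_datetime("01/03/1977", dayfirst=True))  # 1977-03-01
print(pd.to_datetime("1977-03-23", dayfirst=True))  # 1977-03-23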

    Then we’ll look at name:

    -
    +
    print(edf1.first_name_features[0] + edf1.last_name_features[0])
     print(edf2.name_features[0])
    @@ -487,7 +487,7 @@

    The processed featu

    The two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.
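The exact tokenisation lives inside the package, but a minimal sketch of 2- and 3-gram shingling over a padded, lowercased token (a hypothetical helper, not the package's own function) looks like this:

def ngrams(text, n):
    """Return the n-grams of a padded, lowercased token."""
    padded = f"_{text.lower()}_"
    return [padded[i:i + n] for i in range(len(padded) - n + 1)]

print(ngrams("Laura", 2) + ngrams("Laura", 3))
# ['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_']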

    The sex processing function just converts different formats to lowercase and takes the first letter. This will often be enough:

    -
    +
    print(edf1.gender_features[0])
     print(edf2.sex_features[0])
    @@ -496,7 +496,7 @@

    The processed featu

    Finally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label="instrument")) processed the data:

    -
    +
    print(edf1.instrument_features[0])
     print(edf2.main_instrument_features[0])
    @@ -509,7 +509,7 @@

    The processed featu

    Performing the linkage

We can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which, in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.

    -
    +
    similarities = embedder.compare(edf1, edf2)
     similarities
    @@ -519,7 +519,7 @@

    Performing the link

    This SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.

    -
    +
    matching = similarities.match()
     matching
diff --git a/docs/tutorials/index.html b/docs/tutorials/index.html
index 113b2fa..d14e966 100644
--- a/docs/tutorials/index.html
+++ b/docs/tutorials/index.html
@@ -384,7 +384,7 @@

    Tutorials

-
+
Embedder API run-through
@@ -395,7 +395,7 @@

    Tutorials

5 min
-
+
Exploring a simple linkage example
@@ -406,7 +406,7 @@

    Tutorials

6 min
-
+
Linking the FEBRL datasets
@@ -417,7 +417,7 @@

    Tutorials

4 min
-
+
Working in the cloud

diff --git a/docs/tutorials/run-through.html b/docs/tutorials/run-through.html
index ed46085..756526b 100644
--- a/docs/tutorials/run-through.html
+++ b/docs/tutorials/run-through.html
@@ -346,9 +346,9 @@

    Embedder API run-through

  • the config module, which includes our package configuration (such as the location of data directories)
  • some classes from the main embedder module
  • -
    +
    import os
    -
    +import numpy as np
     import pandas as pd
     
     from pprl import EmbeddedDataFrame, Embedder, config
    @@ -357,42 +357,45 @@ 

    Embedder API run-through

    Data set-up

    For this demo we’ll create a really minimal pair of datasets. Notice that they don’t have to have the same structure or field names.

    -
    +
    df1 = pd.DataFrame(
         dict(
             id=[1,2,3],
             forename=["Henry", "Sally", "Ina"],
             surname = ["Tull", "Brown", "Lawrey"],
    -        dob=["1/1/2001", "2/1/2001", "4/10/1995"],
    +        dob=["", "2/1/2001", "4/10/1995"],
             gender=["male", "Male", "Female"],
    -    )
    -)
    -
    -df2 = pd.DataFrame(
    -    dict(
    -        personid=[4,5,6],
    -        full_name=["Harry Tull", "Sali Brown", "Ina Laurie"],
    -        date_of_birth=["2/1/2001", "2/1/2001", "4/11/1995"],
    -        sex=["M", "M", "F"],
    -    )
    -)
    + county=["", np.NaN, "County Durham"] + ) +) + +df2 = pd.DataFrame( + dict( + personid=[4,5,6], + full_name=["Harry Tull", "Sali Brown", "Ina Laurie"], + date_of_birth=["2/1/2001", "2/1/2001", "4/11/1995"], + sex=["M", "M", "F"], + county=["Rutland", "Powys", "Durham"] + ) +)

    Features are extracted as different kinds of string objects from each field, ready to be hash embedded into the Bloom filters. We need to specify the feature extraction functions we’ll need.

    In this case we’ll need one extractor for names, one for dates of birth, and one for sex/gender records. We create a dict with the functions we need. We create another dict to store any keyword arguments we want to pass in to each function (in this case we use all the default arguments so the keyword argument dictionaries are empty):

    -
    +
    feature_factory = dict(
         name=feat.gen_name_features,
         dob=feat.gen_dateofbirth_features,
         sex=feat.gen_sex_features,
    -)
    -
    -ff_args = dict(name={}, sex={}, dob={})
+    misc=feat.gen_misc_features
+)
+
+ff_args = dict(name={}, sex={}, dob={})

    Embedding

    Now we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.

    -
    +
    embedder = Embedder(feature_factory,
                         ff_args,
                         bf_size = 2**10,
    @@ -400,21 +403,21 @@ 

    Embedding

    )

    Now we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. Any columns not mapped will not contribute to the embedding.

    -
    +
    edf1 = embedder.embed(
    -    df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex")
    +    df1, colspec=dict(forename="name", surname="name", dob="dob", gender="sex", county="misc")
     )
     edf2 = embedder.embed(
    -    df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex")
    +    df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex", county="misc")
     )
     
     print(edf1)
     print(edf2)
    -
       id forename surname        dob  gender  \
    -0   1    Henry    Tull   1/1/2001    male   
    -1   2    Sally   Brown   2/1/2001    Male   
    -2   3      Ina  Lawrey  4/10/1995  Female   
    +
       id forename surname        dob  gender         county  \
    +0   1    Henry    Tull               male                  
    +1   2    Sally   Brown   2/1/2001    Male            NaN   
    +2   3      Ina  Lawrey  4/10/1995  Female  County Durham   
     
                                        forename_features  \
     0  [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_]   
    @@ -426,44 +429,44 @@ 

    Embedding

 1  [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_]
 2  [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr...
-                       dob_features gender_features  \
-0  [day<01>, month<01>, year<2001>]        [sex<m>]
-1  [day<02>, month<01>, year<2001>]        [sex<m>]
-2  [day<04>, month<10>, year<1995>]        [sex<f>]
+                       dob_features gender_features          county_features  \
+0                                []        [sex<m>]
+1  [day<02>, month<01>, year<2001>]        [sex<m>]
+2  [day<04>, month<10>, year<1995>]        [sex<f>]  [county<county durham>]
                                        all_features  \
-0  [ll_, _tu, day<01>, ul, l_, sex<m>, ull, y_, _...
-1  [lly, day<02>, wn_, sex<m>, sal, wn, y_, ly_, ...
-2  [_in, _i, ey_, wr, y_, rey, wre, sex<f>, _l, _...
+0  [ll, nr, ll_, _t, ull, _tu, _he, he, tu, hen, ...
+1  [all, ll, ro, n_, ow, sa, ly_, bro, month<01>,...
+2  [ina, ey, _in, re, wr, aw, law, la, na_, ey_, ...
                                          bf_indices  bf_norms
-0  [130, 644, 773, 903, 135, 776, 778, 265, 654, ...  6.708204
+0  [644, 773, 135, 776, 265, 778, 271, 402, 404, ...  6.244998
 1  [129, 258, 130, 776, 523, 525, 398, 271, 671, ...  7.141428
-2  [647, 394, 269, 13, 15, 532, 155, 28, 667, 413...  6.855655
-   personid   full_name date_of_birth sex  \
-0         4  Harry Tull      2/1/2001   M
-1         5  Sali Brown      2/1/2001   M
-2         6  Ina Laurie     4/11/1995   F
+2  [647, 394, 269, 13, 15, 532, 667, 155, 413, 28...  7.000000
+   personid   full_name date_of_birth sex   county  \
+0         4  Harry Tull      2/1/2001   M  Rutland
+1         5  Sali Brown      2/1/2001   M    Powys
+2         6  Ina Laurie     4/11/1995   F   Durham
                                  full_name_features  \
 0  [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...
 1  [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...
 2  [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...
-            date_of_birth_features sex_features  \
-0  [day<02>, month<01>, year<2001>]     [sex<m>]
-1  [day<02>, month<01>, year<2001>]     [sex<m>]
-2  [day<04>, month<11>, year<1995>]     [sex<f>]
+            date_of_birth_features sex_features    county_features  \
+0  [day<02>, month<01>, year<2001>]     [sex<m>]  [county<rutland>]
+1  [day<02>, month<01>, year<2001>]     [sex<m>]    [county<powys>]
+2  [day<04>, month<11>, year<1995>]     [sex<f>]   [county<durham>]
                                        all_features  \
-0  [ll_, _tu, day<02>, ar, ul, l_, sex<m>, ull, y...
-1  [day<02>, wn_, sex<m>, wn, sal, ow, al, n_, al...
-2  [ri, _in, _i, aur, ie_, ur, sex<f>, _l, au, _l...
+0  [ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count...
+1  [county<powys>, ro, li_, n_, ow, sa, bro, ali,...
+2  [ina, ie, aur, e_, _in, uri, la, na_, county<d...
                                          bf_indices  bf_norms
-0  [640, 130, 644, 135, 776, 10, 778, 271, 402, 5...  6.708204
-1  [130, 523, 525, 398, 271, 152, 671, 803, 806, ...  6.855655
-2  [646, 647, 394, 269, 15, 272, 531, 532, 665, 6...  6.782330
+0  [640, 130, 644, 135, 776, 10, 778, 271, 402, 5...  6.855655
+1  [130, 523, 525, 398, 271, 152, 671, 803, 806, ...  7.000000
+2  [646, 647, 394, 269, 15, 272, 531, 532, 665, 6...  6.928203
    @@ -475,7 +478,7 @@

    Training

    Computing the similarity scores and the matching

Now that we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.

First, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (explained here [link]). Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.
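A minimal sketch of this step, using the update_thresholds() method shown in the FEBRL tutorial (here the bf_norms column was already populated when embed() was called):

# Thresholds need all pairwise self-comparisons, so this is the slow part.
edf1.update_thresholds()
edf2.update_thresholds()

print(edf2.loc[:, ["bf_norms", "thresholds"]])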

    -
    +
    @@ -489,9 +492,11 @@

 full_name
 date_of_birth
 sex
+county
 full_name_features
 date_of_birth_features
 sex_features
+county_features
 all_features
 bf_indices
 bf_norms
@@ -505,13 +510,15 @@

 1
@@ -519,13 +526,15 @@

 2
@@ -533,13 +542,15 @@

    +
    similarities = embedder.compare(edf1,edf2)
     
     print(similarities)
    -
    [[0.6666667  0.17395416 0.        ]
    - [0.29223802 0.79658223 0.08258402]
    - [0.08697708 0.10638298 0.58067873]]
    +
    [[0.60728442 0.09150181 0.        ]
    + [0.2859526  0.78015612 0.08084521]
    + [0.08335143 0.10204083 0.57735028]]

    Finally, you can compute the matching:

    -
    +
    matching = similarities.match(abs_cutoff=0.5)
     
     print(matching)
    @@ -574,24 +585,24 @@

    Serialisation and file I/O

    That’s how to do the workflow in one session. However, this demo follows a multi-stage workflow, so we need to be able to pass objects around. There are a couple of methods that enable file I/O and serialisation.

First, the Embedder object itself needs to be written to file and loaded. The idea is to train it, share it with the data-owning parties, and also with the matching server. For this purpose, it's possible to pickle the entire Embedder object.

    -
    +
    embedder.to_pickle("embedder.pkl")
     
     embedder_copy = Embedder.from_pickle("embedder.pkl")

    The copy has the same functionality as the original:

    -
    +
    similarities = embedder_copy.compare(edf1,edf2)
     
     print(similarities)
    -
    [[0.6666667  0.17395416 0.        ]
    - [0.29223802 0.79658223 0.08258402]
    - [0.08697708 0.10638298 0.58067873]]
    +
    [[0.60728442 0.09150181 0.        ]
    + [0.2859526  0.78015612 0.08084521]
    + [0.08335143 0.10204083 0.57735028]]

    NB: This won’t work if two datasets were embedded with different Embedder instances, even if they’re identical. The compare() method checks for the same embedder object memory reference so it won’t work if one was embedded with the original and the other with the copy. The way to fix this is to re-initialise the EmbeddedDataFrame with the new Embedder object.

    -
    +
    edf2_copy = EmbeddedDataFrame(edf2, embedder_copy)

    In this case, be careful that the Embedder is compatible with the Bloom filter vectors in the EDF (i.e. uses the same parameters and feature factories), because while you can refresh the norms and thresholds, you can’t refresh the ‘bf_indices’ without reembedding the data frame.
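Continuing from edf2_copy above, a hedged sketch of what can and cannot be refreshed (update_thresholds() is the method used in the FEBRL tutorial; re-embedding reuses the colspec from earlier):

# Norms and thresholds can be refreshed against the new Embedder...
edf2_copy.update_thresholds()

# ...but bf_indices can only change by re-embedding the raw data.
edf2_fresh = embedder_copy.embed(
    df2, colspec=dict(full_name="name", date_of_birth="dob", sex="sex", county="misc")
)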

    @@ -599,7 +610,7 @@

    Serialisation an

    Serialising the data

    The EDF objects are just a thin wrapper around pandas.DataFrame instances, so you can serialise to JSON using the normal methods.

    -
    +
    edf1.to_json("edf1.json")
     
     edf1_copy = pd.read_json("edf1.json")
    @@ -613,7 +624,7 @@ 

    Serialising the data<

    The bf_indices, bf_norms and thresholds columns will be preserved. However, this demotes the data frames back to normal pandas.DataFrame instances and loses the link to an Embedder instance.

    To fix this, just re-initialise them:

    -
    +
    edf1_copy = EmbeddedDataFrame(edf1_copy, embedder_copy)
    diff --git a/search.json b/search.json index 8c077a3..ab28c72 100644 --- a/search.json +++ b/search.json @@ -223,7 +223,7 @@ "href": "docs/tutorials/example-verknupfung.html", "title": "Exploring a simple linkage example", "section": "", - "text": "The Python package implements the Bloom filter linkage method (Schnell et al., 2009), and can also implement pretrained Hash embeddings (Miranda et al., 2022), if a suitable large, pre-matched corpus of data is available.\nLet us consider a small example where we want to link two excerpts of data on bands. In this scenario, we are looking at some toy data on the members of a fictional, German rock trio called “Verknüpfung”. In this example we will see how to use untrained Bloom filters to match data.\n\nLoading the data\nFirst, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. We will use the toolkit to identify these matches.\n\nimport pandas as pd\n\ndf1 = pd.DataFrame(\n {\n \"first_name\": [\"Laura\", \"Kaspar\", \"Grete\"],\n \"last_name\": [\"Daten\", \"Gorman\", \"Knopf\"],\n \"gender\": [\"F\", \"M\", \"F\"],\n \"date_of_birth\": [\"01/03/1977\", \"31/12/1975\", \"12/7/1981\"],\n \"instrument\": [\"bass\", \"guitar\", \"drums\"],\n }\n)\ndf2 = pd.DataFrame(\n {\n \"name\": [\"Laura Datten\", \"Greta Knopf\", \"Casper Goreman\"],\n \"sex\": [\"female\", \"female\", \"male\"],\n \"main_instrument\": [\"bass guitar\", \"percussion\", \"electric guitar\"],\n \"birth_date\": [\"1977-03-23\", \"1981-07-12\", \"1975-12-31\"],\n }\n)\n\n\n\n\n\n\n\nNote\n\n\n\nThese datasets don’t have the same column names or follow the same encodings, and there are several spelling mistakes in the names of the band members, as well as a typo in the dates.\nThankfully, the PPRL Toolkit is flexible enough to handle this!\n\n\n\n\nCreating and assigning a feature factory\nThe next step is to decide how to process each of the columns in our datasets.\nTo do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.\n\nfrom pprl.embedder import features\nfrom functools import partial\n\nfactory = dict(\n name=features.gen_name_features,\n sex=features.gen_sex_features,\n misc=features.gen_misc_features,\n dob=features.gen_dateofbirth_features,\n instrument=partial(features.gen_misc_shingled_features, label=\"instrument\")\n)\nspec1 = dict(\n first_name=\"name\",\n last_name=\"name\",\n gender=\"sex\",\n instrument=\"instrument\",\n date_of_birth=\"dob\",\n)\nspec2 = dict(name=\"name\", sex=\"sex\", main_instrument=\"instrument\", birth_date=\"dob\")\n\n\n\n\n\n\n\nTip\n\n\n\nThe feature generation functions, features.gen_XXX_features have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. There are two ways to achieve this. Either use functools.partial to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the Embedder as ff_args.\n\n\n\n\nEmbedding the data\nWith our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. 
By default, these are 1024 and 2, respectively.\nOnce we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.\n\nfrom pprl.embedder.embedder import Embedder\n\nembedder = Embedder(factory, bf_size=1024, num_hashes=2)\n\nedf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True)\nedf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)\n\nIf we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. Then there are three additional columns: bf_indices, bf_norms and thresholds.\n\nedf1.columns\n\nIndex(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',\n 'first_name_features', 'last_name_features', 'gender_features',\n 'instrument_features', 'date_of_birth_features', 'all_features',\n 'bf_indices', 'bf_norms', 'thresholds'],\n dtype='object')\n\n\nThe bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.\n\nprint(edf1.bf_indices[0])\n\n[2, 646, 903, 262, 9, 654, 15, 272, 17, 146, 526, 532, 531, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 440, 56, 823, 60, 61, 318, 319, 320, 577, 444, 836, 583, 332, 972, 590, 77, 593, 338, 465, 468, 84, 82, 851, 600, 211, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]\n\n\nThe bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.\nThe thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.\n\nprint(edf1.loc[:,[\"bf_norms\",\"thresholds\"]])\nprint(edf2.loc[:,[\"bf_norms\",\"thresholds\"]])\n\n bf_norms thresholds\n0 8.246211 0.114332\n1 9.055386 0.143159\n2 8.485281 0.143159\n bf_norms thresholds\n0 9.695360 0.294345\n1 9.380832 0.157014\n2 10.862781 0.294345\n\n\n\n\n\nThe processed features\nLet’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.\nFirst, we’ll look at date of birth:\n\nprint(edf1.date_of_birth_features[0])\nprint(edf2.birth_date_features[0])\n\n['day<01>', 'month<03>', 'year<1977>']\n['day<23>', 'month<03>', 'year<1977>']\n\n\nPython can parse the different formats easily. 
Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.\nThen we’ll look at name:\n\nprint(edf1.first_name_features[0] + edf1.last_name_features[0])\nprint(edf2.name_features[0])\n\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_', '_d', 'da', 'at', 'te', 'en', 'n_', '_da', 'dat', 'ate', 'ten', 'en_']\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_d', 'da', 'at', 'tt', 'te', 'en', 'n_', '_la', 'lau', 'aur', 'ura', 'ra_', '_da', 'dat', 'att', 'tte', 'ten', 'en_']\n\n\nThe two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.\nThe sex processing function just converts different formats to lowercase and takes the first letter. This will often be enough:\n\nprint(edf1.gender_features[0])\nprint(edf2.sex_features[0])\n\n['sex<f>']\n['sex<f>']\n\n\nFinally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label=\"instrument\")) processed the data:\n\nprint(edf1.instrument_features[0])\nprint(edf2.main_instrument_features[0])\n\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>']\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_g>', 'instrument<gu>', 'instrument<ui>', 'instrument<it>', 'instrument<ta>', 'instrument<ar>', 'instrument<r_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>', 'instrument<_gu>', 'instrument<gui>', 'instrument<uit>', 'instrument<ita>', 'instrument<tar>', 'instrument<ar_>']\n\n\nSetting the label argument was important to ensure that the shingles match (and are hashed to the same slots) because the default behaviour of the function is to use the column name as a label: since the two columns have different names, the default wouldn’t have allowed the features to match to each other.\n\n\nPerforming the linkage\nWe can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.\n\nsimilarities = embedder.compare(edf1, edf2)\nsimilarities\n\nSimilarityArray([[0.80050047, 0.10341754, 0.10047246],\n [0.34170424, 0.16480856, 0.63029481],\n [0.12155416, 0.54020787, 0.11933984]])\n\n\nThis SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.\n\nmatching = similarities.match()\nmatching\n\n(array([0, 1, 2]), array([0, 2, 1]))\n\n\nSo, all three of the records in each dataset were matched correctly. Excellent!", + "text": "The Python package implements the Bloom filter linkage method (Schnell et al., 2009), and can also implement pretrained Hash embeddings (Miranda et al., 2022), if a suitable large, pre-matched corpus of data is available.\nLet us consider a small example where we want to link two excerpts of data on bands. In this scenario, we are looking at some toy data on the members of a fictional, German rock trio called “Verknüpfung”. 
In this example we will see how to use untrained Bloom filters to match data.\n\nLoading the data\nFirst, we load our data into pandas.DataFrame objects. Here, the first records align, but the other two records should be swapped to have an aligned matching. We will use the toolkit to identify these matches.\n\nimport pandas as pd\n\ndf1 = pd.DataFrame(\n {\n \"first_name\": [\"Laura\", \"Kaspar\", \"Grete\"],\n \"last_name\": [\"Daten\", \"Gorman\", \"Knopf\"],\n \"gender\": [\"F\", \"M\", \"F\"],\n \"date_of_birth\": [\"01/03/1977\", \"31/12/1975\", \"12/7/1981\"],\n \"instrument\": [\"bass\", \"guitar\", \"drums\"],\n }\n)\ndf2 = pd.DataFrame(\n {\n \"name\": [\"Laura Datten\", \"Greta Knopf\", \"Casper Goreman\"],\n \"sex\": [\"female\", \"female\", \"male\"],\n \"main_instrument\": [\"bass guitar\", \"percussion\", \"electric guitar\"],\n \"birth_date\": [\"1977-03-23\", \"1981-07-12\", \"1975-12-31\"],\n }\n)\n\n\n\n\n\n\n\nNote\n\n\n\nThese datasets don’t have the same column names or follow the same encodings, and there are several spelling mistakes in the names of the band members, as well as a typo in the dates.\nThankfully, the PPRL Toolkit is flexible enough to handle this!\n\n\n\n\nCreating and assigning a feature factory\nThe next step is to decide how to process each of the columns in our datasets.\nTo do this, we define a feature factory that maps column types to feature generation functions, and a column specification for each dataset mapping our columns to column types in the factory.\n\nfrom pprl.embedder import features\nfrom functools import partial\n\nfactory = dict(\n name=features.gen_name_features,\n sex=features.gen_sex_features,\n misc=features.gen_misc_features,\n dob=features.gen_dateofbirth_features,\n instrument=partial(features.gen_misc_shingled_features, label=\"instrument\")\n)\nspec1 = dict(\n first_name=\"name\",\n last_name=\"name\",\n gender=\"sex\",\n instrument=\"instrument\",\n date_of_birth=\"dob\",\n)\nspec2 = dict(name=\"name\", sex=\"sex\", main_instrument=\"instrument\", birth_date=\"dob\")\n\n\n\n\n\n\n\nTip\n\n\n\nThe feature generation functions, features.gen_XXX_features have sensible default parameters, but sometimes have to be passed in to the feature factory with different parameters, such as to set a feature label in the example above. There are two ways to achieve this. Either use functools.partial to set parameters (as above), or pass keyword arguments as a dictionary of dictionaries to the Embedder as ff_args.\n\n\n\n\nEmbedding the data\nWith our specifications sorted out, we can get to creating our Bloom filter embedding. Before doing so, we need to decide on two parameters: the size of the filter and the number of hashes. By default, these are 1024 and 2, respectively.\nOnce we’ve decided, we can create our Embedder instance and use it to embed our data with their column specifications.\n\nfrom pprl.embedder.embedder import Embedder\n\nembedder = Embedder(factory, bf_size=1024, num_hashes=2)\n\nedf1 = embedder.embed(df1, colspec=spec1, update_thresholds=True)\nedf2 = embedder.embed(df2, colspec=spec2, update_thresholds=True)\n\nIf we take a look at one of these embedded datasets, we can see that it has a whole bunch of new columns. There is a _features column for each of the original columns containing their pre-embedding string features, and there’s an all_features column that combines the features. 
Then there are three additional columns: bf_indices, bf_norms and thresholds.\n\nedf1.columns\n\nIndex(['first_name', 'last_name', 'gender', 'date_of_birth', 'instrument',\n 'first_name_features', 'last_name_features', 'gender_features',\n 'instrument_features', 'date_of_birth_features', 'all_features',\n 'bf_indices', 'bf_norms', 'thresholds'],\n dtype='object')\n\n\nThe bf_indices column contains the Bloom filters, represented compactly as a list of non-zero indices for each record.\n\nprint(edf1.bf_indices[0])\n\n[2, 262, 646, 903, 9, 526, 15, 272, 654, 146, 531, 532, 17, 282, 667, 413, 670, 544, 288, 931, 292, 808, 937, 172, 942, 559, 816, 691, 820, 567, 823, 440, 56, 60, 61, 318, 319, 320, 444, 577, 836, 583, 332, 77, 972, 590, 465, 593, 211, 468, 82, 851, 338, 600, 84, 218, 861, 613, 871, 744, 238, 367, 881, 758, 890, 379, 1021, 763]\n\n\nThe bf_norms column contains the norm of each Bloom filter with respect to the Soft Cosine Measure (SCM) matrix. In this case since we are using an untrained model, the SCM matrix is an identity matrix, and the norm is just the Euclidean norm of the Bloom filter represented as a binary vector, which is equal to np.sqrt(len(bf_indices[i])) for record i. The norm is used to scale the similarity measures so that they take values between -1 and 1.\nThe thresholds column is calculated to provide, for each record, a threshold similarity score below which it will not be matched. It’s like a reserve price in an auction – it stops a record being matched to another record when the similarity isn’t high enough. This is an innovative feature of our method; other linkage methods typically only have one global threshold score for the entire dataset.\n\nprint(edf1.loc[:,[\"bf_norms\",\"thresholds\"]])\nprint(edf2.loc[:,[\"bf_norms\",\"thresholds\"]])\n\n bf_norms thresholds\n0 8.246211 0.114332\n1 9.055386 0.143159\n2 8.485281 0.143159\n bf_norms thresholds\n0 9.695360 0.294345\n1 9.380832 0.157014\n2 10.862781 0.294345\n\n\n\n\n\nThe processed features\nLet’s take a look at how the features are processed into small text strings (shingles) before being hashed into the Bloom filter. The first record in the first dataset is the same person as the first record in the second dataset, although the data is not identical, so we can compare the processed features for these records to see how pprl puts them into a format where they can be compared.\nFirst, we’ll look at date of birth:\n\nprint(edf1.date_of_birth_features[0])\nprint(edf2.birth_date_features[0])\n\n['day<01>', 'month<03>', 'year<1977>']\n['day<23>', 'month<03>', 'year<1977>']\n\n\nPython can parse the different formats easily. Although the dates are slightly different in the dataset, the year and month will still match, even though the day will not.\nThen we’ll look at name:\n\nprint(edf1.first_name_features[0] + edf1.last_name_features[0])\nprint(edf2.name_features[0])\n\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_la', 'lau', 'aur', 'ura', 'ra_', '_d', 'da', 'at', 'te', 'en', 'n_', '_da', 'dat', 'ate', 'ten', 'en_']\n['_l', 'la', 'au', 'ur', 'ra', 'a_', '_d', 'da', 'at', 'tt', 'te', 'en', 'n_', '_la', 'lau', 'aur', 'ura', 'ra_', '_da', 'dat', 'att', 'tte', 'ten', 'en_']\n\n\nThe two datasets store the names differently, but this doesn’t matter for the Bloom filter method because it treats each record like a bag of features. By default, the name processor produces 2-grams and 3-grams.\nThe sex processing function just converts different formats to lowercase and takes the first letter. 
This will often be enough:\n\nprint(edf1.gender_features[0])\nprint(edf2.sex_features[0])\n\n['sex<f>']\n['sex<f>']\n\n\nFinally, we’ll see how our instrument feature function (partial(features.gen_misc_shingled_features, label=\"instrument\")) processed the data:\n\nprint(edf1.instrument_features[0])\nprint(edf2.main_instrument_features[0])\n\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>']\n['instrument<_b>', 'instrument<ba>', 'instrument<as>', 'instrument<ss>', 'instrument<s_>', 'instrument<_g>', 'instrument<gu>', 'instrument<ui>', 'instrument<it>', 'instrument<ta>', 'instrument<ar>', 'instrument<r_>', 'instrument<_ba>', 'instrument<bas>', 'instrument<ass>', 'instrument<ss_>', 'instrument<_gu>', 'instrument<gui>', 'instrument<uit>', 'instrument<ita>', 'instrument<tar>', 'instrument<ar_>']\n\n\nSetting the label argument was important to ensure that the shingles match (and are hashed to the same slots) because the default behaviour of the function is to use the column name as a label: since the two columns have different names, the default wouldn’t have allowed the features to match to each other.\n\n\nPerforming the linkage\nWe can now perform the linkage by comparing these Bloom filter embeddings. We use the Soft Cosine Measure (which in this untrained model, is equivalent to a normal cosine similarity metric) to calculate record-wise similarity and an adapted Hungarian algorithm to match the records based on those similarities.\n\nsimilarities = embedder.compare(edf1, edf2)\nsimilarities\n\nSimilarityArray([[0.80050047, 0.10341754, 0.10047246],\n [0.34170424, 0.16480856, 0.63029481],\n [0.12155416, 0.54020787, 0.11933984]])\n\n\nThis SimilarityArray object is an augmented numpy.ndarray that can perform our matching. The matching itself can optionally be called with an absolute threshold score, but it doesn’t need one.\n\nmatching = similarities.match()\nmatching\n\n(array([0, 1, 2]), array([0, 2, 1]))\n\n\nSo, all three of the records in each dataset were matched correctly. Excellent!", "crumbs": [ "About", "Docs", @@ -340,7 +340,7 @@ "href": "docs/tutorials/run-through.html", "title": "Embedder API run-through", "section": "", - "text": "This article shows the main classes, methods and functionality of the Embedder API.\nFirst, we’ll import a few modules, including:\nimport os\n\nimport pandas as pd\n\nfrom pprl import EmbeddedDataFrame, Embedder, config\nfrom pprl.embedder import features as feat", + "text": "This article shows the main classes, methods and functionality of the Embedder API.\nFirst, we’ll import a few modules, including:\nimport os\nimport numpy as np\nimport pandas as pd\n\nfrom pprl import EmbeddedDataFrame, Embedder, config\nfrom pprl.embedder import features as feat", "crumbs": [ "About", "Docs", @@ -353,7 +353,7 @@ "href": "docs/tutorials/run-through.html#data-set-up", "title": "Embedder API run-through", "section": "Data set-up", - "text": "Data set-up\nFor this demo we’ll create a really minimal pair of datasets. 
Notice that they don’t have to have the same structure or field names.\n\ndf1 = pd.DataFrame(\n dict(\n id=[1,2,3],\n forename=[\"Henry\", \"Sally\", \"Ina\"],\n surname = [\"Tull\", \"Brown\", \"Lawrey\"],\n dob=[\"1/1/2001\", \"2/1/2001\", \"4/10/1995\"],\n gender=[\"male\", \"Male\", \"Female\"],\n )\n)\n\ndf2 = pd.DataFrame(\n dict(\n personid=[4,5,6],\n full_name=[\"Harry Tull\", \"Sali Brown\", \"Ina Laurie\"],\n date_of_birth=[\"2/1/2001\", \"2/1/2001\", \"4/11/1995\"],\n sex=[\"M\", \"M\", \"F\"],\n )\n)\n\nFeatures are extracted as different kinds of string objects from each field, ready to be hash embedded into the Bloom filters. We need to specify the feature extraction functions we’ll need.\nIn this case we’ll need one extractor for names, one for dates of birth, and one for sex/gender records. We create a dict with the functions we need. We create another dict to store any keyword arguments we want to pass in to each function (in this case we use all the default arguments so the keyword argument dictionaries are empty):\n\nfeature_factory = dict(\n name=feat.gen_name_features,\n dob=feat.gen_dateofbirth_features,\n sex=feat.gen_sex_features,\n)\n\nff_args = dict(name={}, sex={}, dob={})", + "text": "Data set-up\nFor this demo we’ll create a really minimal pair of datasets. Notice that they don’t have to have the same structure or field names.\n\ndf1 = pd.DataFrame(\n dict(\n id=[1,2,3],\n forename=[\"Henry\", \"Sally\", \"Ina\"],\n surname = [\"Tull\", \"Brown\", \"Lawrey\"],\n dob=[\"\", \"2/1/2001\", \"4/10/1995\"],\n gender=[\"male\", \"Male\", \"Female\"],\n county=[\"\", np.NaN, \"County Durham\"]\n )\n)\n\ndf2 = pd.DataFrame(\n dict(\n personid=[4,5,6],\n full_name=[\"Harry Tull\", \"Sali Brown\", \"Ina Laurie\"],\n date_of_birth=[\"2/1/2001\", \"2/1/2001\", \"4/11/1995\"],\n sex=[\"M\", \"M\", \"F\"],\n county=[\"Rutland\", \"Powys\", \"Durham\"]\n )\n)\n\nFeatures are extracted as different kinds of string objects from each field, ready to be hash embedded into the Bloom filters. We need to specify the feature extraction functions we’ll need.\nIn this case we’ll need one extractor for names, one for dates of birth, and one for sex/gender records. We create a dict with the functions we need. We create another dict to store any keyword arguments we want to pass in to each function (in this case we use all the default arguments so the keyword argument dictionaries are empty):\n\nfeature_factory = dict(\n name=feat.gen_name_features,\n dob=feat.gen_dateofbirth_features,\n sex=feat.gen_sex_features,\n misc=feat.gen_misc_features\n)\n\nff_args = dict(name={}, sex={}, dob={})", "crumbs": [ "About", "Docs", @@ -366,7 +366,7 @@ "href": "docs/tutorials/run-through.html#embedding", "title": "Embedder API run-through", "section": "Embedding", - "text": "Embedding\nNow we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.\n\nembedder = Embedder(feature_factory,\n ff_args,\n bf_size = 2**10,\n num_hashes=2,\n )\n\nNow we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. 
Any columns not mapped will not contribute to the embedding.\n\nedf1 = embedder.embed(\n df1, colspec=dict(forename=\"name\", surname=\"name\", dob=\"dob\", gender=\"sex\")\n)\nedf2 = embedder.embed(\n df2, colspec=dict(full_name=\"name\", date_of_birth=\"dob\", sex=\"sex\")\n)\n\nprint(edf1)\nprint(edf2)\n\n id forename surname dob gender \\\n0 1 Henry Tull 1/1/2001 male \n1 2 Sally Brown 2/1/2001 Male \n2 3 Ina Lawrey 4/10/1995 Female \n\n forename_features \\\n0 [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_] \n1 [_s, sa, al, ll, ly, y_, _sa, sal, all, lly, ly_] \n2 [_i, in, na, a_, _in, ina, na_] \n\n surname_features \\\n0 [_t, tu, ul, ll, l_, _tu, tul, ull, ll_] \n1 [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_] \n2 [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr... \n\n dob_features gender_features \\\n0 [day<01>, month<01>, year<2001>] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<10>, year<1995>] [sex<f>] \n\n all_features \\\n0 [ll_, _tu, day<01>, ul, l_, sex<m>, ull, y_, _... \n1 [lly, day<02>, wn_, sex<m>, sal, wn, y_, ly_, ... \n2 [_in, _i, ey_, wr, y_, rey, wre, sex<f>, _l, _... \n\n bf_indices bf_norms \n0 [130, 644, 773, 903, 135, 776, 778, 265, 654, ... 6.708204 \n1 [129, 258, 130, 776, 523, 525, 398, 271, 671, ... 7.141428 \n2 [647, 394, 269, 13, 15, 532, 155, 28, 667, 413... 6.855655 \n personid full_name date_of_birth sex \\\n0 4 Harry Tull 2/1/2001 M \n1 5 Sali Brown 2/1/2001 M \n2 6 Ina Laurie 4/11/1995 F \n\n full_name_features \\\n0 [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _... \n1 [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _... \n2 [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _... \n\n date_of_birth_features sex_features \\\n0 [day<02>, month<01>, year<2001>] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<11>, year<1995>] [sex<f>] \n\n all_features \\\n0 [ll_, _tu, day<02>, ar, ul, l_, sex<m>, ull, y... \n1 [day<02>, wn_, sex<m>, wn, sal, ow, al, n_, al... \n2 [ri, _in, _i, aur, ie_, ur, sex<f>, _l, au, _l... \n\n bf_indices bf_norms \n0 [640, 130, 644, 135, 776, 10, 778, 271, 402, 5... 6.708204 \n1 [130, 523, 525, 398, 271, 152, 671, 803, 806, ... 6.855655 \n2 [646, 647, 394, 269, 15, 272, 531, 532, 665, 6... 6.782330", + "text": "Embedding\nNow we can create an Embedder object. We want our Bloom filter vectors to have a length of 1024 elements, and we choose to hash each feature two times. These choices seem to work ok, but we haven’t explored them systematically.\n\nembedder = Embedder(feature_factory,\n ff_args,\n bf_size = 2**10,\n num_hashes=2,\n )\n\nNow we can hash embed the dataset into an EmbeddedDataFrame (EDF). For this we need to pass a column specification colspec that maps each column of the data into the feature_factory functions. 
Any columns not mapped will not contribute to the embedding.\n\nedf1 = embedder.embed(\n df1, colspec=dict(forename=\"name\", surname=\"name\", dob=\"dob\", gender=\"sex\", county=\"misc\")\n)\nedf2 = embedder.embed(\n df2, colspec=dict(full_name=\"name\", date_of_birth=\"dob\", sex=\"sex\", county=\"misc\")\n)\n\nprint(edf1)\nprint(edf2)\n\n id forename surname dob gender county \\\n0 1 Henry Tull male \n1 2 Sally Brown 2/1/2001 Male NaN \n2 3 Ina Lawrey 4/10/1995 Female County Durham \n\n forename_features \\\n0 [_h, he, en, nr, ry, y_, _he, hen, enr, nry, ry_] \n1 [_s, sa, al, ll, ly, y_, _sa, sal, all, lly, ly_] \n2 [_i, in, na, a_, _in, ina, na_] \n\n surname_features \\\n0 [_t, tu, ul, ll, l_, _tu, tul, ull, ll_] \n1 [_b, br, ro, ow, wn, n_, _br, bro, row, own, wn_] \n2 [_l, la, aw, wr, re, ey, y_, _la, law, awr, wr... \n\n dob_features gender_features county_features \\\n0 [] [sex<m>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] \n2 [day<04>, month<10>, year<1995>] [sex<f>] [county<county durham>] \n\n all_features \\\n0 [ll, nr, ll_, _t, ull, _tu, _he, he, tu, hen, ... \n1 [all, ll, ro, n_, ow, sa, ly_, bro, month<01>,... \n2 [ina, ey, _in, re, wr, aw, law, la, na_, ey_, ... \n\n bf_indices bf_norms \n0 [644, 773, 135, 776, 265, 778, 271, 402, 404, ... 6.244998 \n1 [129, 258, 130, 776, 523, 525, 398, 271, 671, ... 7.141428 \n2 [647, 394, 269, 13, 15, 532, 667, 155, 413, 28... 7.000000 \n personid full_name date_of_birth sex county \\\n0 4 Harry Tull 2/1/2001 M Rutland \n1 5 Sali Brown 2/1/2001 M Powys \n2 6 Ina Laurie 4/11/1995 F Durham \n\n full_name_features \\\n0 [_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _... \n1 [_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _... \n2 [_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _... \n\n date_of_birth_features sex_features county_features \\\n0 [day<02>, month<01>, year<2001>] [sex<m>] [county<rutland>] \n1 [day<02>, month<01>, year<2001>] [sex<m>] [county<powys>] \n2 [day<04>, month<11>, year<1995>] [sex<f>] [county<durham>] \n\n all_features \\\n0 [ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count... \n1 [county<powys>, ro, li_, n_, ow, sa, bro, ali,... \n2 [ina, ie, aur, e_, _in, uri, la, na_, county<d... \n\n bf_indices bf_norms \n0 [640, 130, 644, 135, 776, 10, 778, 271, 402, 5... 6.855655 \n1 [130, 523, 525, 398, 271, 152, 671, 803, 806, ... 7.000000 \n2 [646, 647, 394, 269, 15, 272, 531, 532, 665, 6... 6.928203", "crumbs": [ "About", "Docs", @@ -392,7 +392,7 @@ "href": "docs/tutorials/run-through.html#computing-the-similarity-scores-and-the-matching", "title": "Embedder API run-through", "section": "Computing the similarity scores and the matching", - "text": "Computing the similarity scores and the matching\nNow we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.\nFirst, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). 
Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.\n\n\n\n\n\n\n\n\n\n\npersonid\nfull_name\ndate_of_birth\nsex\nfull_name_features\ndate_of_birth_features\nsex_features\nall_features\nbf_indices\nbf_norms\nthresholds\n\n\n\n\n0\n4\nHarry Tull\n2/1/2001\nM\n[_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[ll_, _tu, day<02>, ar, ul, l_, sex<m>, ull, y...\n[640, 130, 644, 135, 776, 10, 778, 271, 402, 5...\n6.708204\n0.195698\n\n\n1\n5\nSali Brown\n2/1/2001\nM\n[_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[day<02>, wn_, sex<m>, wn, sal, ow, al, n_, al...\n[130, 523, 525, 398, 271, 152, 671, 803, 806, ...\n6.855655\n0.195698\n\n\n2\n6\nIna Laurie\n4/11/1995\nF\n[_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...\n[day<04>, month<11>, year<1995>]\n[sex<f>]\n[ri, _in, _i, aur, ie_, ur, sex<f>, _l, au, _l...\n[646, 647, 394, 269, 15, 272, 531, 532, 665, 6...\n6.782330\n0.086026\n\n\n\n\n\n\n\n\nNB: there’s also a flag to compute these at the same time as the embedding, but it doesn’t by default because, depending on the workflow, you may wish to compute the norms and thresholds at different times (e.g. on the server).\nNow you can compute the similarities:\n\nsimilarities = embedder.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.6666667 0.17395416 0. ]\n [0.29223802 0.79658223 0.08258402]\n [0.08697708 0.10638298 0.58067873]]\n\n\nFinally, you can compute the matching:\n\nmatching = similarities.match(abs_cutoff=0.5)\n\nprint(matching)\n\n(array([0, 1, 2]), array([0, 1, 2]))", + "text": "Computing the similarity scores and the matching\nNow we have two embedded datasets, we can compare them and compute all the pairwise Cosine similarity scores.\nFirst, we have to compute the vector norms of each Bloom vector (for scaling the Cosine similarity) and the thresholds (thresholds are explained here [link]). Computing the thresholds can be time-consuming for a larger dataset, because it essentially computes all pairwise comparisons of the data to itself.\n\n\n\n\n\n\n\n\n\n\npersonid\nfull_name\ndate_of_birth\nsex\ncounty\nfull_name_features\ndate_of_birth_features\nsex_features\ncounty_features\nall_features\nbf_indices\nbf_norms\nthresholds\n\n\n\n\n0\n4\nHarry Tull\n2/1/2001\nM\nRutland\n[_h, ha, ar, rr, ry, y_, _t, tu, ul, ll, l_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<rutland>]\n[ll, ll_, rr, rry, ar, _ha, _t, ha, ull, count...\n[640, 130, 644, 135, 776, 10, 778, 271, 402, 5...\n6.855655\n0.187541\n\n\n1\n5\nSali Brown\n2/1/2001\nM\nPowys\n[_s, sa, al, li, i_, _b, br, ro, ow, wn, n_, _...\n[day<02>, month<01>, year<2001>]\n[sex<m>]\n[county<powys>]\n[county<powys>, ro, li_, n_, ow, sa, bro, ali,...\n[130, 523, 525, 398, 271, 152, 671, 803, 806, ...\n7.000000\n0.187541\n\n\n2\n6\nIna Laurie\n4/11/1995\nF\nDurham\n[_i, in, na, a_, _l, la, au, ur, ri, ie, e_, _...\n[day<04>, month<11>, year<1995>]\n[sex<f>]\n[county<durham>]\n[ina, ie, aur, e_, _in, uri, la, na_, county<d...\n[646, 647, 394, 269, 15, 272, 531, 532, 665, 6...\n6.928203\n0.082479\n\n\n\n\n\n\n\n\nNB: there’s also a flag to compute these at the same time as the embedding, but it doesn’t by default because, depending on the workflow, you may wish to compute the norms and thresholds at different times (e.g. 
on the server).\nNow you can compute the similarities:\n\nsimilarities = embedder.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.60728442 0.09150181 0. ]\n [0.2859526 0.78015612 0.08084521]\n [0.08335143 0.10204083 0.57735028]]\n\n\nFinally, you can compute the matching:\n\nmatching = similarities.match(abs_cutoff=0.5)\n\nprint(matching)\n\n(array([0, 1, 2]), array([0, 1, 2]))", "crumbs": [ "About", "Docs", @@ -405,7 +405,7 @@ "href": "docs/tutorials/run-through.html#serialisation-and-file-io", "title": "Embedder API run-through", "section": "Serialisation and file I/O", - "text": "Serialisation and file I/O\nThat’s how to do the workflow in one session. However, this demo follows a multi-stage workflow, so we need to be able to pass objects around. There are a couple of methods that enable file I/O and serialisation.\nFirst, the Embedder object itself needs to be written to file and loaded. The idea is to train it, share it to the data owning parties, and also to the matching server. For this purpose, it’s possible to pickle the entire Embedder object.\n\nembedder.to_pickle(\"embedder.pkl\")\n\nembedder_copy = Embedder.from_pickle(\"embedder.pkl\")\n\nThe copy has the same functionality as the original:\n\nsimilarities = embedder_copy.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.6666667 0.17395416 0. ]\n [0.29223802 0.79658223 0.08258402]\n [0.08697708 0.10638298 0.58067873]]\n\n\nNB: This won’t work if two datasets were embedded with different Embedder instances, even if they’re identical. The compare() method checks for the same embedder object memory reference so it won’t work if one was embedded with the original and the other with the copy. The way to fix this is to re-initialise the EmbeddedDataFrame with the new Embedder object.\n\nedf2_copy = EmbeddedDataFrame(edf2, embedder_copy)\n\nIn this case, be careful that the Embedder is compatible with the Bloom filter vectors in the EDF (i.e. uses the same parameters and feature factories), because while you can refresh the norms and thresholds, you can’t refresh the ‘bf_indices’ without reembedding the data frame.", + "text": "Serialisation and file I/O\nThat’s how to do the workflow in one session. However, this demo follows a multi-stage workflow, so we need to be able to pass objects around. There are a couple of methods that enable file I/O and serialisation.\nFirst, the Embedder object itself needs to be written to file and loaded. The idea is to train it, share it to the data owning parties, and also to the matching server. For this purpose, it’s possible to pickle the entire Embedder object.\n\nembedder.to_pickle(\"embedder.pkl\")\n\nembedder_copy = Embedder.from_pickle(\"embedder.pkl\")\n\nThe copy has the same functionality as the original:\n\nsimilarities = embedder_copy.compare(edf1,edf2)\n\nprint(similarities)\n\n[[0.60728442 0.09150181 0. ]\n [0.2859526 0.78015612 0.08084521]\n [0.08335143 0.10204083 0.57735028]]\n\n\nNB: This won’t work if two datasets were embedded with different Embedder instances, even if they’re identical. The compare() method checks for the same embedder object memory reference so it won’t work if one was embedded with the original and the other with the copy. The way to fix this is to re-initialise the EmbeddedDataFrame with the new Embedder object.\n\nedf2_copy = EmbeddedDataFrame(edf2, embedder_copy)\n\nIn this case, be careful that the Embedder is compatible with the Bloom filter vectors in the EDF (i.e. 
uses the same parameters and feature factories), because while you can refresh the norms and thresholds, you can’t refresh the ‘bf_indices’ without re-embedding the data frame.",
First, we compute the match with the row thresholds.\n\nmatching = similarity_scores.match(require_thresholds=True)\n\nUsing the true IDs, evaluate the precision and recall of the match.\n\ndef get_results(edf1, edf2, matching):\n \"\"\"Get the results for a given matching.\"\"\"\n\n trueids_matched1 = edf1.iloc[matching[0], edf1.columns.get_loc(\"true_id\")]\n trueids_matched2 = edf2.iloc[matching[1], edf2.columns.get_loc(\"true_id\")]\n\n nmatches = len(matching[0])\n truepos = sum(map(np.equal, trueids_matched1, trueids_matched2))\n falsepos = nmatches - truepos\n\n print(\n f\"True pos: {truepos} | False pos: {falsepos} | \"\n f\"Precision: {truepos / nmatches:.1%} | Recall: {truepos / 5000:.1%}\"\n )\n\n return nmatches, truepos, falsepos\n\n_ = get_results(edf1, edf2, matching)\n\nTrue pos: 4969 | False pos: 0 | Precision: 100.0% | Recall: 99.4%\n\n\nThen, we compute the match without using the row thresholds, calculating the same performance metrics:\n\nmatching = similarity_scores.match(require_thresholds=False)\n_ = get_results(edf1, edf2, matching)\n\nTrue pos: 5000 | False pos: 0 | Precision: 100.0% | Recall: 100.0%\n\n\nWithout using the row thresholds, the recall is better, although relaxing the thresholds can in general admit more false positives. For some uses this balance may be preferable.\nIn testing, the use of local row thresholds provides a better trade-off between precision and recall, compared to using a single absolute threshold. It has the additional advantage, in a privacy-preserving setting, of being automatic and not requiring clerical review to set the level.",
Default is the feature form of 2050-01-01.\n['day<01>', 'month<01>', 'year<2050>']\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of date features.\n\n\n\n\n\n\n\nembedder.features.gen_double_metaphone(string)\nGenerate the double methaphones of a string.\nThis function is a generator containing all the possible, non-empty double metaphones of a given string, separated by spaces. This function uses the metaphone.doublemetaphone() function under the hood, ignoring any empty strings. See their repository for details.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString from which to derive double metaphones.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next double metaphone in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_features(string, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate string features of various types.\nThis function is a generator capable of producing n-grams, skip 2-grams, and double metaphones from a single string. These outputs are referred to as features.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nBase string from which to generate features.\nrequired\n\n\nngram_length\nlist\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next feature in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_misc_features(field, label=None)\nGenerate miscellaneous categorical features for a series.\nUseful for keeping raw columns in the linkage data. All features use a label and take the form [\"label<option>\"] except for missing data, which are coded as \"\".\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries from which to generate our features.\nrequired\n\n\nlabel\nNone | str | typing.Hashable\nLabel for the series. By default, the name of the series is used if available. Otherwise, if not specified, misc is used.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of miscellaneous features.\n\n\n\n\n\n\n\nembedder.features.gen_misc_shingled_features(field, ngram_length=[2, 3], use_gen_skip_grams=False, label=None)\nGenerate shingled labelled features.\nGenerate n-grams, with a label to distinguish them from (and ensure they’re hashed separately from) names. Like gen_name_features(), this function makes a call to gen_features() via pd.Series.apply().\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries of string data.\nrequired\n\n\nngram_length\nlist\nShingle sizes to generate. By default [2, 3].\n[2, 3]\n\n\nuse_gen_skip_grams\nbool\nWhether to generate skip 2-grams. False by default.\nFalse\n\n\nlabel\nstr\nA label to differentiate from other shingled features. 
If field has no name, this defaults to zz.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of shingled string features.\n\n\n\n\n\n\n\nembedder.features.gen_name_features(names, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate a features series for a series of names.\nEffectively, this function is a call to pd.Series.apply() using our gen_features() string feature generator function.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnames\npandas.pandas.Series\nSeries of names.\nrequired\n\n\nngram_length\nlist[int]\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of features.\n\n\n\n\n\n\n\nembedder.features.gen_ngram(split_tokens, ngram_length)\nGenerate n-grams from a set of tokens.\nThis is a generator function that contains a series of n-grams the size of the sliding window.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form n-grams.\nrequired\n\n\nngram_length\nlist\nDesired lengths of n-grams. For examples, ngram_length=[2, 3] would generate all 2-grams and 3-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next n-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_sex_features(sexes)\nGenerate labelled sex features from a series of sexes.\nFeatures take the form [\"sex<option>\"] or [\"\"] for missing data.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsexes\npandas.pandas.Series\nSeries of sex data.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of sex features.\n\n\n\n\n\n\n\nembedder.features.gen_skip_grams(split_tokens)\nGenerate skip 2-grams from a set of tokens.\nThis function is a generator that contains a series of skip 2-grams.\n\n\n>>> string = \"dave james\"\n>>> tokens = split_string_underscore(string)\n>>> skips = list(gen_skip_grams(tokens))\n>>> print(skips)\n[\"_a\", \"dv\", \"ae\", \"v_\", \"_a\", \"jm\", \"ae\", \"ms\", \"e_\"]\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form skip 2-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next skip 2-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.split_string_underscore(string)\nSplit and underwrap a string at typical punctuation marks.\nCurrently, we split at any combination of spaces, dashes, dots, commas, or underscores.\n\n\n>>> strings = (\"dave william johnson\", \"Francesca__Hogan-O'Malley\")\n>>> for string in strings:\n... 
print(split_string_underscore(string))\n[\"_dave_\", \"_william_\", \"_johnson_\"]\n[\"_Francesca_\", \"_Hogan_\", \"_O'Malley_\"]\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString to split.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of the split and wrapped tokens.", + "text": "embedder.features\nFeature generation functions for various column types.\n\n\n\n\n\nName\nDescription\n\n\n\n\ngen_dateofbirth_features\nGenerate labelled date features from a series of dates of birth.\n\n\ngen_double_metaphone\nGenerate the double methaphones of a string.\n\n\ngen_features\nGenerate string features of various types.\n\n\ngen_misc_features\nGenerate miscellaneous categorical features for a series.\n\n\ngen_misc_shingled_features\nGenerate shingled labelled features.\n\n\ngen_name_features\nGenerate a features series for a series of names.\n\n\ngen_ngram\nGenerate n-grams from a set of tokens.\n\n\ngen_sex_features\nGenerate labelled sex features from a series of sexes.\n\n\ngen_skip_grams\nGenerate skip 2-grams from a set of tokens.\n\n\nsplit_string_underscore\nSplit and underwrap a string at typical punctuation marks.\n\n\n\n\n\nembedder.features.gen_dateofbirth_features(dob, dayfirst=True, yearfirst=False, default=[])\nGenerate labelled date features from a series of dates of birth.\nFeatures take the form [\"day<dd>\", \"month<mm>\", \"year<YYYY>\"]. Note that this feature generator can be used for any sort of date data, not just dates of birth.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndob\npandas.pandas.Series\nSeries of dates of birth.\nrequired\n\n\ndayfirst\nbool\nWhether the day comes first in the DOBs. Passed to pd.to_datetime() and defaults to True.\nTrue\n\n\nyearfirst\nbool\nWhether the year comes first in the DOBs. Passed to pd.to_datetime() and defaults to False.\nFalse\n\n\ndefault\nlist[str]\nDefault date to fill in missing data in feature (list) form. Default is the feature form of 2050-01-01.\n[]\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of date features.\n\n\n\n\n\n\n\nembedder.features.gen_double_metaphone(string)\nGenerate the double methaphones of a string.\nThis function is a generator containing all the possible, non-empty double metaphones of a given string, separated by spaces. This function uses the metaphone.doublemetaphone() function under the hood, ignoring any empty strings. See their repository for details.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString from which to derive double metaphones.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next double metaphone in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_features(string, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate string features of various types.\nThis function is a generator capable of producing n-grams, skip 2-grams, and double metaphones from a single string. These outputs are referred to as features.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nBase string from which to generate features.\nrequired\n\n\nngram_length\nlist\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. 
Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next feature in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_misc_features(field, label=None)\nGenerate miscellaneous categorical features for a series.\nUseful for keeping raw columns in the linkage data. All features use a label and take the form [\"label<option>\"] except for missing data, which are coded as \"\".\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries from which to generate our features.\nrequired\n\n\nlabel\nNone | str | typing.Hashable\nLabel for the series. By default, the name of the series is used if available. Otherwise, if not specified, misc is used.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of miscellaneous features.\n\n\n\n\n\n\n\nembedder.features.gen_misc_shingled_features(field, ngram_length=[2, 3], use_gen_skip_grams=False, label=None)\nGenerate shingled labelled features.\nGenerate n-grams, with a label to distinguish them from (and ensure they’re hashed separately from) names. Like gen_name_features(), this function makes a call to gen_features() via pd.Series.apply().\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries of string data.\nrequired\n\n\nngram_length\nlist\nShingle sizes to generate. By default [2, 3].\n[2, 3]\n\n\nuse_gen_skip_grams\nbool\nWhether to generate skip 2-grams. False by default.\nFalse\n\n\nlabel\nstr\nA label to differentiate from other shingled features. If field has no name, this defaults to zz.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of shingled string features.\n\n\n\n\n\n\n\nembedder.features.gen_name_features(names, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate a features series for a series of names.\nEffectively, this function is a call to pd.Series.apply() using our gen_features() string feature generator function.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnames\npandas.pandas.Series\nSeries of names.\nrequired\n\n\nngram_length\nlist[int]\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of features.\n\n\n\n\n\n\n\nembedder.features.gen_ngram(split_tokens, ngram_length)\nGenerate n-grams from a set of tokens.\nThis is a generator function that contains a series of n-grams the size of the sliding window.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form n-grams.\nrequired\n\n\nngram_length\nlist\nDesired lengths of n-grams. 
For examples, ngram_length=[2, 3] would generate all 2-grams and 3-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next n-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_sex_features(sexes)\nGenerate labelled sex features from a series of sexes.\nFeatures take the form [\"sex<option>\"] or [\"\"] for missing data.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsexes\npandas.pandas.Series\nSeries of sex data.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of sex features.\n\n\n\n\n\n\n\nembedder.features.gen_skip_grams(split_tokens)\nGenerate skip 2-grams from a set of tokens.\nThis function is a generator that contains a series of skip 2-grams.\n\n\n>>> string = \"dave james\"\n>>> tokens = split_string_underscore(string)\n>>> skips = list(gen_skip_grams(tokens))\n>>> print(skips)\n[\"_a\", \"dv\", \"ae\", \"v_\", \"_a\", \"jm\", \"ae\", \"ms\", \"e_\"]\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form skip 2-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next skip 2-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.split_string_underscore(string)\nSplit and underwrap a string at typical punctuation marks.\nCurrently, we split at any combination of spaces, dashes, dots, commas, or underscores.\n\n\n>>> strings = (\"dave william johnson\", \"Francesca__Hogan-O'Malley\")\n>>> for string in strings:\n... print(split_string_underscore(string))\n[\"_dave_\", \"_william_\", \"_johnson_\"]\n[\"_Francesca_\", \"_Hogan_\", \"_O'Malley_\"]\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString to split.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of the split and wrapped tokens.", "crumbs": [ "About", "Docs", @@ -599,7 +599,7 @@ "href": "docs/reference/features.html#functions", "title": "features", "section": "", - "text": "Name\nDescription\n\n\n\n\ngen_dateofbirth_features\nGenerate labelled date features from a series of dates of birth.\n\n\ngen_double_metaphone\nGenerate the double methaphones of a string.\n\n\ngen_features\nGenerate string features of various types.\n\n\ngen_misc_features\nGenerate miscellaneous categorical features for a series.\n\n\ngen_misc_shingled_features\nGenerate shingled labelled features.\n\n\ngen_name_features\nGenerate a features series for a series of names.\n\n\ngen_ngram\nGenerate n-grams from a set of tokens.\n\n\ngen_sex_features\nGenerate labelled sex features from a series of sexes.\n\n\ngen_skip_grams\nGenerate skip 2-grams from a set of tokens.\n\n\nsplit_string_underscore\nSplit and underwrap a string at typical punctuation marks.\n\n\n\n\n\nembedder.features.gen_dateofbirth_features(dob, dayfirst=True, yearfirst=False, default=['day<01>', 'month<01>', 'year<2050>'])\nGenerate labelled date features from a series of dates of birth.\nFeatures take the form [\"day<dd>\", \"month<mm>\", \"year<YYYY>\"]. Note that this feature generator can be used for any sort of date data, not just dates of birth.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndob\npandas.pandas.Series\nSeries of dates of birth.\nrequired\n\n\ndayfirst\nbool\nWhether the day comes first in the DOBs. Passed to pd.to_datetime() and defaults to True.\nTrue\n\n\nyearfirst\nbool\nWhether the year comes first in the DOBs. 
Passed to pd.to_datetime() and defaults to False.\nFalse\n\n\ndefault\nlist[str]\nDefault date to fill in missing data in feature (list) form. Default is the feature form of 2050-01-01.\n['day<01>', 'month<01>', 'year<2050>']\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of date features.\n\n\n\n\n\n\n\nembedder.features.gen_double_metaphone(string)\nGenerate the double methaphones of a string.\nThis function is a generator containing all the possible, non-empty double metaphones of a given string, separated by spaces. This function uses the metaphone.doublemetaphone() function under the hood, ignoring any empty strings. See their repository for details.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString from which to derive double metaphones.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next double metaphone in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_features(string, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate string features of various types.\nThis function is a generator capable of producing n-grams, skip 2-grams, and double metaphones from a single string. These outputs are referred to as features.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nBase string from which to generate features.\nrequired\n\n\nngram_length\nlist\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next feature in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_misc_features(field, label=None)\nGenerate miscellaneous categorical features for a series.\nUseful for keeping raw columns in the linkage data. All features use a label and take the form [\"label<option>\"] except for missing data, which are coded as \"\".\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries from which to generate our features.\nrequired\n\n\nlabel\nNone | str | typing.Hashable\nLabel for the series. By default, the name of the series is used if available. Otherwise, if not specified, misc is used.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of miscellaneous features.\n\n\n\n\n\n\n\nembedder.features.gen_misc_shingled_features(field, ngram_length=[2, 3], use_gen_skip_grams=False, label=None)\nGenerate shingled labelled features.\nGenerate n-grams, with a label to distinguish them from (and ensure they’re hashed separately from) names. Like gen_name_features(), this function makes a call to gen_features() via pd.Series.apply().\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries of string data.\nrequired\n\n\nngram_length\nlist\nShingle sizes to generate. By default [2, 3].\n[2, 3]\n\n\nuse_gen_skip_grams\nbool\nWhether to generate skip 2-grams. False by default.\nFalse\n\n\nlabel\nstr\nA label to differentiate from other shingled features. 
If field has no name, this defaults to zz.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of shingled string features.\n\n\n\n\n\n\n\nembedder.features.gen_name_features(names, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate a features series for a series of names.\nEffectively, this function is a call to pd.Series.apply() using our gen_features() string feature generator function.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnames\npandas.pandas.Series\nSeries of names.\nrequired\n\n\nngram_length\nlist[int]\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of features.\n\n\n\n\n\n\n\nembedder.features.gen_ngram(split_tokens, ngram_length)\nGenerate n-grams from a set of tokens.\nThis is a generator function that contains a series of n-grams the size of the sliding window.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form n-grams.\nrequired\n\n\nngram_length\nlist\nDesired lengths of n-grams. For examples, ngram_length=[2, 3] would generate all 2-grams and 3-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next n-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_sex_features(sexes)\nGenerate labelled sex features from a series of sexes.\nFeatures take the form [\"sex<option>\"] or [\"\"] for missing data.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsexes\npandas.pandas.Series\nSeries of sex data.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of sex features.\n\n\n\n\n\n\n\nembedder.features.gen_skip_grams(split_tokens)\nGenerate skip 2-grams from a set of tokens.\nThis function is a generator that contains a series of skip 2-grams.\n\n\n>>> string = \"dave james\"\n>>> tokens = split_string_underscore(string)\n>>> skips = list(gen_skip_grams(tokens))\n>>> print(skips)\n[\"_a\", \"dv\", \"ae\", \"v_\", \"_a\", \"jm\", \"ae\", \"ms\", \"e_\"]\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form skip 2-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next skip 2-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.split_string_underscore(string)\nSplit and underwrap a string at typical punctuation marks.\nCurrently, we split at any combination of spaces, dashes, dots, commas, or underscores.\n\n\n>>> strings = (\"dave william johnson\", \"Francesca__Hogan-O'Malley\")\n>>> for string in strings:\n... 
print(split_string_underscore(string))\n[\"_dave_\", \"_william_\", \"_johnson_\"]\n[\"_Francesca_\", \"_Hogan_\", \"_O'Malley_\"]\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString to split.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of the split and wrapped tokens.", + "text": "Name\nDescription\n\n\n\n\ngen_dateofbirth_features\nGenerate labelled date features from a series of dates of birth.\n\n\ngen_double_metaphone\nGenerate the double methaphones of a string.\n\n\ngen_features\nGenerate string features of various types.\n\n\ngen_misc_features\nGenerate miscellaneous categorical features for a series.\n\n\ngen_misc_shingled_features\nGenerate shingled labelled features.\n\n\ngen_name_features\nGenerate a features series for a series of names.\n\n\ngen_ngram\nGenerate n-grams from a set of tokens.\n\n\ngen_sex_features\nGenerate labelled sex features from a series of sexes.\n\n\ngen_skip_grams\nGenerate skip 2-grams from a set of tokens.\n\n\nsplit_string_underscore\nSplit and underwrap a string at typical punctuation marks.\n\n\n\n\n\nembedder.features.gen_dateofbirth_features(dob, dayfirst=True, yearfirst=False, default=[])\nGenerate labelled date features from a series of dates of birth.\nFeatures take the form [\"day<dd>\", \"month<mm>\", \"year<YYYY>\"]. Note that this feature generator can be used for any sort of date data, not just dates of birth.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndob\npandas.pandas.Series\nSeries of dates of birth.\nrequired\n\n\ndayfirst\nbool\nWhether the day comes first in the DOBs. Passed to pd.to_datetime() and defaults to True.\nTrue\n\n\nyearfirst\nbool\nWhether the year comes first in the DOBs. Passed to pd.to_datetime() and defaults to False.\nFalse\n\n\ndefault\nlist[str]\nDefault date to fill in missing data in feature (list) form. Default is the feature form of 2050-01-01.\n[]\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of date features.\n\n\n\n\n\n\n\nembedder.features.gen_double_metaphone(string)\nGenerate the double methaphones of a string.\nThis function is a generator containing all the possible, non-empty double metaphones of a given string, separated by spaces. This function uses the metaphone.doublemetaphone() function under the hood, ignoring any empty strings. See their repository for details.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString from which to derive double metaphones.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next double metaphone in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_features(string, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate string features of various types.\nThis function is a generator capable of producing n-grams, skip 2-grams, and double metaphones from a single string. These outputs are referred to as features.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nBase string from which to generate features.\nrequired\n\n\nngram_length\nlist\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. 
Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next feature in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_misc_features(field, label=None)\nGenerate miscellaneous categorical features for a series.\nUseful for keeping raw columns in the linkage data. All features use a label and take the form [\"label<option>\"] except for missing data, which are coded as \"\".\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries from which to generate our features.\nrequired\n\n\nlabel\nNone | str | typing.Hashable\nLabel for the series. By default, the name of the series is used if available. Otherwise, if not specified, misc is used.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of miscellaneous features.\n\n\n\n\n\n\n\nembedder.features.gen_misc_shingled_features(field, ngram_length=[2, 3], use_gen_skip_grams=False, label=None)\nGenerate shingled labelled features.\nGenerate n-grams, with a label to distinguish them from (and ensure they’re hashed separately from) names. Like gen_name_features(), this function makes a call to gen_features() via pd.Series.apply().\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield\npandas.pandas.Series\nSeries of string data.\nrequired\n\n\nngram_length\nlist\nShingle sizes to generate. By default [2, 3].\n[2, 3]\n\n\nuse_gen_skip_grams\nbool\nWhether to generate skip 2-grams. False by default.\nFalse\n\n\nlabel\nstr\nA label to differentiate from other shingled features. If field has no name, this defaults to zz.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of shingled string features.\n\n\n\n\n\n\n\nembedder.features.gen_name_features(names, ngram_length=[2, 3], use_gen_ngram=True, use_gen_skip_grams=False, use_double_metaphone=False)\nGenerate a features series for a series of names.\nEffectively, this function is a call to pd.Series.apply() using our gen_features() string feature generator function.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nnames\npandas.pandas.Series\nSeries of names.\nrequired\n\n\nngram_length\nlist[int]\nLengths of n-grams to make. Ignored if use_gen_ngram=False.\n[2, 3]\n\n\nuse_gen_ngram\nbool\nWhether to create n-grams. Default is True.\nTrue\n\n\nuse_gen_skip_grams\nbool\nWhether to create skip 2-grams. Default is False.\nFalse\n\n\nuse_double_metaphone\nbool\nWhether to create double metaphones. Default is False.\nFalse\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of features.\n\n\n\n\n\n\n\nembedder.features.gen_ngram(split_tokens, ngram_length)\nGenerate n-grams from a set of tokens.\nThis is a generator function that contains a series of n-grams the size of the sliding window.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form n-grams.\nrequired\n\n\nngram_length\nlist\nDesired lengths of n-grams. 
For examples, ngram_length=[2, 3] would generate all 2-grams and 3-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next n-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.gen_sex_features(sexes)\nGenerate labelled sex features from a series of sexes.\nFeatures take the form [\"sex<option>\"] or [\"\"] for missing data.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsexes\npandas.pandas.Series\nSeries of sex data.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\npandas.pandas.Series\nSeries containing lists of sex features.\n\n\n\n\n\n\n\nembedder.features.gen_skip_grams(split_tokens)\nGenerate skip 2-grams from a set of tokens.\nThis function is a generator that contains a series of skip 2-grams.\n\n\n>>> string = \"dave james\"\n>>> tokens = split_string_underscore(string)\n>>> skips = list(gen_skip_grams(tokens))\n>>> print(skips)\n[\"_a\", \"dv\", \"ae\", \"v_\", \"_a\", \"jm\", \"ae\", \"ms\", \"e_\"]\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nsplit_tokens\nlist\nAll the split-up tokens from which to form skip 2-grams.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nstr\nThe next skip 2-gram in the sequence.\n\n\n\n\n\n\n\nembedder.features.split_string_underscore(string)\nSplit and underwrap a string at typical punctuation marks.\nCurrently, we split at any combination of spaces, dashes, dots, commas, or underscores.\n\n\n>>> strings = (\"dave william johnson\", \"Francesca__Hogan-O'Malley\")\n>>> for string in strings:\n... print(split_string_underscore(string))\n[\"_dave_\", \"_william_\", \"_johnson_\"]\n[\"_Francesca_\", \"_Hogan_\", \"_O'Malley_\"]\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nstring\nstr\nString to split.\nrequired\n\n\n\n\n\n\n\n\n\nType\nDescription\n\n\n\n\nlist[str]\nList of the split and wrapped tokens.", "crumbs": [ "About", "Docs", diff --git a/sitemap.xml b/sitemap.xml index 812ac85..b322d28 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,66 +2,66 @@ https://datasciencecampus.github.io/pprl_toolkit/index.html - 2024-05-02T10:17:42.950Z + 2024-05-08T14:03:10.597Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/index.html - 2024-05-02T10:18:31.033Z + 2024-05-08T14:03:57.385Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/config.html - 2024-05-02T10:18:31.153Z + 2024-05-08T14:03:57.505Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/cloud.html - 2024-05-02T10:18:31.185Z + 2024-05-08T14:03:57.537Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/embedder.html - 2024-05-02T10:18:31.101Z + 2024-05-08T14:03:57.453Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/encryption.html - 2024-05-02T10:18:31.149Z + 2024-05-08T14:03:57.501Z https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-verknupfung.html - 2024-05-02T10:17:42.950Z + 2024-05-08T14:03:10.597Z https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/in-the-cloud.html - 2024-05-02T10:17:42.950Z + 2024-05-08T14:03:10.597Z https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/run-through.html - 2024-05-02T10:17:42.950Z + 2024-05-08T14:03:10.597Z https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/example-febrl.html - 2024-05-02T10:17:42.950Z + 2024-05-08T14:03:10.597Z https://datasciencecampus.github.io/pprl_toolkit/docs/tutorials/index.html - 2024-05-02T10:17:42.950Z + 2024-05-08T14:03:10.597Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/local.html - 2024-05-02T10:18:31.189Z 
+ 2024-05-08T14:03:57.541Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/bloom_filters.html - 2024-05-02T10:18:31.053Z + 2024-05-08T14:03:57.405Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/features.html - 2024-05-02T10:18:31.137Z + 2024-05-08T14:03:57.485Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/perform.html - 2024-05-02T10:18:31.201Z + 2024-05-08T14:03:57.553Z https://datasciencecampus.github.io/pprl_toolkit/docs/reference/utils.html - 2024-05-02T10:18:31.169Z + 2024-05-08T14:03:57.521Z