32 refactor bloom filters #44

Merged 3 commits on Apr 3, 2024
6 changes: 3 additions & 3 deletions README.md
@@ -133,9 +133,9 @@ uses the Soft Cosine Measure to calculate record-wise similarity scores.
```python
>>> similarities = embedder.compare(edf1, edf2)
>>> similarities
-SimilarityArray([[0.80074101, 0.18160957, 0.09722178],
-                 [0.40124732, 0.1877348 , 0.58792979],
-                 [0.13147656, 0.51426533, 0.11772856]])
+SimilarityArray([[0.81229552, 0.1115206 , 0.09557733],
+                 [0.35460909, 0.16368072, 0.60428527],
+                 [0.11720977, 0.50957391, 0.10343462]])

```
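For orientation, here is a quick sketch (not from the README) of how a matrix like this might be consumed downstream, assuming a `SimilarityArray` can be treated as an ordinary NumPy array; the values below are copied from the updated example above.

```python
import numpy as np

# Similarity scores copied from the updated README example above.
scores = np.array(
    [
        [0.81229552, 0.1115206, 0.09557733],
        [0.35460909, 0.16368072, 0.60428527],
        [0.11720977, 0.50957391, 0.10343462],
    ]
)

best_match = scores.argmax(axis=1)  # most similar edf2 record for each edf1 record
best_score = scores.max(axis=1)     # the corresponding similarity, useful for thresholding

for row, (col, score) in enumerate(zip(best_match, best_score)):
    print(f"edf1 record {row} -> edf2 record {col} (similarity {score:.3f})")
```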

5 changes: 2 additions & 3 deletions docs/tutorials/example-febrl.qmd
@@ -72,11 +72,10 @@ feature_factory = dict(

## Initialise the embedder instance

-This instance embeds each feature twice into a Bloom filter of length 1025
-(with the offset).
+This instance embeds each feature twice into a Bloom filter of length 1024.

```{python}
-embedder = Embedder(feature_factory, bf_size=2**10, num_hashes=2)
+embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
```

## Embed the datasets
2 changes: 1 addition & 1 deletion docs/tutorials/run-through.qmd
@@ -72,7 +72,7 @@ ff_args = dict(name={}, sex={}, dob={})
## Embedding

Now we can create an `Embedder` object. We want our Bloom filter vectors to
-have a length of 1024 elements (actually 1025 because of an offset), and we
+have a length of 1024 elements, and we
choose to hash each feature two times. These choices seem to work ok, but we
haven't explored them systematically.
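As a rough, illustrative sanity check on those choices (not part of the tutorial), the expected fraction of set bits in an `m`-bit filter after inserting `n` tokens with `k` hashes each is `1 - (1 - 1/m)**(k*n)`:

```python
# Back-of-the-envelope estimate (illustrative only): expected fraction of set
# bits in an m-bit Bloom filter after inserting n tokens with k hashes each.
def expected_fill(m: int, k: int, n: int) -> float:
    return 1 - (1 - 1 / m) ** (k * n)

# A feature that tokenises into 30 n-grams is a made-up example here.
print(round(expected_fill(m=1024, k=2, n=30), 3))  # ~0.057, so collisions stay rare
```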

6 changes: 0 additions & 6 deletions src/pprl/app/utils.py
@@ -138,11 +138,8 @@ def convert_dataframe_to_bf(
other_columns = []

output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"]
-NUMHASHES = 2
-OFFSET = 1
NGRAMS = [1, 2, 3, 4]
FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}}
-BFSIZE = 2**10

column_types_dict = {
"name": features.gen_name_features,
@@ -155,9 +152,6 @@
embedder = Embedder(
feature_factory=column_types_dict,
ff_args=FFARGS,
-bf_size=BFSIZE,
-num_hashes=NUMHASHES,
-offset=OFFSET,
salt=salt,
)

104 changes: 34 additions & 70 deletions src/pprl/embedder/bloom_filters.py
@@ -11,38 +11,39 @@ class BloomFilterEncoder:

1. Compute the hash digest for your tokens
2. Convert the digest bytes into integers
-3. Map the integer to a bloom filter vector (modulo `b`, where `b`
-   represents the length of the vector)
+3. Map the integer to a bloom filter vector (modulo the length of the vector)

Parameters
----------
size: int
-Size of the Bloom filter.
+Size of the Bloom filter. Defaults to 1024.
num_hashes: int
-Number of hashes to perform. Defaults to three.
+Number of hashes to perform. Defaults to two.
offset: int
Offset for Bloom filter indices to allow for masking. Defaults
-to one.
+to zero.
salt: str, optional
Cryptographic salt appended to tokens prior to hashing.

Attributes
----------
hash_function: func
-Hashing function (`hashlib.sha1`).
+Hashing function (`hashlib.sha256`).
"""

def __init__(
-self, size: int, num_hashes: int = 3, offset: int = 1, salt: str | None = None
+self, size: int = 1024, num_hashes: int = 2, offset: int = 0, salt: str | None = None
) -> None:
-self.size = size - 1
+self.size = size
self.num_hashes = num_hashes
self.offset = offset
self.salt = salt or ""

-self.hash_function = hashlib.sha1
+self.hash_function = hashlib.sha256

-def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, float]:
+def bloom_filter_vector_collision_fraction(
+    self, feature: list[str]
+) -> tuple[list[int], float]:
"""Convert a feature vector and return its collision fraction.

The index vector uses an optional offset for masking.
@@ -58,15 +59,28 @@ def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, f
Index values used to create the Bloom filter vector.
collision_fraction: float
Proportion of repeated indices.

+Examples
+--------
+>>> bfe = BloomFilterEncoder()
+>>> bfe.bloom_filter_vector_collision_fraction(["a","b","c"])
+([334, 1013, 192, 381, 18, 720], 0.0)
"""
-feature_int_repr = self.feature_to_big_int_repr(feature)
-vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset)
+vec_idx: list = []
+
+for gram in feature:
+    for i in range(self.num_hashes):
+        utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
+        digest = self.hash_function(utf_string_with_salt).digest()
+        digest_as_int = (int.from_bytes(digest, "little") % self.size) + self.offset
+        vec_idx.append(digest_as_int)

vec_idx_deduped = [*set(vec_idx)]
collision_fraction = 1 - len(vec_idx_deduped) / len(vec_idx)

return vec_idx_deduped, collision_fraction

-def bloom_filter_vector(self, feature: list) -> list[int]:
+def bloom_filter_vector(self, feature: list[str]) -> list[int]:
"""Convert a feature vector into indices for a Bloom vector.

The index vector uses an optional offset for masking.
@@ -80,63 +94,13 @@ def bloom_filter_vector(self, feature: list) -> list[int]:
-------
vector_idxs: list
Index values used to create the Bloom filter vector.
"""
feature_int_repr = self.feature_to_big_int_repr(feature)
vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset)
vec_idx_deduped = [*set(vec_idx)]

return vec_idx_deduped

def big_int_to_vec(self, feature_ints: list, offset: int = 1) -> list[int]:
"""Convert an integer vector into indices for a Bloom vector.

This conversion inserts 1 at the location derived from the
integer vector, which is an integer representation of a
deterministic hash value, modulo to the size of the Bloom
filter.

Parameters
----------
feature_ints: list
List of integer values representing the feature.
offset: int
An offset to indices to allow for masking. Defaults to one.

Returns
-------
vector_idxs: list
List of integers representing an index on the Bloom filter.
"""
return list(map(lambda x: x % self.size + offset, feature_ints))

def feature_to_big_int_repr(self, feature: list) -> list[int]:
"""Convert a feature vector into an integer vector.

This conversion first generates a hash digest for each member of
the feature vector and then converts them to an integer.

Parameters
----------
feature: list
List of features to be processed.

Returns
-------
feature_ints: list
List of features as integers.
Examples
--------
>>> bfe = BloomFilterEncoder()
>>> bfe.bloom_filter_vector(["a","b","c"])
[334, 1013, 192, 381, 18, 720]
"""
feature_int_repr: list = []
# hash function will create a 256-bit integer
# under the random oracle model this integer will be deterministic
# depending on the token passed to
# the hash function
vec_idx_deduped, _ = self.bloom_filter_vector_collision_fraction(feature)

for gram in feature:
for i in range(self.num_hashes):
utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
digest = self.hash_function(utf_string_with_salt).digest()
# integer value uses little endianness for amd64 architecture
int_repr = int.from_bytes(digest, "little")
feature_int_repr.append(int_repr)

return feature_int_repr
return vec_idx_deduped
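For readers skimming the diff, here is a self-contained sketch of the hashing scheme this refactor settles on. It simply mirrors the loop added to `bloom_filter_vector_collision_fraction` above using only the standard library; it is not an import from the package.

```python
import hashlib

def encode(feature: list[str], size: int = 1024, num_hashes: int = 2,
           offset: int = 0, salt: str = "") -> list[int]:
    """Sketch of the refactored encoding: sha256 digest -> integer -> index."""
    vec_idx = []
    for gram in feature:
        for i in range(num_hashes):
            token = (str(gram) + str(i) + salt).encode("UTF-8")
            digest = hashlib.sha256(token).digest()
            vec_idx.append((int.from_bytes(digest, "little") % size) + offset)
    return [*set(vec_idx)]  # deduplicated, as the encoder returns

print(encode(["a", "b", "c"]))  # should reproduce the docstring example above
```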
8 changes: 4 additions & 4 deletions src/pprl/embedder/embedder.py
@@ -279,11 +279,11 @@ class Embedder:
Mapping from dataset columns to keyword arguments for their
respective feature generation functions.
bf_size: int
-Size of the Bloom filter. Default is `2**10`.
+Size of the Bloom filter. Default is 1024.
num_hashes: int
Number of hashes to perform. Default is two.
offset: int
-Offset for Bloom filter to enable masking. Default is one.
+Offset for Bloom filter to enable masking. Default is zero.
salt: str, optional
Cryptographic salt added to tokens from the data before hashing.

@@ -324,9 +324,9 @@ def __init__(
self,
feature_factory: dict,
ff_args: dict[str, dict] | None = None,
-bf_size: int = 2**10,
+bf_size: int = 1024,
num_hashes: int = 2,
-offset: int = 1,
+offset: int = 0,
salt: str | None = None,
) -> None:
# Get embedding from model
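Because the new defaults match the values the tutorials pass explicitly, a bare constructor call should now behave the same as the spelled-out one. A small sketch of that equivalence, assuming the import path follows this repository's layout and using a hypothetical stand-in feature factory:

```python
# Import path inferred from src/pprl/embedder/embedder.py in this PR.
from pprl.embedder.embedder import Embedder

# Hypothetical stand-in factory; the package's own feature functions
# (e.g. features.gen_name_features in app/utils.py) would be used in practice.
feature_factory = {"name": str.split}

# With bf_size=1024, num_hashes=2 and offset=0 now the defaults, these two
# embedders should be configured identically, an assumption based on the
# updated signature above rather than something exercised by this PR.
explicit = Embedder(feature_factory, bf_size=1024, num_hashes=2, offset=0)
implicit = Embedder(feature_factory)
```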
2 changes: 1 addition & 1 deletion test/embedder/test_bloom_filters.py
@@ -23,7 +23,7 @@ def test_bloom_filter_vector_collision_fraction(feature, size, num_hashes, offse
vec_idx_deduped, collision_fraction = bfencoder.bloom_filter_vector_collision_fraction(feature)

assert all(isinstance(element, int) for element in vec_idx_deduped)
-assert all(element <= (size + offset - 2) for element in vec_idx_deduped)
+assert all(element <= (size + offset - 1) for element in vec_idx_deduped)
assert all(element >= offset for element in vec_idx_deduped)

assert collision_fraction <= 1
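The updated bound follows from the new index arithmetic: each index is `(digest_int % size) + offset`, so the largest attainable value is `size + offset - 1` and the smallest is `offset`. A quick standalone check along the same lines, assuming the import path follows this repository's layout:

```python
# Quick check of the index bounds with the new defaults (size=1024, offset=0).
# Import path inferred from src/pprl/embedder/bloom_filters.py in this PR.
from pprl.embedder.bloom_filters import BloomFilterEncoder

bfe = BloomFilterEncoder(size=1024, num_hashes=2, offset=0)
indices = bfe.bloom_filter_vector(["ann", "smith", "1 high street"])

# offset <= idx <= size + offset - 1, i.e. 0 <= idx <= 1023 here
assert all(0 <= idx <= 1023 for idx in indices)
```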