32 refactor bloom filters #44

Merged 3 commits on Apr 3, 2024
6 changes: 3 additions & 3 deletions README.md
@@ -133,9 +133,9 @@ uses the Soft Cosine Measure to calculate record-wise similarity scores.
```python
>>> similarities = embedder.compare(edf1, edf2)
>>> similarities
-SimilarityArray([[0.80074101, 0.18160957, 0.09722178],
-                 [0.40124732, 0.1877348 , 0.58792979],
-                 [0.13147656, 0.51426533, 0.11772856]])
+SimilarityArray([[0.81229552, 0.1115206 , 0.09557733],
+                 [0.35460909, 0.16368072, 0.60428527],
+                 [0.11720977, 0.50957391, 0.10343462]])

```
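For orientation, here is a quick sketch (not from the README) of how a matrix like this might be consumed downstream, assuming a `SimilarityArray` can be treated as an ordinary NumPy array; the values below are copied from the updated example above.

```python
import numpy as np

# Similarity scores copied from the updated README example above.
scores = np.array(
    [
        [0.81229552, 0.1115206, 0.09557733],
        [0.35460909, 0.16368072, 0.60428527],
        [0.11720977, 0.50957391, 0.10343462],
    ]
)

best_match = scores.argmax(axis=1)  # most similar edf2 record for each edf1 record
best_score = scores.max(axis=1)     # the corresponding similarity, useful for thresholding

for row, (col, score) in enumerate(zip(best_match, best_score)):
    print(f"edf1 record {row} -> edf2 record {col} (similarity {score:.3f})")
```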

5 changes: 2 additions & 3 deletions docs/tutorials/example-febrl.qmd
@@ -72,11 +72,10 @@ feature_factory = dict(

## Initialise the embedder instance

-This instance embeds each feature twice into a Bloom filter of length 1025
-(with the offset).
+This instance embeds each feature twice into a Bloom filter of length 1024.

```{python}
-embedder = Embedder(feature_factory, bf_size=2**10, num_hashes=2)
+embedder = Embedder(feature_factory, bf_size=1024, num_hashes=2)
```

## Embed the datasets
2 changes: 1 addition & 1 deletion docs/tutorials/run-through.qmd
@@ -72,7 +72,7 @@ ff_args = dict(name={}, sex={}, dob={})
## Embedding

Now we can create an `Embedder` object. We want our Bloom filter vectors to
-have a length of 1024 elements (actually 1025 because of an offset), and we
+have a length of 1024 elements, and we
choose to hash each feature two times. These choices seem to work ok, but we
haven't explored them systematically.
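As a rough, illustrative sanity check on those choices (not part of the tutorial), the expected fraction of set bits in an `m`-bit filter after inserting `n` tokens with `k` hashes each is `1 - (1 - 1/m)**(k*n)`:

```python
# Back-of-the-envelope estimate (illustrative only): expected fraction of set
# bits in an m-bit Bloom filter after inserting n tokens with k hashes each.
def expected_fill(m: int, k: int, n: int) -> float:
    return 1 - (1 - 1 / m) ** (k * n)

# A feature that tokenises into 30 n-grams is a made-up example here.
print(round(expected_fill(m=1024, k=2, n=30), 3))  # ~0.057, so collisions stay rare
```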

6 changes: 0 additions & 6 deletions src/pprl/app/utils.py
@@ -138,11 +138,8 @@ def convert_dataframe_to_bf(
other_columns = []

output_columns = other_columns + ["bf_indices", "bf_norms", "thresholds"]
-NUMHASHES = 2
-OFFSET = 1
NGRAMS = [1, 2, 3, 4]
FFARGS = {"name": {"ngram_length": NGRAMS, "use_gen_skip_grams": True}}
-BFSIZE = 2**10

column_types_dict = {
"name": features.gen_name_features,
@@ -155,9 +152,6 @@
embedder = Embedder(
feature_factory=column_types_dict,
ff_args=FFARGS,
-bf_size=BFSIZE,
-num_hashes=NUMHASHES,
-offset=OFFSET,
salt=salt,
)

104 changes: 34 additions & 70 deletions src/pprl/embedder/bloom_filters.py
@@ -11,38 +11,39 @@ class BloomFilterEncoder:

1. Compute the hash digest for your tokens
2. Convert the digest bytes into integers
-3. Map the integer to a bloom filter vector (modulo `b`, where `b`
-   represents the length of the vector)
+3. Map the integer to a bloom filter vector (modulo the length of the vector)

Parameters
----------
size: int
-Size of the Bloom filter.
+Size of the Bloom filter. Defaults to 1024.
num_hashes: int
-Number of hashes to perform. Defaults to three.
+Number of hashes to perform. Defaults to two.
offset: int
Offset for Bloom filter indices to allow for masking. Defaults
-to one.
+to zero.
salt: str, optional
Cryptographic salt appended to tokens prior to hashing.

Attributes
----------
hash_function: func
-Hashing function (`hashlib.sha1`).
+Hashing function (`hashlib.sha256`).
"""

def __init__(
-self, size: int, num_hashes: int = 3, offset: int = 1, salt: str | None = None
+self, size: int = 1024, num_hashes: int = 2, offset: int = 0, salt: str | None = None
) -> None:
-self.size = size - 1
+self.size = size
self.num_hashes = num_hashes
self.offset = offset
self.salt = salt or ""

-self.hash_function = hashlib.sha1
+self.hash_function = hashlib.sha256

-def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, float]:
+def bloom_filter_vector_collision_fraction(
+    self, feature: list[str]
+) -> tuple[list[int], float]:
"""Convert a feature vector and return its collision fraction.

The index vector uses an optional offset for masking.
@@ -58,15 +59,28 @@ def bloom_filter_vector_collision_fraction(self, feature: list) -> tuple[list, f
Index values used to create the Bloom filter vector.
collision_fraction: float
Proportion of repeated indices.

+Examples
+--------
+>>> bfe = BloomFilterEncoder()
+>>> bfe.bloom_filter_vector_collision_fraction(["a","b","c"])
+([334, 1013, 192, 381, 18, 720], 0.0)
"""
-feature_int_repr = self.feature_to_big_int_repr(feature)
-vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset)
+vec_idx: list = []
+
+for gram in feature:
+    for i in range(self.num_hashes):
+        utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
+        digest = self.hash_function(utf_string_with_salt).digest()
+        digest_as_int = (int.from_bytes(digest, "little") % self.size) + self.offset
+        vec_idx.append(digest_as_int)

vec_idx_deduped = [*set(vec_idx)]
collision_fraction = 1 - len(vec_idx_deduped) / len(vec_idx)

return vec_idx_deduped, collision_fraction

-def bloom_filter_vector(self, feature: list) -> list[int]:
+def bloom_filter_vector(self, feature: list[str]) -> list[int]:
"""Convert a feature vector into indices for a Bloom vector.

The index vector uses an optional offset for masking.
@@ -80,63 +94,13 @@ def bloom_filter_vector(self, feature: list) -> list[int]:
-------
vector_idxs: list
Index values used to create the Bloom filter vector.
"""
feature_int_repr = self.feature_to_big_int_repr(feature)
vec_idx = self.big_int_to_vec(feature_int_repr, offset=self.offset)
vec_idx_deduped = [*set(vec_idx)]

return vec_idx_deduped

def big_int_to_vec(self, feature_ints: list, offset: int = 1) -> list[int]:
"""Convert an integer vector into indices for a Bloom vector.

This conversion inserts 1 at the location derived from the
integer vector, which is an integer representation of a
deterministic hash value, modulo to the size of the Bloom
filter.

Parameters
----------
feature_ints: list
List of integer values representing the feature.
offset: int
An offset to indices to allow for masking. Defaults to one.

Returns
-------
vector_idxs: list
List of integers representing an index on the Bloom filter.
"""
return list(map(lambda x: x % self.size + offset, feature_ints))

def feature_to_big_int_repr(self, feature: list) -> list[int]:
"""Convert a feature vector into an integer vector.

This conversion first generates a hash digest for each member of
the feature vector and then converts them to an integer.

Parameters
----------
feature: list
List of features to be processed.

Returns
-------
feature_ints: list
List of features as integers.
Examples
--------
>>> bfe = BloomFilterEncoder()
>>> bfe.bloom_filter_vector(["a","b","c"])
[334, 1013, 192, 381, 18, 720]
"""
feature_int_repr: list = []
# hash function will create a 256-bit integer
# under the random oracle model this integer will be deterministic
# depending on the token passed to
# the hash function
vec_idx_deduped, _ = self.bloom_filter_vector_collision_fraction(feature)

for gram in feature:
for i in range(self.num_hashes):
utf_string_with_salt = (str(gram) + str(i) + str(self.salt)).encode("UTF-8")
digest = self.hash_function(utf_string_with_salt).digest()
# integer value uses little endianness for amd64 architecture
int_repr = int.from_bytes(digest, "little")
feature_int_repr.append(int_repr)

return feature_int_repr
return vec_idx_deduped
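For readers skimming the diff, here is a self-contained sketch of the hashing scheme this refactor settles on. It simply mirrors the loop added to `bloom_filter_vector_collision_fraction` above using only the standard library; it is not an import from the package.

```python
import hashlib

def encode(feature: list[str], size: int = 1024, num_hashes: int = 2,
           offset: int = 0, salt: str = "") -> list[int]:
    """Sketch of the refactored encoding: sha256 digest -> integer -> index."""
    vec_idx = []
    for gram in feature:
        for i in range(num_hashes):
            token = (str(gram) + str(i) + salt).encode("UTF-8")
            digest = hashlib.sha256(token).digest()
            vec_idx.append((int.from_bytes(digest, "little") % size) + offset)
    return [*set(vec_idx)]  # deduplicated, as the encoder returns

print(encode(["a", "b", "c"]))  # should reproduce the docstring example above
```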
8 changes: 4 additions & 4 deletions src/pprl/embedder/embedder.py
@@ -279,11 +279,11 @@ class Embedder:
Mapping from dataset columns to keyword arguments for their
respective feature generation functions.
bf_size: int
-Size of the Bloom filter. Default is `2**10`.
+Size of the Bloom filter. Default is 1024.
num_hashes: int
Number of hashes to perform. Default is two.
offset: int
-Offset for Bloom filter to enable masking. Default is one.
+Offset for Bloom filter to enable masking. Default is zero.
salt: str, optional
Cryptographic salt added to tokens from the data before hashing.

@@ -324,9 +324,9 @@ def __init__(
self,
feature_factory: dict,
ff_args: dict[str, dict] | None = None,
-bf_size: int = 2**10,
+bf_size: int = 1024,
num_hashes: int = 2,
-offset: int = 1,
+offset: int = 0,
salt: str | None = None,
) -> None:
# Get embedding from model
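Because the new defaults match the values the tutorials pass explicitly, a bare constructor call should now behave the same as the spelled-out one. A small sketch of that equivalence, assuming the import path follows this repository's layout and using a hypothetical stand-in feature factory:

```python
# Import path inferred from src/pprl/embedder/embedder.py in this PR.
from pprl.embedder.embedder import Embedder

# Hypothetical stand-in factory; the package's own feature functions
# (e.g. features.gen_name_features in app/utils.py) would be used in practice.
feature_factory = {"name": str.split}

# With bf_size=1024, num_hashes=2 and offset=0 now the defaults, these two
# embedders should be configured identically, an assumption based on the
# updated signature above rather than something exercised by this PR.
explicit = Embedder(feature_factory, bf_size=1024, num_hashes=2, offset=0)
implicit = Embedder(feature_factory)
```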
2 changes: 1 addition & 1 deletion test/embedder/test_bloom_filters.py
@@ -23,7 +23,7 @@ def test_bloom_filter_vector_collision_fraction(feature, size, num_hashes, offse
vec_idx_deduped, collision_fraction = bfencoder.bloom_filter_vector_collision_fraction(feature)

assert all(isinstance(element, int) for element in vec_idx_deduped)
-assert all(element <= (size + offset - 2) for element in vec_idx_deduped)
+assert all(element <= (size + offset - 1) for element in vec_idx_deduped)
assert all(element >= offset for element in vec_idx_deduped)

assert collision_fraction <= 1
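The updated bound follows from the new index arithmetic: each index is `(digest_int % size) + offset`, so the largest attainable value is `size + offset - 1` and the smallest is `offset`. A quick standalone check along the same lines, assuming the import path follows this repository's layout:

```python
# Quick check of the index bounds with the new defaults (size=1024, offset=0).
# Import path inferred from src/pprl/embedder/bloom_filters.py in this PR.
from pprl.embedder.bloom_filters import BloomFilterEncoder

bfe = BloomFilterEncoder(size=1024, num_hashes=2, offset=0)
indices = bfe.bloom_filter_vector(["ann", "smith", "1 high street"])

# offset <= idx <= size + offset - 1, i.e. 0 <= idx <= 1023 here
assert all(0 <= idx <= 1023 for idx in indices)
```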