Skip to content

Commit

Permalink
Respect the Rust tradition, and make breaking changes to the API
Browse files Browse the repository at this point in the history
But there's nothing wrong with version 2 which will keep being
maintained.

Serialization and deserialization are now less confusing, and
require fewer copies.
  • Loading branch information
jedisct1 committed Dec 2, 2024
1 parent 5a834b4 commit eccab33
Show file tree
Hide file tree
Showing 5 changed files with 382 additions and 128 deletions.
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "bloomfilter"
version = "2.0.0"
version = "3.0.0"
authors = ["Frank Denis <[email protected]>"]
description = "Bloom filter implementation"
license = "ISC"
Expand All @@ -17,10 +17,9 @@ getrandom = { version = "0.2", optional = true, features = ["js"] }
getrandom = { version = "0.2", optional = true }

[dependencies]
bit-vec = "0.8.0"
siphasher = "1.0.1"

[features]
default = ["random"]
random = ["getrandom"]
serde = ["siphasher/serde_std", "bit-vec/serde"]
serde = ["siphasher/serde_std"]
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,26 +1,25 @@
# bloomfilter <img src="img/logo.png" align="right" width="150" />

[![Crates.io](https://img.shields.io/crates/v/bloomfilter.svg)](https://crates.io/crates/bloomfilter)
[![docs.rs](https://docs.rs/bloomfilter/badge.svg)](https://docs.rs/bloomfilter)
[![License: ISC](https://img.shields.io/badge/License-ISC-blue.svg)](https://github.com/jedisct1/rust-bloom-filter/blob/master/LICENSE)
<a href="https://codecov.io/gh/jedisct1/rust-bloom-filter">
<img src="https://codecov.io/gh/jedisct1/rust-bloom-filter/branch/main/graph/badge.svg">
</a>


A simple but fast implementation of the Bloom filter in Rust. The Bloom filter is a space-efficient probabilistic data structure supporting dynamic set membership queries with false positives. It was introduced by Burton H. Bloom in 1970 [(Bloom, 1970)](https://dl.acm.org/doi/10.1145/362686.362692) and has since been increasingly used in computing applications and bioinformatics.

### Documentation

Library documentation with examples is available on [docs.rs](https://docs.rs/bloomfilter).

Library documentation is available on [docs.rs](https://docs.rs/bloomfilter).

### Usage

Add this to your `Cargo.toml`:

```toml
[dependencies]
bloomfilter = "2"
bloomfilter = "3"
```

Here is a simple example that creates a Bloom filter with a false positive rate of 0.001 and queries it for the presence of some numbers.
Expand All @@ -31,7 +30,7 @@ use bloomfilter::Bloom;
let num_items = 100000;
let fp_rate = 0.001;

let mut bloom = Bloom::new_for_fp_rate(num_items, fp_rate);
let mut bloom = Bloom::new_for_fp_rate(num_items, fp_rate).unwrap();
bloom.set(&10); // insert 10 in the bloom filter
bloom.check(&10); // return true
bloom.check(&20); // return false
Expand Down
171 changes: 171 additions & 0 deletions src/bitmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
use std::convert::{TryFrom, TryInto};
use std::fmt::Debug;

/// Format version written to, and required in, the first header byte.
pub const VERSION: u8 = 1;
/// Total header size in bytes:
/// version (1) + bit-array length in bytes (8) + `k_num` (4) + seed (32).
pub const BITMAP_HEADER_SIZE: usize = 1 + 8 + 4 + 32;

// Byte offsets of the header fields. Each one is derived from the previous
// field's offset and width so the layout is described in exactly one place.
const VERSION_OFFSET: usize = 0;
const LEN_BYTES_OFFSET: usize = VERSION_OFFSET + 1;
const K_NUM_OFFSET: usize = LEN_BYTES_OFFSET + 8;
const SEED_OFFSET: usize = K_NUM_OFFSET + 4;

/// A bit array stored together with its serialization header in one buffer.
///
/// Layout of `header_and_bits`: `BITMAP_HEADER_SIZE` header bytes followed by
/// the bit array itself. Integers in the header are little-endian.
#[derive(Clone, Debug)]
pub(crate) struct BitMap {
    header_and_bits: Vec<u8>,
}

impl BitMap {
    /// Creates a zeroed bitmap holding `len_bytes` bytes of bits.
    ///
    /// The header gets the current `VERSION`, the bit-array length and a
    /// `k_num` of 0. Note that `from_bytes`/`from_slice` reject a `k_num` of 0,
    /// so the caller has to set it before the buffer can be loaded back.
    ///
    /// # Panics
    ///
    /// Panics if `BITMAP_HEADER_SIZE + len_bytes` overflows `usize`.
    pub fn new(len_bytes: usize) -> Self {
        let total_len = BITMAP_HEADER_SIZE
            .checked_add(len_bytes)
            .expect("bitmap size overflows usize");
        let mut header_and_bits = vec![0u8; total_len];
        let header = &mut header_and_bits[..BITMAP_HEADER_SIZE];
        Self::set_version(header, VERSION);
        Self::set_len_bytes(header, len_bytes as u64);
        Self::set_k_num(header, 0);
        Self { header_and_bits }
    }

    /// The bit array, without the header.
    fn bits(&self) -> &[u8] {
        &self.header_and_bits[BITMAP_HEADER_SIZE..]
    }

    /// Mutable view of the bit array, without the header.
    fn bits_mut(&mut self) -> &mut [u8] {
        &mut self.header_and_bits[BITMAP_HEADER_SIZE..]
    }

    /// The header bytes (the first `BITMAP_HEADER_SIZE` bytes of the buffer).
    pub fn header(&self) -> &[u8] {
        &self.header_and_bits[..BITMAP_HEADER_SIZE]
    }

    /// Mutable view of the header bytes.
    pub fn header_mut(&mut self) -> &mut [u8] {
        &mut self.header_and_bits[..BITMAP_HEADER_SIZE]
    }

    /// Reads the version byte from `header`. Panics if `header` is empty.
    fn get_version(header: &[u8]) -> u8 {
        header[VERSION_OFFSET]
    }

    /// Writes the version byte into `header`. Panics if `header` is empty.
    fn set_version(header: &mut [u8], version: u8) {
        header[VERSION_OFFSET] = version;
    }

    /// Reads the little-endian bit-array length (in bytes) from `header`.
    ///
    /// Panics if `header` is too short to contain the field.
    fn get_len_bytes(header: &[u8]) -> u64 {
        let raw: [u8; 8] = header[LEN_BYTES_OFFSET..LEN_BYTES_OFFSET + 8]
            .try_into()
            .expect("length field is exactly 8 bytes");
        u64::from_le_bytes(raw)
    }

    /// Writes the bit-array length (in bytes) into `header`, little-endian.
    ///
    /// Panics if `header` is too short to contain the field.
    fn set_len_bytes(header: &mut [u8], len_bytes: u64) {
        header[LEN_BYTES_OFFSET..LEN_BYTES_OFFSET + 8].copy_from_slice(&len_bytes.to_le_bytes());
    }

    /// Reads the little-endian `k_num` field from `header`.
    ///
    /// NOTE(review): presumably the number of hash functions of the owning
    /// Bloom filter; this file only stores and validates it.
    ///
    /// Panics if `header` is too short to contain the field.
    pub fn get_k_num(header: &[u8]) -> u32 {
        let raw: [u8; 4] = header[K_NUM_OFFSET..K_NUM_OFFSET + 4]
            .try_into()
            .expect("k_num field is exactly 4 bytes");
        u32::from_le_bytes(raw)
    }

    /// Writes the `k_num` field into `header`, little-endian.
    ///
    /// Panics if `header` is too short to contain the field.
    pub fn set_k_num(header: &mut [u8], k_num: u32) {
        header[K_NUM_OFFSET..K_NUM_OFFSET + 4].copy_from_slice(&k_num.to_le_bytes());
    }

    /// Returns a copy of the 32-byte seed stored in `header`.
    ///
    /// Panics if `header` is too short to contain the field.
    pub fn get_seed(header: &[u8]) -> [u8; 32] {
        header[SEED_OFFSET..SEED_OFFSET + 32]
            .try_into()
            .expect("seed field is exactly 32 bytes")
    }

    /// Stores the 32-byte `seed` in `header`.
    ///
    /// Panics if `header` is too short to contain the field.
    pub fn set_seed(header: &mut [u8], seed: &[u8; 32]) {
        header[SEED_OFFSET..SEED_OFFSET + 32].copy_from_slice(seed);
    }

    /// Checks that `bytes` is a well-formed serialized bitmap.
    ///
    /// Shared by `from_bytes` and `from_slice` so both apply the same checks,
    /// in the same order, with the same error strings.
    fn validate(bytes: &[u8]) -> Result<(), &'static str> {
        if bytes.len() < BITMAP_HEADER_SIZE {
            return Err("Invalid size");
        }
        let (header, bits) = bytes.split_at(BITMAP_HEADER_SIZE);
        if Self::get_version(header) != VERSION {
            return Err("Version mismatch");
        }
        if Self::get_k_num(header) == 0 {
            return Err("Invalid number of keys");
        }
        // The stored length is a u64; it may not fit in usize on this target.
        let len_bytes = usize::try_from(Self::get_len_bytes(header)).map_err(|_| "Too big")?;
        if bits.len() != len_bytes {
            return Err("Invalid size");
        }
        Ok(())
    }

    /// Builds a bitmap from an owned serialized buffer without copying it.
    ///
    /// # Errors
    ///
    /// * `"Invalid size"` if the buffer is shorter than the header, or the
    ///   payload length differs from the length recorded in the header.
    /// * `"Version mismatch"` if the version byte is not `VERSION`.
    /// * `"Invalid number of keys"` if `k_num` is 0.
    /// * `"Too big"` if the recorded length does not fit in `usize`.
    pub fn from_bytes(bytes: Vec<u8>) -> Result<Self, &'static str> {
        Self::validate(&bytes)?;
        Ok(Self {
            header_and_bits: bytes,
        })
    }

    /// Builds a bitmap by copying a borrowed serialized buffer.
    ///
    /// # Errors
    ///
    /// Same conditions and messages as `from_bytes`.
    pub fn from_slice(bytes: &[u8]) -> Result<Self, &'static str> {
        Self::validate(bytes)?;
        Ok(Self {
            header_and_bits: bytes.to_vec(),
        })
    }

    /// Borrows the full serialized form (header followed by bits).
    pub fn as_slice(&self) -> &[u8] {
        &self.header_and_bits
    }

    /// Consumes the bitmap and returns the serialized form without copying.
    pub fn into_bytes(self) -> Vec<u8> {
        self.header_and_bits
    }

    /// Returns a copy of the serialized form (header followed by bits).
    pub fn to_bytes(&self) -> Vec<u8> {
        self.header_and_bits.clone()
    }

    /// Returns whether the bit at `bit_offset` is set.
    ///
    /// Bit 0 is the least significant bit of the first payload byte.
    /// Panics if `bit_offset` is past the end of the bit array.
    pub fn get(&self, bit_offset: usize) -> bool {
        let mask = 1u8 << (bit_offset % 8);
        self.bits()[bit_offset / 8] & mask != 0
    }

    /// Sets the bit at `bit_offset`.
    ///
    /// Panics if `bit_offset` is past the end of the bit array.
    pub fn set(&mut self, bit_offset: usize) {
        let mask = 1u8 << (bit_offset % 8);
        self.bits_mut()[bit_offset / 8] |= mask;
    }

    /// Resets every bit to 0; the header is left untouched.
    pub fn clear(&mut self) {
        self.bits_mut().fill(0);
    }

    /// Sets every bit to 1; the header is left untouched.
    pub fn set_all(&mut self) {
        self.bits_mut().fill(u8::MAX);
    }

    /// Returns `true` if at least one bit is set.
    pub fn any(&self) -> bool {
        self.bits().iter().any(|&byte| byte != 0)
    }

    /// Number of bits in the bit array (payload bytes times 8).
    ///
    /// Panics if that count does not fit in a `u64`.
    pub fn len_bits(&self) -> u64 {
        u64::try_from(self.bits().len())
            .unwrap()
            .checked_mul(8)
            .unwrap()
    }

    /// Passes the backing buffer through `f` and stores whatever it returns.
    ///
    /// Panics if `f` changes the buffer length or if the returned buffer no
    /// longer carries the expected `VERSION` byte.
    #[doc(hidden)]
    pub fn realloc_large_heap_allocated_objects(mut self, f: fn(Vec<u8>) -> Vec<u8>) -> Self {
        let previous_len = self.header_and_bits.len();
        self.header_and_bits = f(self.header_and_bits);
        assert_eq!(previous_len, self.header_and_bits.len());
        assert_eq!(Self::get_version(self.header()), VERSION);
        self
    }
}
Loading

0 comments on commit eccab33

Please sign in to comment.