Skip to content

Commit

Permalink
#3: switched to FixedBitSet, using bincode to perform binary encodin…
Browse files Browse the repository at this point in the history
…g and decoding
  • Loading branch information
mkviatkovskii committed Jul 6, 2024
1 parent 8367356 commit 1a6b78d
Show file tree
Hide file tree
Showing 8 changed files with 108 additions and 86 deletions.
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ repository = "https://github.com/mkviatkovskii/ringo.git"
version = "0.0.1"

[dependencies]
bitcode = "0.6.0"
bit-set = "0.6.0"
bit-vec = "0.7.0"
bincode = "2.0.0-rc.3"
fixedbitset = "0.5.7"
nom = "7.1.3"
petgraph = "0.6.5"
82 changes: 22 additions & 60 deletions src/ringo/math/similarity/tanimoto.rs
Original file line number Diff line number Diff line change
@@ -1,72 +1,34 @@
use bit_vec::BitVec;
use fixedbitset::FixedBitSet;

pub fn tanimoto_vec(a: &BitVec, b: &BitVec) -> f32 {
pub fn tanimoto_bitset(a: &FixedBitSet, b: &FixedBitSet) -> f32 {
let mut and_ = a.clone();
let mut or_ = a.clone();
and_.and(b);
or_.or(b);

let mut dividend: u32 = 0;
for b in and_.blocks() {
dividend += b.count_ones();
}
let mut divisor: u32 = 0;
for b in or_.blocks() {
divisor += b.count_ones();
}

return dividend as f32 / divisor as f32;
}

pub unsafe fn tanimoto_array(a: &[u64; 4], b: &[u64; 4]) -> f32 {
let mut dividend: u32 = 0;
let mut divisor: u32 = 0;

for i in 0..4 {
dividend += ((a[i] & b[i]) as i64).count_ones();
divisor += ((a[i] | b[i]) as i64).count_ones();
}
return dividend as f32 / divisor as f32;
and_.intersect_with(b);
return and_.count_ones(..) as f32 / (a.count_ones(..) + b.count_ones(..) - and_.count_ones(..)) as f32;
}

#[cfg(test)]
mod tests {
use bit_vec::BitVec;
use crate::ringo::math::similarity::tanimoto::{tanimoto_array, tanimoto_vec};
use fixedbitset::FixedBitSet;
use crate::ringo::math::similarity::tanimoto::{tanimoto_bitset};

#[test]
fn test_tanimoto_vec_033() {
let a: BitVec = BitVec::from_bytes(&[0b00000101]);
let b = BitVec::from_bytes(&[0b00000011]);

assert_eq!(tanimoto_vec(&a, &b), 0.33333334);
fn test_tanimoto_bitset_033() {
let mut a = FixedBitSet::with_capacity(8);
a.insert(0);
a.insert(2);
let mut b = FixedBitSet::with_capacity(8);
b.insert(0);
b.insert(1);
assert_eq!(tanimoto_bitset(&a, &b), 0.33333334);
}

#[test]
fn test_tanimoto_vec_05() {
let a: BitVec = BitVec::from_bytes(&[0b0000001]);
let b = BitVec::from_bytes(&[0b00000011]);

assert_eq!(tanimoto_vec(&a, &b), 0.5);
fn test_tanimoto_bitset_05() {
let mut a = FixedBitSet::with_capacity(8);
a.insert(0);
let mut b = FixedBitSet::with_capacity(8);
b.insert(0);
b.insert(1);
assert_eq!(tanimoto_bitset(&a, &b), 0.5);
}

#[test]
fn test_tanimoto_array_033() {
let a: [u64; 4] = [0b00000101, 0, 0, 0];
let b = [0b00000011, 0, 0, 0];

unsafe {
assert_eq!(tanimoto_array(&a, &b), 0.33333334);
}
}

#[test]
fn test_tanimoto_array_05() {
let a: [u64; 4] = [0b00000001, 0, 0, 0];
let b = [0b00000011, 0, 0, 0];

unsafe {
assert_eq!(tanimoto_array(&a, &b), 0.5);
}
}
}
}
17 changes: 8 additions & 9 deletions src/ringo/molecule/model/molecule.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
use crate::ringo::molecule::model::atom::Atom;
use crate::ringo::molecule::model::bond::Bond;
use crate::ringo::molecule::model::element::atomic_weight;
use bit_set::BitSet;
use bit_vec::BitVec;
use crate::ringo::ringo::fingerprint::Fingerprint;
use petgraph::stable_graph::{EdgeIndex, NodeIndex, StableGraph};
use petgraph::visit::EdgeRef;
use petgraph::Undirected;
use std::borrow::Borrow;
use std::collections::{BTreeSet};
use std::collections::hash_map::DefaultHasher;
use std::fmt::Debug;
use std::hash::Hasher;
use crate::ringo::math::similarity::tanimoto::tanimoto_vec;
use fixedbitset::FixedBitSet;
use crate::ringo::math::similarity::tanimoto::tanimoto_bitset;
use crate::ringo::molecule::smiles::reader::molecule::parse_molecule;

pub struct Molecule {
Expand Down Expand Up @@ -89,14 +88,14 @@ impl Molecule {
}

// TODO: move to Descriptors crate
pub fn ecfp(&self, radius: usize, fp_length: usize) -> BitVec {
let mut fp = BitSet::new();
pub fn ecfp(&self, radius: usize, fp_length: usize) -> Fingerprint {
let mut fp = FixedBitSet::new();

for node in self.graph.node_indices() {
ecfp_recursive(&self.graph, radius, 1, node, &mut fp, fp_length, &mut DefaultHasher::new());
}

BitVec::from_fn(fp_length, |idx| fp.contains(idx))
Fingerprint(fp)
}
}

Expand All @@ -105,7 +104,7 @@ fn ecfp_recursive(
radius: usize,
depth: usize,
node: NodeIndex,
fp: &mut BitSet,
fp: &mut FixedBitSet,
fp_length: usize,
hasher: &mut DefaultHasher,
) {
Expand Down Expand Up @@ -142,6 +141,6 @@ fn ecfp_recursive(
fn test_ecfp() {
let ecfp_ibuprofen = parse_molecule("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O").unwrap().1.ecfp(2, 128);
let ecfp_naproxen = parse_molecule("CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O").unwrap().1.ecfp(2, 128);
let sim = tanimoto_vec(&ecfp_ibuprofen, &ecfp_naproxen);
let sim = tanimoto_bitset(&ecfp_ibuprofen.0, &ecfp_naproxen.0);
assert!(0.53 < sim && sim < 0.54);
}
5 changes: 1 addition & 4 deletions src/ringo/molecule/smiles/reader/molecule.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,7 @@ mod tests {
println!("{}: ", smiles);
let m = parse_molecule(smiles).unwrap().1;
let result = m.ecfp(2, 512);
for bit in result {
print!("{}", if bit { 1 } else { 0 });
}
println!("");
println!("{:?}", result);
}
}
}
1 change: 1 addition & 0 deletions src/ringo/ringo.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod index;
mod search;
mod index_item;
pub(crate) mod fingerprint;
48 changes: 48 additions & 0 deletions src/ringo/ringo/fingerprint.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use bincode::de::BorrowDecoder;
use bincode::error::{DecodeError, EncodeError};
use fixedbitset::{Block, FixedBitSet};

/// Capacity, in bits, given to every bit set rebuilt by this module's decoders.
pub const FINGERPRINT_SIZE: usize = 512;

/// Newtype around a [`FixedBitSet`] so the bit set can carry this module's
/// `bincode` encode/decode implementations.
#[derive(Debug)]
pub struct Fingerprint(pub FixedBitSet);

impl bincode::Encode for Fingerprint {
    /// Writes the fingerprint as the slice of storage blocks backing the bit set.
    ///
    /// Only the blocks are written; the bit length of the set is not part of
    /// the encoded form.
    ///
    /// # Errors
    ///
    /// Returns whatever [`EncodeError`] the encoder reports while writing the slice.
    fn encode<E: bincode::enc::Encoder>(&self, encoder: &mut E) -> Result<(), EncodeError> {
        // Fully-qualified call: `bincode::Encode` is not imported by name in this
        // file, so method syntax (`.encode(..)`) on the slice would not resolve.
        bincode::Encode::encode(self.0.as_slice(), encoder)
    }
}

impl bincode::Decode for Fingerprint {
    /// Reads a vector of storage blocks and rebuilds a bit set from it.
    ///
    /// The rebuilt set always has a capacity of [`FINGERPRINT_SIZE`] bits,
    /// regardless of how many blocks were read. Per the `fixedbitset`
    /// documentation, `with_capacity_and_blocks` zero-pads a shorter block list
    /// and truncates a longer one.
    ///
    /// # Errors
    ///
    /// Returns the [`DecodeError`] produced while decoding the block vector.
    fn decode<D: bincode::de::Decoder>(decoder: &mut D) -> Result<Self, DecodeError> {
        // Fully-qualified path: `bincode::Decode` is not imported by name in this
        // file, so `Vec::<Block>::decode` would not resolve.
        let blocks = <Vec<Block> as bincode::Decode>::decode(decoder)?;
        Ok(Fingerprint(FixedBitSet::with_capacity_and_blocks(
            FINGERPRINT_SIZE,
            blocks,
        )))
    }
}

impl<'de> bincode::BorrowDecode<'de> for Fingerprint {
    /// Borrow-decoding counterpart of the owned decoder: reads a vector of
    /// storage blocks and rebuilds a bit set of [`FINGERPRINT_SIZE`] bits.
    ///
    /// Per the `fixedbitset` documentation, `with_capacity_and_blocks` zero-pads
    /// a shorter block list and truncates a longer one.
    ///
    /// # Errors
    ///
    /// Returns the [`DecodeError`] produced while decoding the block vector.
    fn borrow_decode<D: BorrowDecoder<'de>>(decoder: &mut D) -> Result<Self, DecodeError> {
        // Fully-qualified path: `bincode::BorrowDecode` is not imported by name
        // in this file, so `Vec::<Block>::borrow_decode` would not resolve.
        let blocks = <Vec<Block> as bincode::BorrowDecode<'de>>::borrow_decode(decoder)?;
        Ok(Fingerprint(FixedBitSet::with_capacity_and_blocks(
            FINGERPRINT_SIZE,
            blocks,
        )))
    }
}

#[cfg(test)]
mod tests {
    use super::{Fingerprint, FINGERPRINT_SIZE};
    use fixedbitset::FixedBitSet;

    /// Round-trips a fingerprint through bincode's standard configuration and
    /// checks that the same bits are set afterwards.
    #[test]
    fn test_fingerprint_encode_decode() {
        let mut bits = FixedBitSet::with_capacity(FINGERPRINT_SIZE);
        bits.set(1, true);
        bits.set(17, true);
        let original = Fingerprint(bits);

        let config = bincode::config::standard();
        let bytes = bincode::encode_to_vec(&original, config).unwrap();
        let (restored, _consumed): (Fingerprint, usize) =
            bincode::decode_from_slice(&bytes, config).unwrap();

        let set_bits: Vec<usize> = restored.0.ones().collect();
        assert_eq!(set_bits, vec![1, 17]);
    }
}
3 changes: 2 additions & 1 deletion src/ringo/ringo/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ fn index(smiles_file: &str) {
for line in std::io::BufReader::new(fi).lines() {
let line = line.unwrap();
let molecule = parse_molecule(&line).unwrap().1;
IndexItem::new(offset, molecule.ecfp(2, 512));
IndexItem{position: offset, fingerprint: molecule.ecfp(2, 512)};
offset += line.len() + 1;
}
}


#[test]
fn test_index() {
index("molecules.smi");
Expand Down
33 changes: 24 additions & 9 deletions src/ringo/ringo/index_item.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,30 @@
use bit_vec::BitVec;
use bincode::{Decode, Encode};
use crate::ringo::ringo::fingerprint::Fingerprint;

#[derive(Debug, Encode, Decode)]
pub struct IndexItem {
pub position: usize,
pub fingerprint: Vec<u8>
pub fingerprint: Fingerprint
}

impl IndexItem {
pub fn new(position: usize, fingerprint: BitVec) -> IndexItem {
IndexItem {
position,
fingerprint: fingerprint.to_bytes()
}
#[cfg(test)]
mod tests {
use bincode::config::standard;
use bincode::{decode_from_slice, encode_to_vec};
use fixedbitset::FixedBitSet;
use crate::ringo::ringo::index_item::IndexItem;
use crate::ringo::ringo::fingerprint::Fingerprint;

#[test]
fn test_index_item_encode_decode() {
let fp = Fingerprint(FixedBitSet::with_capacity(512));
let mut ii = IndexItem {position: 0, fingerprint: fp};
ii.position = 0;
ii.fingerprint.0.set(1, true);
ii.fingerprint.0.set(17, true);

let encoded = encode_to_vec(&ii, standard()).unwrap();
let decoded: IndexItem = decode_from_slice(&encoded, standard()).unwrap().0;
assert_eq!(decoded.fingerprint.0.ones().collect::<Vec<usize>>(), vec![1, 17]);
}
}
}

0 comments on commit 1a6b78d

Please sign in to comment.