From 1a6b78d9529f948afa5140249baff3e99325c94d Mon Sep 17 00:00:00 2001 From: Mikhail Kviatkovskii Date: Sat, 6 Jul 2024 09:57:50 +0400 Subject: [PATCH] #3: switched to FixedBitSet, using bincode to perform binary encoding and decoding --- Cargo.toml | 5 +- src/ringo/math/similarity/tanimoto.rs | 82 ++++++-------------- src/ringo/molecule/model/molecule.rs | 17 ++-- src/ringo/molecule/smiles/reader/molecule.rs | 5 +- src/ringo/ringo.rs | 1 + src/ringo/ringo/fingerprint.rs | 48 ++++++++++++ src/ringo/ringo/index.rs | 3 +- src/ringo/ringo/index_item.rs | 33 +++++--- 8 files changed, 108 insertions(+), 86 deletions(-) create mode 100644 src/ringo/ringo/fingerprint.rs diff --git a/Cargo.toml b/Cargo.toml index b4c27b3..04895de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,8 +11,7 @@ repository = "https://github.com/mkviatkovskii/ringo.git" version = "0.0.1" [dependencies] -bitcode = "0.6.0" -bit-set = "0.6.0" -bit-vec = "0.7.0" +bincode = "2.0.0-rc.3" +fixedbitset = "0.5.7" nom = "7.1.3" petgraph = "0.6.5" diff --git a/src/ringo/math/similarity/tanimoto.rs b/src/ringo/math/similarity/tanimoto.rs index 23d3a9f..a224cb6 100644 --- a/src/ringo/math/similarity/tanimoto.rs +++ b/src/ringo/math/similarity/tanimoto.rs @@ -1,72 +1,34 @@ -use bit_vec::BitVec; +use fixedbitset::FixedBitSet; -pub fn tanimoto_vec(a: &BitVec, b: &BitVec) -> f32 { +pub fn tanimoto_bitset(a: &FixedBitSet, b: &FixedBitSet) -> f32 { let mut and_ = a.clone(); - let mut or_ = a.clone(); - and_.and(b); - or_.or(b); - - let mut dividend: u32 = 0; - for b in and_.blocks() { - dividend += b.count_ones(); - } - let mut divisor: u32 = 0; - for b in or_.blocks() { - divisor += b.count_ones(); - } - - return dividend as f32 / divisor as f32; -} - -pub unsafe fn tanimoto_array(a: &[u64; 4], b: &[u64; 4]) -> f32 { - let mut dividend: u32 = 0; - let mut divisor: u32 = 0; - - for i in 0..4 { - dividend += ((a[i] & b[i]) as i64).count_ones(); - divisor += ((a[i] | b[i]) as i64).count_ones(); - } - return dividend 
as f32 / divisor as f32; + and_.intersect_with(b); + return and_.count_ones(..) as f32 / (a.count_ones(..) + b.count_ones(..) - and_.count_ones(..)) as f32; } #[cfg(test)] mod tests { - use bit_vec::BitVec; - use crate::ringo::math::similarity::tanimoto::{tanimoto_array, tanimoto_vec}; + use fixedbitset::FixedBitSet; + use crate::ringo::math::similarity::tanimoto::{tanimoto_bitset}; #[test] - fn test_tanimoto_vec_033() { - let a: BitVec = BitVec::from_bytes(&[0b00000101]); - let b = BitVec::from_bytes(&[0b00000011]); - - assert_eq!(tanimoto_vec(&a, &b), 0.33333334); + fn test_tanimoto_bitset_033() { + let mut a = FixedBitSet::with_capacity(8); + a.insert(0); + a.insert(2); + let mut b = FixedBitSet::with_capacity(8); + b.insert(0); + b.insert(1); + assert_eq!(tanimoto_bitset(&a, &b), 0.33333334); } #[test] - fn test_tanimoto_vec_05() { - let a: BitVec = BitVec::from_bytes(&[0b0000001]); - let b = BitVec::from_bytes(&[0b00000011]); - - assert_eq!(tanimoto_vec(&a, &b), 0.5); + fn test_tanimoto_bitset_05() { + let mut a = FixedBitSet::with_capacity(8); + a.insert(0); + let mut b = FixedBitSet::with_capacity(8); + b.insert(0); + b.insert(1); + assert_eq!(tanimoto_bitset(&a, &b), 0.5); } - - #[test] - fn test_tanimoto_array_033() { - let a: [u64; 4] = [0b00000101, 0, 0, 0]; - let b = [0b00000011, 0, 0, 0]; - - unsafe { - assert_eq!(tanimoto_array(&a, &b), 0.33333334); - } - } - - #[test] - fn test_tanimoto_array_05() { - let a: [u64; 4] = [0b00000001, 0, 0, 0]; - let b = [0b00000011, 0, 0, 0]; - - unsafe { - assert_eq!(tanimoto_array(&a, &b), 0.5); - } - } -} \ No newline at end of file +} diff --git a/src/ringo/molecule/model/molecule.rs b/src/ringo/molecule/model/molecule.rs index 3ac6f2c..f2ded08 100644 --- a/src/ringo/molecule/model/molecule.rs +++ b/src/ringo/molecule/model/molecule.rs @@ -1,17 +1,16 @@ use crate::ringo::molecule::model::atom::Atom; use crate::ringo::molecule::model::bond::Bond; use crate::ringo::molecule::model::element::atomic_weight; -use 
bit_set::BitSet; -use bit_vec::BitVec; +use crate::ringo::ringo::fingerprint::Fingerprint; use petgraph::stable_graph::{EdgeIndex, NodeIndex, StableGraph}; use petgraph::visit::EdgeRef; use petgraph::Undirected; use std::borrow::Borrow; use std::collections::{BTreeSet}; use std::collections::hash_map::DefaultHasher; -use std::fmt::Debug; use std::hash::Hasher; -use crate::ringo::math::similarity::tanimoto::tanimoto_vec; +use fixedbitset::FixedBitSet; +use crate::ringo::math::similarity::tanimoto::tanimoto_bitset; use crate::ringo::molecule::smiles::reader::molecule::parse_molecule; pub struct Molecule { @@ -89,14 +88,14 @@ impl Molecule { } // TODO: move to Descriptors crate - pub fn ecfp(&self, radius: usize, fp_length: usize) -> BitVec { - let mut fp = BitSet::new(); + pub fn ecfp(&self, radius: usize, fp_length: usize) -> Fingerprint { + let mut fp = FixedBitSet::new(); for node in self.graph.node_indices() { ecfp_recursive(&self.graph, radius, 1, node, &mut fp, fp_length, &mut DefaultHasher::new()); } - BitVec::from_fn(fp_length, |idx| fp.contains(idx)) + Fingerprint(fp) } } @@ -105,7 +104,7 @@ fn ecfp_recursive( radius: usize, depth: usize, node: NodeIndex, - fp: &mut BitSet, + fp: &mut FixedBitSet, fp_length: usize, hasher: &mut DefaultHasher, ) { @@ -142,6 +141,6 @@ fn ecfp_recursive( fn test_ecfp() { let ecfp_ibuprofen = parse_molecule("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O").unwrap().1.ecfp(2, 128); let ecfp_naproxen = parse_molecule("CC(C1=CC2=C(C=C1)C=C(C=C2)OC)C(=O)O").unwrap().1.ecfp(2, 128); - let sim = tanimoto_vec(&ecfp_ibuprofen, &ecfp_naproxen); + let sim = tanimoto_bitset(&ecfp_ibuprofen.0, &ecfp_naproxen.0); assert!(0.53 < sim && sim < 0.54); } diff --git a/src/ringo/molecule/smiles/reader/molecule.rs b/src/ringo/molecule/smiles/reader/molecule.rs index 55bbe32..573f446 100644 --- a/src/ringo/molecule/smiles/reader/molecule.rs +++ b/src/ringo/molecule/smiles/reader/molecule.rs @@ -388,10 +388,7 @@ mod tests { println!("{}: ", smiles); let m = 
parse_molecule(smiles).unwrap().1; let result = m.ecfp(2, 512); - for bit in result { - print!("{}", if bit { 1 } else { 0 }); - } - println!(""); + println!("{:?}", result); } } } diff --git a/src/ringo/ringo.rs b/src/ringo/ringo.rs index 0e77780..559227b 100644 --- a/src/ringo/ringo.rs +++ b/src/ringo/ringo.rs @@ -1,3 +1,4 @@ mod index; mod search; mod index_item; +pub(crate) mod fingerprint; diff --git a/src/ringo/ringo/fingerprint.rs b/src/ringo/ringo/fingerprint.rs new file mode 100644 index 0000000..55dc0d0 --- /dev/null +++ b/src/ringo/ringo/fingerprint.rs @@ -0,0 +1,48 @@ +use bincode::de::BorrowDecoder; +use bincode::error::{DecodeError, EncodeError}; +use fixedbitset::{Block, FixedBitSet}; + +pub const FINGERPRINT_SIZE: usize = 512; + +#[derive(Debug)] +pub struct Fingerprint(pub FixedBitSet); + +impl bincode::Encode for Fingerprint { + fn encode(&self, encoder: &mut E) -> Result<(), EncodeError> { + self.0.as_slice().encode(encoder)?; + Ok(()) + } +} + +impl bincode::Decode for Fingerprint { + fn decode(decoder: &mut D) -> Result { + let slice = Vec::::decode(decoder)?; + let fp = FixedBitSet::with_capacity_and_blocks(FINGERPRINT_SIZE, slice); + Ok(Fingerprint(fp)) + } +} + +impl<'de> bincode::BorrowDecode<'de> for Fingerprint { + fn borrow_decode>(decoder: &mut D) -> Result { + let slice = Vec::::borrow_decode(decoder)?; + let fp = FixedBitSet::with_capacity_and_blocks(FINGERPRINT_SIZE, slice); + Ok(Fingerprint(fp)) + } +} + +#[cfg(test)] +mod tests { + use fixedbitset::{FixedBitSet}; + use crate::ringo::ringo::fingerprint::{Fingerprint, FINGERPRINT_SIZE}; + + #[test] + fn test_fingerprint_encode_decode() { + let mut fp = Fingerprint(FixedBitSet::with_capacity(FINGERPRINT_SIZE)); + fp.0.set(1, true); + fp.0.set(17, true); + + let encoded = bincode::encode_to_vec(&fp, bincode::config::standard()).unwrap(); + let decoded: Fingerprint = bincode::decode_from_slice(&encoded, bincode::config::standard()).unwrap().0; + assert_eq!(decoded.0.ones().collect::>(), 
vec![1, 17]); + } +} diff --git a/src/ringo/ringo/index.rs b/src/ringo/ringo/index.rs index c95eab5..9efb2d3 100644 --- a/src/ringo/ringo/index.rs +++ b/src/ringo/ringo/index.rs @@ -13,11 +13,12 @@ fn index(smiles_file: &str) { for line in std::io::BufReader::new(fi).lines() { let line = line.unwrap(); let molecule = parse_molecule(&line).unwrap().1; - IndexItem::new(offset, molecule.ecfp(2, 512)); + IndexItem{position: offset, fingerprint: molecule.ecfp(2, 512)}; offset += line.len() + 1; } } + #[test] fn test_index() { index("molecules.smi"); diff --git a/src/ringo/ringo/index_item.rs b/src/ringo/ringo/index_item.rs index 06d8c90..f4c25ea 100644 --- a/src/ringo/ringo/index_item.rs +++ b/src/ringo/ringo/index_item.rs @@ -1,15 +1,30 @@ -use bit_vec::BitVec; +use bincode::{Decode, Encode}; +use crate::ringo::ringo::fingerprint::Fingerprint; +#[derive(Debug, Encode, Decode)] pub struct IndexItem { pub position: usize, - pub fingerprint: Vec + pub fingerprint: Fingerprint } -impl IndexItem { - pub fn new(position: usize, fingerprint: BitVec) -> IndexItem { - IndexItem { - position, - fingerprint: fingerprint.to_bytes() - } +#[cfg(test)] +mod tests { + use bincode::config::standard; + use bincode::{decode_from_slice, encode_to_vec}; + use fixedbitset::FixedBitSet; + use crate::ringo::ringo::index_item::IndexItem; + use crate::ringo::ringo::fingerprint::Fingerprint; + + #[test] + fn test_index_item_encode_decode() { + let fp = Fingerprint(FixedBitSet::with_capacity(512)); + let mut ii = IndexItem {position: 0, fingerprint: fp}; + ii.position = 0; + ii.fingerprint.0.set(1, true); + ii.fingerprint.0.set(17, true); + + let encoded = encode_to_vec(&ii, standard()).unwrap(); + let decoded: IndexItem = decode_from_slice(&encoded, standard()).unwrap().0; + assert_eq!(decoded.fingerprint.0.ones().collect::>(), vec![1, 17]); } -} +} \ No newline at end of file