From 320f2c8425f878c1936bcc6acc2c3e937505be4f Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 13 Sep 2023 14:43:44 +0200 Subject: [PATCH] [DOC] More documentation for HIBF and config. (#104) * [DOC] update HIBF documentation. * [DOC] Update config documentation. * [MISC] automatic linting * Update include/hibf/config.hpp * Update include/hibf/config.hpp * Apply suggestions from code review * Apply suggestions from code review * [MISC] automatic linting * Update test/snippet/hibf/hierarchical_interleaved_bloom_filter.cpp * [MISC] automatic linting * Update hierarchical_interleaved_bloom_filter.cpp * Create hierarchical_interleaved_bloom_filter.out * Apply suggestions from code review --------- Co-authored-by: seqan-actions[bot] Co-authored-by: Enrico Seiler --- include/hibf/config.hpp | 72 ++++--------- .../hierarchical_interleaved_bloom_filter.hpp | 101 ++++++++++++------ test/snippet/hibf/hibf_construction.cpp | 30 ++++++ .../hierarchical_interleaved_bloom_filter.cpp | 46 ++++++++ .../hierarchical_interleaved_bloom_filter.out | 3 + 5 files changed, 170 insertions(+), 82 deletions(-) create mode 100644 test/snippet/hibf/hibf_construction.cpp create mode 100644 test/snippet/hibf/hierarchical_interleaved_bloom_filter.cpp create mode 100644 test/snippet/hibf/hierarchical_interleaved_bloom_filter.out diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index a6063e8e..8cd08e56 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -26,15 +26,33 @@ namespace seqan::hibf using insert_iterator = std::insert_iterator>; /*!\brief The configuration used to build an (H)IBF + * + * # The (H)IBF config * * The configuration can be used to construct an HIBF or IBF. * * When constructing an IBF, only the members `General Configuration` are considered, layout parameters from * the section `HIBF Layout Configuration` are ignored. * - * \note If an option is marked [REQUIRED], an error will be thrown on (H)IBF - * construction if it is not set. 
An option is marked RECOMMENDED_TO_ADAPT if we give a sensible default - * but still recommend from experience that it is worth thinking about adjusting it to your data. + * Here is the list of all config options: + * + * | Type | Option Name | Default | Note | + * |:--------|:-------------------------------------------------|:-------:|:-----------------------| + * | General | seqan::hibf::config::input_fn | - | [REQUIRED] | + * | General | seqan::hibf::config::number_of_user_bins | - | [REQUIRED] | + * | General | seqan::hibf::config::number_of_hash_functions | 2 | | + * | General | seqan::hibf::config::maximum_false_positive_rate | 0.05 | [RECOMMENDED_TO_ADAPT] | + * | General | seqan::hibf::config::threads | 1 | [RECOMMENDED_TO_ADAPT] | + * | Layout | seqan::hibf::config::sketch_bits | 12 | | + * | Layout | seqan::hibf::config::tmax | 0 | 0 indicates unset | + * | Layout | seqan::hibf::config::max_rearrangement_ratio | 0.5 | | + * | Layout | seqan::hibf::config::alpha | 1.2 | | + * | Layout | seqan::hibf::config::disable_estimate_union | false | | + * | Layout | seqan::hibf::config::disable_rearrangement | false | | + * + * As a copy and paste source, here are all config options with their defaults: + * + * \include test/snippet/hibf/hibf_construction.cpp * * ## The HIBF takes too long to construct? * @@ -51,54 +69,6 @@ using insert_iterator = std::insert_iterator= t`). + * + * For all practical applications it is recommended to research sensible thresholds based on the data, the false + * positive rate, the length of the query and whether (canonical) k-mers, minimizers, syncmers, etc. were used for + * hashing genomic content. 
+ * + * ## Counting Queries with the HIBF * * To count the occurrences in each user bin of a range of values in the Hierarchical Interleaved Bloom Filter, call * seqan::hibf::hierarchical_interleaved_bloom_filter::counting_agent() and use @@ -81,7 +91,36 @@ namespace seqan::hibf * calls to `const` member functions are safe from multiple threads (as long as no thread calls * a non-`const` member function at the same time). * - * [1]: https://docs.seqan.de/seqan/3.0.3/classseqan3_1_1interleaved__bloom__filter.html + * # Details on the data structure + * + * The following gives some insights about the general design of the HIBF data structure. More details can be found + * in the publication: https://doi.org/10.1186/s13059-023-02971-4 + * + * ## Terminology + * + * ### Technical Bin + * A Technical Bin represents an actual bin in the binning directory. In the IBF, it stores its kmers in a single Bloom + * Filter (which is interleaved with all the other BFs). + * + * ### User Bin + * The user may impose a structure on his sequence data in the form of logical groups (e.g. species). When querying the + * IBF, the user is interested in an answer that differentiates between these groups. + * + * ## Hierarchical Interleaved Bloom Filter (HIBF) + * + * In contrast to the [seqan::hibf::interleaved_bloom_filter][1], the user bins may be split across multiple technical + * bins, or multiple user bins may be merged into one technical bin. When merging multiple user bins, the HIBF stores + * another IBF that is built over the user bins constituting the merged bin. This lower-level IBF can then be used + * to further distinguish between merged bins. + * + * In this example, user bin 1 was split into two technical bins. Bins 3, 4, and 5 were merged into a single technical + * bin, and another IBF was added for the merged bin. 
+ * \image html hibf.svg width=90% + * + * The individual IBFs may have a different number of technical bins and differ in their sizes, allowing an efficient + * distribution of the user bins. + * + * \see [seqan::hibf::interleaved_bloom_filter][1] */ class hierarchical_interleaved_bloom_filter { diff --git a/test/snippet/hibf/hibf_construction.cpp b/test/snippet/hibf/hibf_construction.cpp new file mode 100644 index 00000000..4679639d --- /dev/null +++ b/test/snippet/hibf/hibf_construction.cpp @@ -0,0 +1,30 @@ +#include // for insert_iterator, config +#include // for hierarchical_interleaved_bloom_filter + +int main() +{ + // 2 user bins: + std::vector> hashes{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u}, {1u, 2u, 3u, 4u, 5u}}; + + // input just passes hashes: + auto my_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + for (auto const hash : hashes[user_bin_id]) + it = hash; + }; + + seqan::hibf::config config{.input_fn = my_input, // required + .number_of_user_bins = 2, // required + .number_of_hash_functions = 2, + .maximum_false_positive_rate = 0.05, // recommended to adapt + .threads = 1, // recommended to adapt + .sketch_bits = 12, + .tmax = 0, // triggers default computation + .alpha = 1.2, + .max_rearrangement_ratio = 0.5, + .disable_estimate_union = false, + .disable_rearrangement = false}; + + // construct the HIBF + seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; +} diff --git a/test/snippet/hibf/hierarchical_interleaved_bloom_filter.cpp b/test/snippet/hibf/hierarchical_interleaved_bloom_filter.cpp new file mode 100644 index 00000000..b0392640 --- /dev/null +++ b/test/snippet/hibf/hierarchical_interleaved_bloom_filter.cpp @@ -0,0 +1,46 @@ +#include // for insert_iterator, config +#include // for hierarchical_interleaved_bloom_filter + +void print(std::vector const & vector) +{ + std::cout << '['; + + if (!vector.empty()) + { + for (size_t i = 0u; i < vector.size() - 1u; ++i) + std::cout << vector[i] << ','; + 
std::cout << vector.back(); + } + + std::cout << "]\n"; +} + +int main() +{ + // 2 user bins: + std::vector> hashes{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u}, {1u, 2u, 3u, 4u, 5u}}; + + // input just passes hashes: + auto my_input = [&](size_t const user_bin_id, seqan::hibf::insert_iterator it) + { + for (auto const hash : hashes[user_bin_id]) + it = hash; + }; + + seqan::hibf::config config{.input_fn = my_input, .number_of_user_bins = 2}; + + // construct the HIBF + seqan::hibf::hierarchical_interleaved_bloom_filter hibf{config}; + + // query the HIBF + std::vector query{1u, 2u, 3u}; + std::vector query2{8u, 9u, 10u}; + + auto agent = hibf.membership_agent(); // you need an agent for efficient queries + auto & result = agent.membership_for(query, 2u); // both user bins have hashes 1,2,3 + print(result); // [1,0] + agent.sort_results(); // Results can also be sorted + print(result); // [0,1] + auto & result2 = agent.membership_for(query2, 2u); // only user bin 0 has hashes 8,9,10 + print(result2); // [0] +} diff --git a/test/snippet/hibf/hierarchical_interleaved_bloom_filter.out b/test/snippet/hibf/hierarchical_interleaved_bloom_filter.out new file mode 100644 index 00000000..60e48db9 --- /dev/null +++ b/test/snippet/hibf/hierarchical_interleaved_bloom_filter.out @@ -0,0 +1,3 @@ +[1,0] +[0,1] +[0]