From 203e3332ee3b12a7c267c080d75c9d537d49e218 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 23 Aug 2023 09:12:42 +0200 Subject: [PATCH 1/5] [MISC] Remove layout path from config. --- include/hibf/config.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index bdebda25..cc98ef94 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -78,9 +78,6 @@ struct config // Related to k-mers bool disable_cutoffs{false}; - //!\brief If given, no layout algorithm is esxecuted but the layout from file is used for building. - std::filesystem::path layout_file{}; - // Related to IBF // bool compressed{false}; //!\} @@ -107,7 +104,6 @@ struct config archive(CEREAL_NVP(disable_rearrangement)); archive(CEREAL_NVP(disable_cutoffs)); - archive(CEREAL_NVP(layout_file)); } }; From 7817dc23e04a8e5275cf4a5e0c080f03384ce7f6 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 23 Aug 2023 13:44:55 +0200 Subject: [PATCH 2/5] [MISC] Update and extend prefixes. 
--- include/hibf/detail/layout/layout.hpp | 4 +- include/hibf/detail/prefixes.hpp | 66 +++++++++++++++----- test/unit/hibf/detail/layout/layout_test.cpp | 6 +- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/include/hibf/detail/layout/layout.hpp b/include/hibf/detail/layout/layout.hpp index d884d237..e0b3f914 100644 --- a/include/hibf/detail/layout/layout.hpp +++ b/include/hibf/detail/layout/layout.hpp @@ -27,7 +27,7 @@ struct layout requires std::derived_from friend stream_type & operator<<(stream_type & stream, max_bin const & object) { - stream << prefix::header << prefix::merged_bin << '_'; + stream << prefix::layout_header << prefix::layout_lower_level << '_'; auto it = object.previous_TB_indices.begin(); auto end = object.previous_TB_indices.end(); // If not empty, we join with ';' @@ -37,7 +37,7 @@ struct layout while (++it != end) stream << ';' << *it; } - stream << " max_bin_id:" << object.id; + stream << " " << prefix::layout_fullest_technical_bin_idx << object.id; return stream; } diff --git a/include/hibf/detail/prefixes.hpp b/include/hibf/detail/prefixes.hpp index eff32b96..2b1fe6e8 100644 --- a/include/hibf/detail/prefixes.hpp +++ b/include/hibf/detail/prefixes.hpp @@ -6,21 +6,55 @@ namespace hibf::prefix { - -constexpr std::string_view chopper{"chopper"}; - -constexpr std::string_view header{"#"}; - -constexpr std::string_view header_config{"#"}; - -constexpr std::string_view high_level{"HIGH_LEVEL_IBF"}; - -constexpr std::string_view first_header_line{"#HIGH_LEVEL_IBF"}; -static_assert(first_header_line.starts_with(header)); -static_assert(first_header_line.ends_with(high_level)); - -constexpr std::string_view merged_bin{"MERGED_BIN"}; - -constexpr std::string_view split_bin{"SPLIT_BIN"}; +/* These prefixes are for writing the layout file + + * It is structured like this: + * + * [0) Possibly metadata added by chopper/raptor-layout] + * 1) Metadata: the hibf config + * 2) Layout header: max bin ids for the merged bins + * 3) Layout 
content: Assignment of user bin idx to technical bin idx + * + * And marked like this: + * [0) First character is @; Start and End of meta data should be marked according to (1)] + * 1) First character is @; Start and End are marked by @HIBF_CONFIG and @HIBF_CONFIG_END respectively + * 2) First character is #; + * 3) No mark, plain content. + * + * Example: + * + * ``` + * @CHOPPER_USER_BINS + * @0 /path/to/file1.fa + * @CHOPPER_USER_BINS_END + * @CHOPPER_CONFIG + * @0 k = 20 + * @CHOPPER_CONFIG_END + * + * ``` + */ + +constexpr std::string_view meta_header{"@"}; + +constexpr std::string_view meta_hibf_config_start{"@HIBF_CONFIG"}; +static_assert(meta_hibf_config_start.starts_with(meta_header)); + +constexpr std::string_view meta_hibf_config_end{"@HIBF_CONFIG_END"}; +static_assert(meta_hibf_config_end.starts_with(meta_header)); + +constexpr std::string_view layout_header{"#"}; + +constexpr std::string_view layout_top_level{"TOP_LEVEL_IBF"}; + +constexpr std::string_view layout_lower_level{"LOWER_LEVEL_IBF"}; + +constexpr std::string_view layout_fullest_technical_bin_idx{"fullest_technical_bin_idx:"}; + +constexpr std::string_view layout_first_header_line{"#TOP_LEVEL_IBF"}; +static_assert(layout_first_header_line.starts_with(layout_header)); +static_assert(layout_first_header_line.ends_with(layout_top_level)); + +constexpr std::string_view layout_column_names{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"}; +static_assert(layout_column_names.starts_with(layout_header)); } // namespace hibf::prefix diff --git a/test/unit/hibf/detail/layout/layout_test.cpp b/test/unit/hibf/detail/layout/layout_test.cpp index d3579c4d..f17b700c 100644 --- a/test/unit/hibf/detail/layout/layout_test.cpp +++ b/test/unit/hibf/detail/layout/layout_test.cpp @@ -21,9 +21,9 @@ TEST(layout_test, printing_max_bins) for (auto const & mb : layout.max_bins) ss << mb << "\n"; - std::string expected = R"mb(#MERGED_BIN_ max_bin_id:0 -#MERGED_BIN_2 max_bin_id:2 -#MERGED_BIN_1;2;3;4
max_bin_id:22 + std::string expected = R"mb(#LOWER_LEVEL_IBF_ fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 )mb"; EXPECT_EQ(ss.str(), expected); From 8817a76c8d53bb9a786551c9344570dbbd181342 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 23 Aug 2023 13:46:11 +0200 Subject: [PATCH 3/5] [FEATURE] Add config::read_from and config::write_to. --- include/hibf/config.hpp | 4 + src/CMakeLists.txt | 1 + src/config.cpp | 60 +++++++++++++++ test/unit/hibf/CMakeLists.txt | 1 + test/unit/hibf/config_test.cpp | 130 +++++++++++++++++++++++++++++++++ 5 files changed, 196 insertions(+) create mode 100644 src/config.cpp create mode 100644 test/unit/hibf/config_test.cpp diff --git a/include/hibf/config.hpp b/include/hibf/config.hpp index cc98ef94..87abd64f 100644 --- a/include/hibf/config.hpp +++ b/include/hibf/config.hpp @@ -11,6 +11,7 @@ #include // for size_t #include // for path #include // for function +#include // for ostream #include // for insert_iterator #include // for unordered_flat_set @@ -82,6 +83,9 @@ struct config // bool compressed{false}; //!\} + void read_from(std::istream & stream); + void write_to(std::ostream & stream) const; + private: friend class cereal::access; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cb361587..daa3df9a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,5 +1,6 @@ set (HIBF_SOURCE_FILES hierarchical_interleaved_bloom_filter.cpp + config.cpp detail/layout/simple_binning.cpp detail/layout/execute.cpp detail/layout/compute_fpr_correction.cpp diff --git a/src/config.cpp b/src/config.cpp new file mode 100644 index 00000000..281882ea --- /dev/null +++ b/src/config.cpp @@ -0,0 +1,60 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik 
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include + +#include + +namespace hibf +{ +//!\brief Reads the config from `stream`: skips all lines before `@HIBF_CONFIG`, strips the leading `@` of every line up to `@HIBF_CONFIG_END` and parses the remainder as JSON. +void config::read_from(std::istream & stream) +{ + std::string line; + std::stringstream config_str; + + while (std::getline(stream, line) && line != prefix::meta_hibf_config_start) + ; + + assert(line == prefix::meta_hibf_config_start); + + // TODO ##CONFIG: as prefix + while (std::getline(stream, line) && line != prefix::meta_hibf_config_end) + { + assert(line.size() >= 2); + assert(std::string_view{line}.substr(0, 1) == hibf::prefix::meta_header); + config_str << line.substr(1); // remove hibf::prefix::meta_header + } + + assert(line == prefix::meta_hibf_config_end); + + cereal::JSONInputArchive iarchive(config_str); + iarchive(*this); +} +//!\brief Writes the config as JSON to `stream`; every line is prefixed with `@` and the block is framed by `@HIBF_CONFIG` and `@HIBF_CONFIG_END`. +void config::write_to(std::ostream & stream) const +{ + // write json to a temporary string stream; the archive is scoped because it emits the last closing bracket on destruction + std::stringstream config_stream{}; + { + cereal::JSONOutputArchive output(config_stream); + output(cereal::make_nvp("hibf_config", *this)); + } // archive destroyed, the JSON document in config_stream is complete + // write config: one prefixed line per JSON line + stream << prefix::meta_hibf_config_start << '\n'; + std::string line; + while (std::getline(config_stream, line, '\n')) + stream << prefix::meta_header << line << '\n'; + stream << prefix::meta_hibf_config_end << '\n'; +} + +} // namespace hibf diff --git a/test/unit/hibf/CMakeLists.txt b/test/unit/hibf/CMakeLists.txt index 55498c5b..8b326d3d 100644 --- a/test/unit/hibf/CMakeLists.txt +++ b/test/unit/hibf/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectories () +hibf_test (config_test.cpp) hibf_test (hierarchical_interleaved_bloom_filter_test.cpp) hibf_test
(interleaved_bloom_filter_test.cpp) diff --git a/test/unit/hibf/config_test.cpp b/test/unit/hibf/config_test.cpp new file mode 100644 index 00000000..ec2eb853 --- /dev/null +++ b/test/unit/hibf/config_test.cpp @@ -0,0 +1,130 @@ +#include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult + +#include // for size_t +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... +#include // for allocator, string +#include // for operator<< +#include // for vector + +#include // for config + +TEST(config_test, write_to) +{ + std::stringstream ss{}; + + hibf::config configuration; + + configuration.number_of_user_bins = 123456789; + configuration.number_of_hash_functions = 4; + configuration.maximum_false_positive_rate = 0.0001; + configuration.threads = 31; + configuration.sketch_bits = 8; + configuration.tmax = 128; + configuration.alpha = 1.0; + configuration.max_rearrangement_ratio = 0.333; + configuration.disable_estimate_union = true; + configuration.disable_rearrangement = false; + configuration.disable_cutoffs = false; + + configuration.write_to(ss); + + std::string const expected_file{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_false_positive_rate\": 0.0001,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + EXPECT_EQ(ss.str(), expected_file); +} + +TEST(config_test, read_from) +{ + std::stringstream ss{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_false_positive_rate\": 0.0001,\n" + "@ \"threads\": 31,\n" 
+ "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_false_positive_rate, 0.0001); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); + EXPECT_EQ(configuration.disable_cutoffs, false); +} + +TEST(config_test, read_from_with_more_meta) +{ + std::stringstream ss{"@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@blah some chopper stuff\n" + "@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 123456789,\n" + "@ \"number_of_hash_functions\": 4,\n" + "@ \"maximum_false_positive_rate\": 0.0001,\n" + "@ \"threads\": 31,\n" + "@ \"sketch_bits\": 8,\n" + "@ \"tmax\": 128,\n" + "@ \"alpha\": 1.0,\n" + "@ \"max_rearrangement_ratio\": 0.333,\n" + "@ \"disable_estimate_union\": true,\n" + "@ \"disable_rearrangement\": false,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n"}; + + hibf::config configuration; + configuration.read_from(ss); + + EXPECT_EQ(configuration.number_of_user_bins, 123456789); + EXPECT_EQ(configuration.number_of_hash_functions, 4); + EXPECT_EQ(configuration.maximum_false_positive_rate, 0.0001); + EXPECT_EQ(configuration.threads, 31); + EXPECT_EQ(configuration.sketch_bits, 8); + 
EXPECT_EQ(configuration.tmax, 128); + EXPECT_EQ(configuration.alpha, 1.0); + EXPECT_EQ(configuration.max_rearrangement_ratio, 0.333); + EXPECT_EQ(configuration.disable_estimate_union, true); + EXPECT_EQ(configuration.disable_rearrangement, false); + EXPECT_EQ(configuration.disable_cutoffs, false); +} From 3a33388cbb57bfe033bb72cecca8835b2ff0e9a8 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 23 Aug 2023 13:46:53 +0200 Subject: [PATCH 4/5] [FEATURE] Add layout::read_from and layout::write_to. --- include/hibf/detail/layout/layout.hpp | 3 + src/CMakeLists.txt | 1 + src/detail/layout/layout.cpp | 144 +++++++++++++++++++ test/unit/hibf/detail/layout/layout_test.cpp | 54 +++++++ 4 files changed, 202 insertions(+) create mode 100644 src/detail/layout/layout.cpp diff --git a/include/hibf/detail/layout/layout.hpp b/include/hibf/detail/layout/layout.hpp index e0b3f914..1f3dbc30 100644 --- a/include/hibf/detail/layout/layout.hpp +++ b/include/hibf/detail/layout/layout.hpp @@ -86,6 +86,9 @@ struct layout } }; + void read_from(std::istream & stream); + void write_to(std::ostream & stream) const; + size_t top_level_max_bin_id{}; std::vector max_bins{}; std::vector user_bins{}; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index daa3df9a..a1f76fab 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,6 +2,7 @@ set (HIBF_SOURCE_FILES hierarchical_interleaved_bloom_filter.cpp config.cpp detail/layout/simple_binning.cpp + detail/layout/layout.cpp detail/layout/execute.cpp detail/layout/compute_fpr_correction.cpp detail/layout/compute_layout.cpp diff --git a/src/detail/layout/layout.cpp b/src/detail/layout/layout.cpp new file mode 100644 index 00000000..19926bb6 --- /dev/null +++ b/src/detail/layout/layout.cpp @@ -0,0 +1,144 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für 
molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include +#include + +namespace hibf::layout +{ +//!\brief Parses one layout record `IDX <tab> TB;TB;... <tab> N;N;...`; the last technical bin index becomes storage_TB_id, the preceding ones previous_TB_indices. +hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line) +{ + hibf::layout::layout::user_bin result{}; + + size_t tmp{}; // integer buffer when reading numbers + + // initialize parsing + std::string_view const buffer{current_line}; + auto const buffer_end{buffer.end()}; + auto field_end = buffer.begin(); + assert(field_end != buffer_end); + + // read user bin index + field_end = std::from_chars(field_end, buffer_end, tmp).ptr; + result.idx = tmp; + assert(field_end != buffer_end && *field_end == '\t'); + + do // read bin_indices + { + ++field_end; // skip tab or ; + assert(field_end != buffer_end && *field_end != '\t'); + field_end = std::from_chars(field_end, buffer_end, tmp).ptr; + result.previous_TB_indices.push_back(tmp); + } + while (field_end != buffer_end && *field_end != '\t'); + + result.storage_TB_id = result.previous_TB_indices.back(); + result.previous_TB_indices.pop_back(); + if (field_end != buffer_end) // a record without the third column keeps the default instead of reading past the end + do // read number of technical bins + { + ++field_end; // skip tab or ; + field_end = std::from_chars(field_end, buffer_end, tmp).ptr; + result.number_of_technical_bins = tmp; // only the last number really counts + } + while (field_end != buffer_end && *field_end != '\t'); + + return result; +} +//!\brief Reads the layout header (`#TOP_LEVEL_IBF`, `#LOWER_LEVEL_IBF_*`, column names) and all user bin records from `stream`, appending to max_bins and user_bins. +void hibf::layout::layout::read_from(std::istream & stream) +{ + // parse header + auto parse_bin_indices = [](std::string_view const & buffer) + { + std::vector result; + + auto buffer_start = buffer.data(); // data() is valid on an empty view (e.g. `#LOWER_LEVEL_IBF_ ...`), operator[] is not + auto const buffer_end = buffer_start + buffer.size(); + + size_t tmp{}; + + while (buffer_start < buffer_end) + { + buffer_start =
std::from_chars(buffer_start, buffer_end, tmp).ptr; + ++buffer_start; // skip ; + result.push_back(tmp); + } + + return result; + }; + + auto parse_first_bin = [](std::string_view const & buffer) + { + size_t tmp{}; + std::from_chars(buffer.data(), buffer.data() + buffer.size(), tmp); // data() instead of &buffer[0]: safe on an empty view + return tmp; + }; + + std::string line; + + std::getline(stream, line); // get first line that is always the max bin index of the top level bin + assert(line.starts_with(prefix::layout_first_header_line)); + + // parse High Level max bin index + constexpr size_t fullest_tbx_prefix_size = prefix::layout_fullest_technical_bin_idx.size(); + assert(line.substr(prefix::layout_top_level.size() + 2, fullest_tbx_prefix_size) + == prefix::layout_fullest_technical_bin_idx); + std::string_view const hibf_max_bin_str{line.begin() + prefix::layout_top_level.size() + 2 + + fullest_tbx_prefix_size, + line.end()}; + top_level_max_bin_id = parse_first_bin(hibf_max_bin_str); + + // read and parse the lower level header records until the column names line is reached + while (std::getline(stream, line) && line != prefix::layout_column_names) + { + assert(line.substr(1, prefix::layout_lower_level.size()) == prefix::layout_lower_level); + + // parse header line + std::string_view const indices_str{ + line.begin() + 1 /*#*/ + prefix::layout_lower_level.size() + 1 /*_*/, + std::find(line.begin() + prefix::layout_lower_level.size() + 2, line.end(), ' ')}; + + assert(line.substr(prefix::layout_lower_level.size() + indices_str.size() + 3, fullest_tbx_prefix_size) + == prefix::layout_fullest_technical_bin_idx); + std::string_view const max_id_str{line.begin() + prefix::layout_lower_level.size() + indices_str.size() + + fullest_tbx_prefix_size + 3, + line.end()}; + + max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str)); + } + + assert(line == prefix::layout_column_names); + + // parse the rest of the file + while (std::getline(stream, line)) +
user_bins.emplace_back(parse_layout_line(line)); +} + +void hibf::layout::layout::write_to(std::ostream & stream) const +{ + // write layout header with max bin ids + stream << prefix::layout_first_header_line << " " << prefix::layout_fullest_technical_bin_idx + << top_level_max_bin_id << '\n'; + for (auto const & max_bin : max_bins) + stream << max_bin << '\n'; + + // write header line + stream << prefix::layout_column_names << '\n'; + + // write layout entries + for (auto const & user_bin : user_bins) + stream << user_bin << '\n'; +} + +} // namespace hibf::layout diff --git a/test/unit/hibf/detail/layout/layout_test.cpp b/test/unit/hibf/detail/layout/layout_test.cpp index f17b700c..3bfa98d6 100644 --- a/test/unit/hibf/detail/layout/layout_test.cpp +++ b/test/unit/hibf/detail/layout/layout_test.cpp @@ -7,6 +7,7 @@ #include // for vector #include // for layout, operator<< +#include // for expect_range_eq, EXPECT_RANGE_EQ TEST(layout_test, printing_max_bins) { @@ -49,3 +50,56 @@ TEST(layout_test, printing_user_bins) EXPECT_EQ(ss.str(), expected); } + +TEST(layout_test, write_to) +{ + std::stringstream ss{}; + + hibf::layout::layout layout; + + layout.top_level_max_bin_id = 111; + layout.max_bins.emplace_back(std::vector{0}, 0); + layout.max_bins.emplace_back(std::vector{2}, 2); + layout.max_bins.emplace_back(std::vector{1, 2, 3, 4}, 22); + layout.user_bins.emplace_back(7, std::vector{}, 1, 0); + layout.user_bins.emplace_back(4, std::vector{1}, 22, 0); + layout.user_bins.emplace_back(5, std::vector{1, 2, 3, 4}, 21, 22); + + layout.write_to(ss); + + std::string expected = R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +)layout_file"; + + EXPECT_EQ(ss.str(), expected); +} + +TEST(layout_test, read_from) 
+{ + std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111 +#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0 +#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2 +#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22 +#USER_BIN_IDX TECHNICAL_BIN_INDICES NUMBER_OF_TECHNICAL_BINS +7 0 1 +4 1;0 1;22 +5 1;2;3;4;22 1;1;1;1;21 +)layout_file"}; + + hibf::layout::layout layout; + layout.read_from(ss); + + EXPECT_EQ(layout.top_level_max_bin_id, 111); + EXPECT_EQ(layout.max_bins[0], (hibf::layout::layout::max_bin{{0}, 0})); + EXPECT_EQ(layout.max_bins[1], (hibf::layout::layout::max_bin{{2}, 2})); + EXPECT_EQ(layout.max_bins[2], (hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22})); + EXPECT_EQ(layout.user_bins[0], (hibf::layout::layout::user_bin{7, std::vector{}, 1, 0})); + EXPECT_EQ(layout.user_bins[1], (hibf::layout::layout::user_bin{4, std::vector{1}, 22, 0})); + EXPECT_EQ(layout.user_bins[2], (hibf::layout::layout::user_bin{5, std::vector{1, 2, 3, 4}, 21, 22})); +} From a2fa58b09e746a5abb9ac8ea108b14475fea61f2 Mon Sep 17 00:00:00 2001 From: Svenja Mehringer Date: Wed, 23 Aug 2023 13:54:25 +0200 Subject: [PATCH 5/5] [FEATURE] Make hibf constructible from a layout file. --- .../hierarchical_interleaved_bloom_filter.hpp | 10 ++++ src/hierarchical_interleaved_bloom_filter.cpp | 15 ++++++ ...archical_interleaved_bloom_filter_test.cpp | 49 ++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/hibf/hierarchical_interleaved_bloom_filter.hpp b/include/hibf/hierarchical_interleaved_bloom_filter.hpp index 5f026707..cc23275e 100644 --- a/include/hibf/hierarchical_interleaved_bloom_filter.hpp +++ b/include/hibf/hierarchical_interleaved_bloom_filter.hpp @@ -105,6 +105,16 @@ class hierarchical_interleaved_bloom_filter ~hierarchical_interleaved_bloom_filter() = default; //!< Defaulted. 
hierarchical_interleaved_bloom_filter(config const & configuration); + + /*!\brief [Advanced] Constructs the HIBF from a layout file (stream) and a given input function + * \details + * This constructor makes it possible to construct an hibf from a given layout file instead of calculating the + * layout based on the input function. A hibf::config object is not needed as it is assumed to be stored in the + * layout file. A layout file can be constructed manually or via chopper (https://github.com/seqan/chopper) + * or raptor-layout (https://github.com/seqan/raptor). + */ + hierarchical_interleaved_bloom_filter(std::function input_fn, + std::istream & layout_stream); //!\} //!\brief The individual interleaved Bloom filters. diff --git a/src/hierarchical_interleaved_bloom_filter.cpp b/src/hierarchical_interleaved_bloom_filter.cpp index 8545010c..c5fa3772 100644 --- a/src/hierarchical_interleaved_bloom_filter.cpp +++ b/src/hierarchical_interleaved_bloom_filter.cpp @@ -199,4 +199,19 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con build_index(*this, configuration, layout); } +hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter( + std::function input_fn, + std::istream & layout_stream) +{ + // read config and layout from file + config configuration; + layout::layout hibf_layout; + configuration.read_from(layout_stream); + hibf_layout.read_from(layout_stream); + + configuration.input_fn = input_fn; // set input as it cannot be serialized. 
+ + build_index(*this, configuration, hibf_layout); +} + } // namespace hibf diff --git a/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp b/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp index 0638382c..2987de0f 100644 --- a/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp +++ b/test/unit/hibf/hierarchical_interleaved_bloom_filter_test.cpp @@ -9,7 +9,8 @@ #include // for size_t #include // for function -#include // for vector, allocator +#include +#include // for vector, allocator #include // for insert_iterator, config #include // for hierarchical_interleaved_bloom_filter @@ -41,6 +42,52 @@ TEST(hibf_test, test_specific_hash_values) } } +TEST(hibf_test, build_from_layout) +{ + // range of range of sequences + std::vector> hashes{{1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u}, {1u, 2u, 3u, 4u, 5u}}; + + auto input_fn = [&](size_t const num, hibf::insert_iterator it) + { + for (auto const hash : hashes[num]) + it = hash; + }; + + std::stringstream stream{"@HIBF_CONFIG\n" + "@{\n" + "@ \"hibf_config\": {\n" + "@ \"version\": 1,\n" + "@ \"number_of_user_bins\": 2,\n" + "@ \"number_of_hash_functions\": 2,\n" + "@ \"maximum_false_positive_rate\": 0.05,\n" + "@ \"threads\": 1,\n" + "@ \"sketch_bits\": 12,\n" + "@ \"tmax\": 64,\n" + "@ \"alpha\": 1.2,\n" + "@ \"max_rearrangement_ratio\": 0.5,\n" + "@ \"disable_estimate_union\": false,\n" + "@ \"disable_rearrangement\": true,\n" + "@ \"disable_cutoffs\": false\n" + "@ }\n" + "@}\n" + "@HIBF_CONFIG_END\n" + "#TOP_LEVEL_IBF fullest_technical_bin_idx:0\n" + "#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS\n" + "1\t0\t34\n" + "0\t34\t30\n"}; + + hibf::hierarchical_interleaved_bloom_filter hibf{input_fn, stream}; + + { + std::vector query{1, 2, 3, 4, 5}; + + auto agent = hibf.membership_agent(); + auto result = agent.bulk_contains(query, 2); + + EXPECT_RANGE_EQ(result, (std::vector{0u, 1u})); + } +} + // #ifdef HIBF_HAS_SEQAN3 // #include