From 3a33388cbb57bfe033bb72cecca8835b2ff0e9a8 Mon Sep 17 00:00:00 2001
From: Svenja Mehringer
Date: Wed, 23 Aug 2023 13:46:53 +0200
Subject: [PATCH] [FEATURE] Add layout::read_from and layout::write_to.

---
Review notes (this section is ignored when the patch is applied):
 * Everything between angle brackets (include targets, template arguments, author e-mail) was lost when
   this patch was extracted. Template arguments are restored from the surrounding code. The project include
   paths and the CMake indentation are best guesses and must be confirmed against the tree.
 * src/detail/layout/layout.cpp differs from the originally committed blob (pointer-based from_chars calls,
   data() instead of &buffer[0], blank record lines are skipped, documentation), so the blob hash in its
   index line is stale.

 include/hibf/detail/layout/layout.hpp        |   3 +
 src/CMakeLists.txt                           |   1 +
 src/detail/layout/layout.cpp                 | 175 +++++++++++++++++++
 test/unit/hibf/detail/layout/layout_test.cpp |  54 ++++++
 4 files changed, 233 insertions(+)
 create mode 100644 src/detail/layout/layout.cpp

diff --git a/include/hibf/detail/layout/layout.hpp b/include/hibf/detail/layout/layout.hpp
index e0b3f914..1f3dbc30 100644
--- a/include/hibf/detail/layout/layout.hpp
+++ b/include/hibf/detail/layout/layout.hpp
@@ -86,6 +86,9 @@ struct layout
         }
     };
 
+    void read_from(std::istream & stream);
+    void write_to(std::ostream & stream) const;
+
     size_t top_level_max_bin_id{};
     std::vector<max_bin> max_bins{};
     std::vector<user_bin> user_bins{};
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index daa3df9a..a1f76fab 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,6 +2,7 @@ set (HIBF_SOURCE_FILES
      hierarchical_interleaved_bloom_filter.cpp
      config.cpp
      detail/layout/simple_binning.cpp
+     detail/layout/layout.cpp
      detail/layout/execute.cpp
      detail/layout/compute_fpr_correction.cpp
      detail/layout/compute_layout.cpp
diff --git a/src/detail/layout/layout.cpp b/src/detail/layout/layout.cpp
new file mode 100644
index 00000000..19926bb6
--- /dev/null
+++ b/src/detail/layout/layout.cpp
@@ -0,0 +1,175 @@
+// ---------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
+// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
+// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
+// ---------------------------------------------------------------------------------------------------
+
+#include <algorithm>   // for find
+#include <cassert>     // for assert
+#include <charconv>    // for from_chars
+#include <cstddef>     // for size_t
+#include <iostream>    // for istream, ostream, getline
+#include <string>      // for string
+#include <string_view> // for string_view
+#include <vector>      // for vector
+
+// NOTE(review): the include targets were lost when this patch was extracted. The two headers below are assumed to
+// provide `layout` and the `prefix::` constants -- confirm the paths against the tree.
+#include <hibf/detail/layout/layout.hpp> // for layout
+#include <hibf/detail/prefixes.hpp>      // for prefix::layout_*
+
+namespace hibf::layout
+{
+
+/*!\brief Parses one user bin record of a layout file.
+ * \param[in] current_line One line of the form `IDX<tab>TB[;TB...]<tab>N[;N...]`, without the line break.
+ * \returns The user bin described by the line.
+ * \details The last entry of the second column becomes `storage_TB_id`, all preceding entries become
+ *          `previous_TB_indices`. Of the third column only the last entry is kept (`number_of_technical_bins`).
+ *          The format is only checked via `assert`.
+ */
+hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line)
+{
+    hibf::layout::layout::user_bin result{};
+
+    size_t tmp{}; // integer buffer when reading numbers
+
+    // initialize parsing
+    // std::from_chars needs `char const *`; string_view iterators are not raw pointers on every platform.
+    char const * field_end = current_line.data();
+    char const * const buffer_end = field_end + current_line.size();
+    assert(field_end != buffer_end);
+
+    // read user bin index
+    field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
+    result.idx = tmp;
+    assert(field_end != buffer_end && *field_end == '\t');
+
+    do // read bin_indices
+    {
+        ++field_end; // skip tab or ;
+        assert(field_end != buffer_end && *field_end != '\t');
+        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
+        result.previous_TB_indices.push_back(tmp);
+    }
+    while (field_end != buffer_end && *field_end != '\t');
+
+    result.storage_TB_id = result.previous_TB_indices.back();
+    result.previous_TB_indices.pop_back();
+
+    assert(field_end != buffer_end); // the column with the number of technical bins must follow
+
+    do // read number of technical bins
+    {
+        ++field_end; // skip tab or ;
+        field_end = std::from_chars(field_end, buffer_end, tmp).ptr;
+        result.number_of_technical_bins = tmp; // only the last number really counts
+    }
+    while (field_end != buffer_end && *field_end != '\t');
+
+    return result;
+}
+
+/*!\brief Reads a layout file from `stream`.
+ * \param[in,out] stream The stream to read from; its first line must be the top level header line.
+ * \details Sets `top_level_max_bin_id` and appends to `max_bins` and `user_bins` (they are not cleared first).
+ *          Empty lines in the user bin section are skipped. The header format is only checked via `assert`.
+ */
+void hibf::layout::layout::read_from(std::istream & stream)
+{
+    // parse header
+    auto parse_bin_indices = [](std::string_view const & buffer)
+    {
+        std::vector<size_t> result;
+
+        // data() is valid for an empty view, `&buffer[0]` is not
+        auto buffer_start = buffer.data();
+        auto const buffer_end = buffer_start + buffer.size();
+
+        size_t tmp{};
+
+        while (buffer_start < buffer_end)
+        {
+            buffer_start = std::from_chars(buffer_start, buffer_end, tmp).ptr;
+            ++buffer_start; // skip ;
+            result.push_back(tmp);
+        }
+
+        return result;
+    };
+
+    auto parse_first_bin = [](std::string_view const & buffer)
+    {
+        size_t tmp{};
+        std::from_chars(buffer.data(), buffer.data() + buffer.size(), tmp);
+        return tmp;
+    };
+
+    std::string line;
+
+    std::getline(stream, line); // get first line that is always the max bin index of the top level bin
+    assert(line.starts_with(prefix::layout_first_header_line));
+
+    // parse High Level max bin index
+    constexpr size_t fullest_tbx_prefix_size = prefix::layout_fullest_technical_bin_idx.size();
+    assert(line.substr(prefix::layout_top_level.size() + 2, fullest_tbx_prefix_size)
+           == prefix::layout_fullest_technical_bin_idx);
+    std::string_view const hibf_max_bin_str{line.begin() + prefix::layout_top_level.size() + 2
+                                                + fullest_tbx_prefix_size,
+                                            line.end()};
+    top_level_max_bin_id = parse_first_bin(hibf_max_bin_str);
+
+    // read and parse the lower level header records until the line with the column names is reached
+    while (std::getline(stream, line) && line != prefix::layout_column_names)
+    {
+        assert(line.substr(1, prefix::layout_lower_level.size()) == prefix::layout_lower_level);
+
+        // parse header line
+        std::string_view const indices_str{
+            line.begin() + 1 /*#*/ + prefix::layout_lower_level.size() + 1 /*_*/,
+            std::find(line.begin() + prefix::layout_lower_level.size() + 2, line.end(), ' ')};
+
+        assert(line.substr(prefix::layout_lower_level.size() + indices_str.size() + 3, fullest_tbx_prefix_size)
+               == prefix::layout_fullest_technical_bin_idx);
+        std::string_view const max_id_str{line.begin() + prefix::layout_lower_level.size() + indices_str.size()
+                                              + fullest_tbx_prefix_size + 3,
+                                          line.end()};
+
+        max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
+    }
+
+    assert(line == prefix::layout_column_names);
+
+    // parse the rest of the file
+    while (std::getline(stream, line))
+    {
+        if (line.empty()) // tolerate blank lines, e.g. at the end of the file
+            continue;
+
+        user_bins.emplace_back(parse_layout_line(line));
+    }
+}
+
+/*!\brief Writes the layout to `stream` in the format that read_from() parses.
+ * \param[in,out] stream The stream to write to.
+ * \details Writes the top level header line, one header line per entry of `max_bins`, the line with the
+ *          column names and one line per entry of `user_bins`.
+ */
+void hibf::layout::layout::write_to(std::ostream & stream) const
+{
+    // write layout header with max bin ids
+    stream << prefix::layout_first_header_line << " " << prefix::layout_fullest_technical_bin_idx
+           << top_level_max_bin_id << '\n';
+    for (auto const & max_bin : max_bins)
+        stream << max_bin << '\n';
+
+    // write header line
+    stream << prefix::layout_column_names << '\n';
+
+    // write layout entries
+    for (auto const & user_bin : user_bins)
+        stream << user_bin << '\n';
+}
+
+} // namespace hibf::layout
diff --git a/test/unit/hibf/detail/layout/layout_test.cpp b/test/unit/hibf/detail/layout/layout_test.cpp
index f17b700c..3bfa98d6 100644
--- a/test/unit/hibf/detail/layout/layout_test.cpp
+++ b/test/unit/hibf/detail/layout/layout_test.cpp
@@ -7,6 +7,7 @@
 #include <vector> // for vector
 
 #include <hibf/detail/layout/layout.hpp> // for layout, operator<<
+#include <hibf/test/expect_range_eq.hpp> // for expect_range_eq, EXPECT_RANGE_EQ
 
 TEST(layout_test, printing_max_bins)
 {
@@ -49,3 +50,56 @@ TEST(layout_test, printing_user_bins)
 
     EXPECT_EQ(ss.str(), expected);
 }
+
+TEST(layout_test, write_to)
+{
+    std::stringstream ss{};
+
+    hibf::layout::layout layout;
+
+    layout.top_level_max_bin_id = 111;
+    layout.max_bins.emplace_back(std::vector<size_t>{0}, 0);
+    layout.max_bins.emplace_back(std::vector<size_t>{2}, 2);
+    layout.max_bins.emplace_back(std::vector<size_t>{1, 2, 3, 4}, 22);
+    layout.user_bins.emplace_back(7, std::vector<size_t>{}, 1, 0);
+    layout.user_bins.emplace_back(4, std::vector<size_t>{1}, 22, 0);
+    layout.user_bins.emplace_back(5, std::vector<size_t>{1, 2, 3, 4}, 21, 22);
+
+    layout.write_to(ss);
+
+    std::string expected = R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
+#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
+#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
+#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
+#USER_BIN_IDX	TECHNICAL_BIN_INDICES	NUMBER_OF_TECHNICAL_BINS
+7	0	1
+4	1;0	1;22
+5	1;2;3;4;22	1;1;1;1;21
+)layout_file";
+
+    EXPECT_EQ(ss.str(), expected);
+}
+
+TEST(layout_test, read_from)
+{
+    std::stringstream ss{R"layout_file(#TOP_LEVEL_IBF fullest_technical_bin_idx:111
+#LOWER_LEVEL_IBF_0 fullest_technical_bin_idx:0
+#LOWER_LEVEL_IBF_2 fullest_technical_bin_idx:2
+#LOWER_LEVEL_IBF_1;2;3;4 fullest_technical_bin_idx:22
+#USER_BIN_IDX	TECHNICAL_BIN_INDICES	NUMBER_OF_TECHNICAL_BINS
+7	0	1
+4	1;0	1;22
+5	1;2;3;4;22	1;1;1;1;21
+)layout_file"};
+
+    hibf::layout::layout layout;
+    layout.read_from(ss);
+
+    EXPECT_EQ(layout.top_level_max_bin_id, 111);
+    EXPECT_EQ(layout.max_bins[0], (hibf::layout::layout::max_bin{{0}, 0}));
+    EXPECT_EQ(layout.max_bins[1], (hibf::layout::layout::max_bin{{2}, 2}));
+    EXPECT_EQ(layout.max_bins[2], (hibf::layout::layout::max_bin{{1, 2, 3, 4}, 22}));
+    EXPECT_EQ(layout.user_bins[0], (hibf::layout::layout::user_bin{7, std::vector<size_t>{}, 1, 0}));
+    EXPECT_EQ(layout.user_bins[1], (hibf::layout::layout::user_bin{4, std::vector<size_t>{1}, 22, 0}));
+    EXPECT_EQ(layout.user_bins[2], (hibf::layout::layout::user_bin{5, std::vector<size_t>{1, 2, 3, 4}, 21, 22}));
+}