Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE] Make the HIBF constructible from a layout file. #50

Merged
merged 5 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions include/hibf/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <cstddef> // for size_t
#include <filesystem> // for path
#include <functional> // for function
#include <iosfwd> // for ostream
#include <iterator> // for insert_iterator

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
Expand Down Expand Up @@ -78,13 +79,13 @@ struct config
// Related to k-mers
bool disable_cutoffs{false};

//!\brief If given, no layout algorithm is executed; the layout from this file is used for building instead.
std::filesystem::path layout_file{};

// Related to IBF
// bool compressed{false};
//!\}

void read_from(std::istream & stream);
void write_to(std::ostream & stream) const;

private:
friend class cereal::access;

Expand All @@ -107,7 +108,6 @@ struct config
archive(CEREAL_NVP(disable_rearrangement));

archive(CEREAL_NVP(disable_cutoffs));
archive(CEREAL_NVP(layout_file));
}
};

Expand Down
7 changes: 5 additions & 2 deletions include/hibf/detail/layout/layout.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct layout
requires std::derived_from<stream_type, std::ostream>
friend stream_type & operator<<(stream_type & stream, max_bin const & object)
{
stream << prefix::header << prefix::merged_bin << '_';
stream << prefix::layout_header << prefix::layout_lower_level << '_';
auto it = object.previous_TB_indices.begin();
auto end = object.previous_TB_indices.end();
// If not empty, we join with ';'
Expand All @@ -37,7 +37,7 @@ struct layout
while (++it != end)
stream << ';' << *it;
}
stream << " max_bin_id:" << object.id;
stream << " " << prefix::layout_fullest_technical_bin_idx << object.id;

return stream;
}
Expand Down Expand Up @@ -86,6 +86,9 @@ struct layout
}
};

void read_from(std::istream & stream);
void write_to(std::ostream & stream) const;

size_t top_level_max_bin_id{};
std::vector<max_bin> max_bins{};
std::vector<user_bin> user_bins{};
Expand Down
66 changes: 50 additions & 16 deletions include/hibf/detail/prefixes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,55 @@

namespace hibf::prefix
{

constexpr std::string_view chopper{"chopper"};

constexpr std::string_view header{"#"};

constexpr std::string_view header_config{"#"};

constexpr std::string_view high_level{"HIGH_LEVEL_IBF"};

constexpr std::string_view first_header_line{"#HIGH_LEVEL_IBF"};
static_assert(first_header_line.starts_with(header));
static_assert(first_header_line.ends_with(high_level));

constexpr std::string_view merged_bin{"MERGED_BIN"};

constexpr std::string_view split_bin{"SPLIT_BIN"};
/* These prefixes are for writing the layout file

* It is structured like this:
*
* [0) Possibly metadata added by chopper/raptor-layout]
* 1) Metadata: the hibf config
* 2) Layout header: max bin ids for the merged bins
* 3) Layout content: Assignment of user bin idx to technical bin idx
*
* And marked like this:
* [0) First character is @; start and end of the metadata should be marked analogously to (1)]
* 1) First character is @; Start and End are marked by @HIBF_CONFIG and @HIBF_CONFIG_END respectively
* 2) First character is #;
* 3) No mark, plain content.
*
* Example:
*
* ```
* @CHOPPER_USER_BINS
* @0 /path/to/file1.fa
* @CHOPPER_USER_BINS_END
* @CHOPPER_CONFIG
* @0 k = 20
* @CHOPPER_CONFIG_END
*
* ```
*/

constexpr std::string_view meta_header{"@"};

constexpr std::string_view meta_hibf_config_start{"@HIBF_CONFIG"};
static_assert(meta_hibf_config_start.starts_with(meta_header));

constexpr std::string_view meta_hibf_config_end{"@HIBF_CONFIG_END"};
static_assert(meta_hibf_config_end.starts_with(meta_header));

constexpr std::string_view layout_header{"#"};

constexpr std::string_view layout_top_level{"TOP_LEVEL_IBF"};

constexpr std::string_view layout_lower_level{"LOWER_LEVEL_IBF"};

constexpr std::string_view layout_fullest_technical_bin_idx{"fullest_technical_bin_idx:"};

constexpr std::string_view layout_first_header_line{"#TOP_LEVEL_IBF"};
static_assert(layout_first_header_line.starts_with(layout_header));
static_assert(layout_first_header_line.ends_with(layout_top_level));

constexpr std::string_view layout_column_names{"#USER_BIN_IDX\tTECHNICAL_BIN_INDICES\tNUMBER_OF_TECHNICAL_BINS"};
static_assert(layout_column_names.starts_with(layout_header));

} // namespace hibf::prefix
10 changes: 10 additions & 0 deletions include/hibf/hierarchical_interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,16 @@ class hierarchical_interleaved_bloom_filter
~hierarchical_interleaved_bloom_filter() = default; //!< Defaulted.

hierarchical_interleaved_bloom_filter(config const & configuration);

/*!\brief [Advanced] Constructs the HIBF from a layout file (stream) and a given input function.
 * \param[in] input_fn The input function. It is assigned to hibf::config::input_fn after the configuration has
 *                     been read from the stream, because the function itself cannot be serialized.
 * \param[in,out] layout_stream The stream to read from. The hibf::config is read from it first, followed by the
 *                              layout; both are consumed from the same stream in that order.
 * \details
 * This constructor makes it possible to construct an HIBF from a given layout file instead of calculating the
 * layout based on the input function. A hibf::config object is not needed as it is assumed to be stored in the
 * layout file. A layout file can be constructed manually or via chopper (https://github.com/seqan/chopper)
 * or raptor-layout (https://github.com/seqan/raptor).
 */
hierarchical_interleaved_bloom_filter(std::function<void(size_t const, insert_iterator &&)> input_fn,
std::istream & layout_stream);
//!\}

//!\brief The individual interleaved Bloom filters.
Expand Down
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
set (HIBF_SOURCE_FILES
hierarchical_interleaved_bloom_filter.cpp
config.cpp
detail/layout/simple_binning.cpp
detail/layout/layout.cpp
detail/layout/execute.cpp
detail/layout/compute_fpr_correction.cpp
detail/layout/compute_layout.cpp
Expand Down
60 changes: 60 additions & 0 deletions src/config.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <cassert>
#include <charconv>
#include <iostream>

#include <hibf/config.hpp>
#include <hibf/detail/prefixes.hpp>

#include <cereal/archives/json.hpp>

namespace hibf
{

void config::read_from(std::istream & stream)
{
std::string line;
std::stringstream config_str;

while (std::getline(stream, line) && line != prefix::meta_hibf_config_start)
;

assert(line == prefix::meta_hibf_config_start);

// TODO ##CONFIG: as prefix
while (std::getline(stream, line) && line != prefix::meta_hibf_config_end)
{
assert(line.size() >= 2);
assert(std::string_view{line}.substr(0, 1) == hibf::prefix::meta_header);
config_str << line.substr(1); // remove hibf::prefix::meta_header
}

assert(line == prefix::meta_hibf_config_end);

cereal::JSONInputArchive iarchive(config_str);
iarchive(*this);
}

void config::write_to(std::ostream & stream) const
{
// write json file to temprorary string stream with cereal
std::stringstream config_stream{};
cereal::JSONOutputArchive output(config_stream); // stream to cout
output(cereal::make_nvp("hibf_config", *this));

// write config
stream << prefix::meta_hibf_config_start << '\n';
std::string line;
while (std::getline(config_stream, line, '\n'))
stream << prefix::meta_header << line << '\n';
stream << prefix::meta_header << "}\n" // last closing bracket isn't written by loop above
<< prefix::meta_hibf_config_end << '\n';
}

} // namespace hibf
144 changes: 144 additions & 0 deletions src/detail/layout/layout.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// ---------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/hibf/blob/main/LICENSE.md
// ---------------------------------------------------------------------------------------------------

#include <cassert>
#include <charconv>
#include <iostream>

#include <hibf/config.hpp>
#include <hibf/detail/layout/layout.hpp>
#include <hibf/detail/prefixes.hpp>

namespace hibf::layout
{

/*!\brief Parses a single content line of a layout file into a user bin record.
 * \param[in] current_line A line of the form
 *                         `<user_bin_idx>\t<tb_idx>[;<tb_idx>...]\t<number_of_tbs>[;<number_of_tbs>...]`.
 * \returns A user bin with `idx`, `previous_TB_indices`, `storage_TB_id` and `number_of_technical_bins` set.
 * \details
 * Of the second column, the last index becomes `storage_TB_id`, all preceding indices are stored in
 * `previous_TB_indices`. Of the third column, only the last number is kept.
 *
 * The format is only verified via assertions. If assertions are disabled and the line is truncated, parsing
 * stops at the end of the line instead of reading past it; the values parsed so far are returned.
 */
hibf::layout::layout::user_bin parse_layout_line(std::string const & current_line)
{
    hibf::layout::layout::user_bin result{};

    size_t tmp{}; // integer buffer when reading numbers

    // std::from_chars requires `char const *`. Iterators of std::string_view are not guaranteed to be raw
    // pointers, hence the line is traversed via pointers into the string's buffer.
    char const * cursor = current_line.data();
    char const * const line_end = cursor + current_line.size();
    assert(cursor != line_end);

    // Steps over one separator (tab or ';'), but never beyond the end of the line.
    auto skip_separator = [line_end](char const *& position)
    {
        if (position != line_end)
            ++position;
    };

    // read user bin index
    cursor = std::from_chars(cursor, line_end, tmp).ptr;
    result.idx = tmp;
    assert(cursor != line_end && *cursor == '\t');

    do // read bin_indices
    {
        skip_separator(cursor);
        assert(cursor != line_end && *cursor != '\t');
        cursor = std::from_chars(cursor, line_end, tmp).ptr;
        result.previous_TB_indices.push_back(tmp);
    }
    while (cursor != line_end && *cursor != '\t');

    // The last index denotes the technical bin that stores the user bin; it is not a "previous" index.
    result.storage_TB_id = result.previous_TB_indices.back();
    result.previous_TB_indices.pop_back();

    do // read number of technical bins
    {
        skip_separator(cursor);
        cursor = std::from_chars(cursor, line_end, tmp).ptr;
        result.number_of_technical_bins = tmp; // only the last number really counts
    }
    while (cursor != line_end && *cursor != '\t');

    return result;
}

void hibf::layout::layout::read_from(std::istream & stream)
{
// parse header
auto parse_bin_indices = [](std::string_view const & buffer)
{
std::vector<size_t> result;

auto buffer_start = &buffer[0];
auto const buffer_end = buffer_start + buffer.size();

size_t tmp{};

while (buffer_start < buffer_end)
{
buffer_start = std::from_chars(buffer_start, buffer_end, tmp).ptr;
++buffer_start; // skip ;
result.push_back(tmp);
}

return result;
};

auto parse_first_bin = [](std::string_view const & buffer)
{
size_t tmp{};
std::from_chars(&buffer[0], &buffer[0] + buffer.size(), tmp);
return tmp;
};

std::string line;

std::getline(stream, line); // get first line that is always the max bin index of the top level bin
assert(line.starts_with(prefix::layout_first_header_line));

// parse High Level max bin index
constexpr size_t fullest_tbx_prefix_size = prefix::layout_fullest_technical_bin_idx.size();
assert(line.substr(prefix::layout_top_level.size() + 2, fullest_tbx_prefix_size)
== prefix::layout_fullest_technical_bin_idx);
std::string_view const hibf_max_bin_str{line.begin() + prefix::layout_top_level.size() + 2
+ fullest_tbx_prefix_size,
line.end()};
top_level_max_bin_id = parse_first_bin(hibf_max_bin_str);

// read and parse header records, in order to sort them before adding them to the graph
while (std::getline(stream, line) && line != prefix::layout_column_names)
{
assert(line.substr(1, prefix::layout_lower_level.size()) == prefix::layout_lower_level);

// parse header line
std::string_view const indices_str{
line.begin() + 1 /*#*/ + prefix::layout_lower_level.size() + 1 /*_*/,
std::find(line.begin() + prefix::layout_lower_level.size() + 2, line.end(), ' ')};

assert(line.substr(prefix::layout_lower_level.size() + indices_str.size() + 3, fullest_tbx_prefix_size)
== prefix::layout_fullest_technical_bin_idx);
std::string_view const max_id_str{line.begin() + prefix::layout_lower_level.size() + indices_str.size()
+ fullest_tbx_prefix_size + 3,
line.end()};

max_bins.emplace_back(parse_bin_indices(indices_str), parse_first_bin(max_id_str));
}

assert(line == prefix::layout_column_names);

// parse the rest of the file
while (std::getline(stream, line))
user_bins.emplace_back(parse_layout_line(line));
}

void hibf::layout::layout::write_to(std::ostream & stream) const
{
// write layout header with max bin ids
stream << prefix::layout_first_header_line << " " << prefix::layout_fullest_technical_bin_idx
<< top_level_max_bin_id << '\n';
for (auto const & max_bin : max_bins)
stream << max_bin << '\n';

// write header line
stream << prefix::layout_column_names << '\n';

// write layout entries
for (auto const & user_bin : user_bins)
stream << user_bin << '\n';
}

} // namespace hibf::layout
15 changes: 15 additions & 0 deletions src/hierarchical_interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,4 +199,19 @@ hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(con
build_index(*this, configuration, layout);
}

/* Constructs the HIBF from a stream that holds a config followed by a layout.
 * The config is read first, then the layout; both are consumed from the same stream in this order.
 * Afterwards, the given input function is assigned to the config and the index is built.
 */
hierarchical_interleaved_bloom_filter::hierarchical_interleaved_bloom_filter(
    std::function<void(size_t const, insert_iterator &&)> input_fn,
    std::istream & layout_stream)
{
    // First section of the stream: the config.
    config stored_config;
    stored_config.read_from(layout_stream);

    // Second section of the stream: the layout.
    layout::layout stored_layout;
    stored_layout.read_from(layout_stream);

    // The input function cannot be serialized and hence must be set after reading the config.
    stored_config.input_fn = input_fn;

    build_index(*this, stored_config, stored_layout);
}

} // namespace hibf
1 change: 1 addition & 0 deletions test/unit/hibf/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
add_subdirectories ()

hibf_test (config_test.cpp)
hibf_test (hierarchical_interleaved_bloom_filter_test.cpp)
hibf_test (interleaved_bloom_filter_test.cpp)
Loading