Skip to content

Commit

Permalink
Merge pull request #237 from eseiler/infra/sketchy
Browse files Browse the repository at this point in the history
[MISC] Extend insert_iterator
  • Loading branch information
smehringer authored Oct 24, 2024
2 parents 6e1fd5a + 76202c1 commit 9e631fd
Show file tree
Hide file tree
Showing 11 changed files with 350 additions and 82 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci_coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ jobs:
--exclude-unreachable-branches \
--exclude-throw-branches \
--exclude-noncode-lines \
--merge-mode-functions separate \
-j \
--cobertura \
--output ${GITHUB_WORKSPACE}/build/coverage_report.xml
Expand Down
5 changes: 4 additions & 1 deletion include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include <cereal/types/base_class.hpp> // for base_class

#include <hibf/cereal/concepts.hpp> // for cereal_archive
#include <hibf/config.hpp> // for config
#include <hibf/contrib/aligned_allocator.hpp> // for aligned_allocator
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/counting_vector.hpp> // for counting_vector
Expand All @@ -33,6 +32,10 @@

namespace seqan::hibf
{

// Forward declaration to avoid an include cycle: config.hpp includes misc/insert_iterator.hpp,
// which in turn needs interleaved_bloom_filter to be a complete class.
struct config;

/*!\brief A strong type that represents the number of bins for the seqan::hibf::interleaved_bloom_filter.
* \ingroup ibf
* \qualifier strong
Expand Down
74 changes: 52 additions & 22 deletions include/hibf/misc/insert_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
#include <vector> // for vector

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set, hash
#include <hibf/interleaved_bloom_filter.hpp>
#include <hibf/platform.hpp>
#include <hibf/sketch/hyperloglog.hpp>

// IWYU pragma: private, include <hibf/config.hpp>

Expand All @@ -29,32 +31,51 @@ class insert_iterator
using pointer = void;
using reference = void;

insert_iterator() = delete;
insert_iterator(insert_iterator const &) = default;
insert_iterator(insert_iterator &&) = default;
insert_iterator & operator=(insert_iterator const &) = default;
insert_iterator & operator=(insert_iterator &&) = default;
~insert_iterator() = default;
constexpr insert_iterator() = default;
constexpr insert_iterator(insert_iterator const &) = default;
constexpr insert_iterator(insert_iterator &&) = default;
constexpr insert_iterator & operator=(insert_iterator const &) = default;
constexpr insert_iterator & operator=(insert_iterator &&) = default;
constexpr ~insert_iterator() = default;

explicit constexpr insert_iterator(robin_hood::unordered_flat_set<uint64_t> & set) :
set{std::addressof(set)},
is_set{true}
using set_t = robin_hood::unordered_flat_set<uint64_t>;
using sketch_t = sketch::hyperloglog;
using ibf_t = interleaved_bloom_filter;
using function_t = std::function<void(uint64_t const)>;

/*!\brief Constructs an iterator that inserts into an unordered set.
 * \param[in] set The target set; every value assigned to the iterator is emplaced into it.
 * \details Only the address of `set` is stored, together with the tag data_type::unordered_set.
 */
explicit constexpr insert_iterator(set_t & set) : ptr{std::addressof(set)}, type{data_type::unordered_set}
{}

/*!\brief Constructs an iterator that feeds values into a HyperLogLog sketch.
 * \param[in] sketch The target sketch; every value assigned to the iterator is passed to its `add()` member.
 * \details Only the address of `sketch` is stored, together with the tag data_type::sketch.
 */
explicit constexpr insert_iterator(sketch_t & sketch) : ptr{std::addressof(sketch)}, type{data_type::sketch}
{}

explicit constexpr insert_iterator(std::vector<uint64_t> & vec) : vec{std::addressof(vec)}, is_set{false}
/*!\brief Constructs an iterator that inserts into one bin of an interleaved Bloom filter.
 * \param[in] ibf The target filter; every value assigned to the iterator is emplaced into it.
 * \param[in] ibf_bin_index The bin that receives the values; converted to seqan::hibf::bin_index on insertion.
 * \details Only the address of `ibf` is stored, together with the bin index and the tag data_type::ibf.
 */
explicit constexpr insert_iterator(ibf_t & ibf, size_t ibf_bin_index) :
ptr{std::addressof(ibf)},
ibf_bin_index{ibf_bin_index},
type{data_type::ibf}
{}

insert_iterator & operator=(uint64_t const value) noexcept
/*!\brief Constructs an iterator that forwards values to a user-provided callable.
 * \param[in] fun The callable; it is invoked once with every value assigned to the iterator.
 * \details Only the address of `fun` is stored, together with the tag data_type::function.
 */
explicit constexpr insert_iterator(function_t & fun) : ptr{std::addressof(fun)}, type{data_type::function}
{}

[[gnu::always_inline, gnu::flatten]] inline insert_iterator & operator=(uint64_t const value) noexcept
{
if (is_set)
{
assert(set != nullptr);
set->emplace(value);
}
else
assert(ptr != nullptr);

switch (type)
{
assert(vec != nullptr);
vec->emplace_back(value);
case data_type::unordered_set:
static_cast<set_t *>(ptr)->emplace(value);
break;
case data_type::sketch:
static_cast<sketch_t *>(ptr)->add(value);
break;
case data_type::ibf:
static_cast<ibf_t *>(ptr)->emplace(value, static_cast<bin_index>(ibf_bin_index));
break;
default:
assert(type == data_type::function);
static_cast<function_t *>(ptr)->operator()(value);
}
return *this;
}
Expand All @@ -75,9 +96,18 @@ class insert_iterator
}

private:
robin_hood::unordered_flat_set<uint64_t> * set{nullptr};
std::vector<uint64_t> * vec{nullptr};
bool is_set{false};
void * ptr{nullptr};

//!\brief Tags the concrete type behind the type-erased `ptr` member; `operator=(uint64_t)` dispatches on it.
enum class data_type : uint8_t
{
unordered_set, //!< `ptr` is a `set_t *`; values are emplaced into the set. Also the value of a value-initialised tag.
sketch, //!< `ptr` is a `sketch_t *`; values are passed to `add()`.
ibf, //!< `ptr` is an `ibf_t *`; values are emplaced into the bin given by `ibf_bin_index`.
function //!< `ptr` is a `function_t *`; the callable is invoked with each value.
};

size_t ibf_bin_index{};
data_type type{};
};

} // namespace seqan::hibf
14 changes: 4 additions & 10 deletions src/build/insert_into_ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,14 @@ void insert_into_ibf(build_data const & data,
layout::layout::user_bin const & record,
seqan::hibf::interleaved_bloom_filter & ibf)
{
auto const bin_index = seqan::hibf::bin_index{static_cast<size_t>(record.storage_TB_id)};
std::vector<uint64_t> values;

serial_timer local_user_bin_io_timer{};
local_user_bin_io_timer.start();
data.config.input_fn(record.idx, insert_iterator{values});
local_user_bin_io_timer.stop();
data.user_bin_io_timer += local_user_bin_io_timer;

serial_timer local_fill_ibf_timer{};
local_user_bin_io_timer.start();
local_fill_ibf_timer.start();
for (auto && value : values)
ibf.emplace(value, bin_index);
data.config.input_fn(record.idx, insert_iterator{ibf, record.storage_TB_id});
local_user_bin_io_timer.stop();
local_fill_ibf_timer.stop();
data.user_bin_io_timer += local_user_bin_io_timer;
data.fill_ibf_timer += local_fill_ibf_timer;
}

Expand Down
82 changes: 50 additions & 32 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,22 @@

#include <hibf/build/bin_size_in_bits.hpp> // for bin_size_in_bits
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_...
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/platform.hpp> // for HIBF_COMPILER_IS_GCC
#include <hibf/misc/insert_iterator.hpp>
#include <hibf/platform.hpp> // for HIBF_COMPILER_IS_GCC
#include <hibf/sketch/compute_sketches.hpp> // for compute_sketches
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf
{

#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC

interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
seqan::hibf::bin_size size,
seqan::hibf::hash_function_count funs)
Expand All @@ -43,29 +50,48 @@ interleaved_bloom_filter::interleaved_bloom_filter(seqan::hibf::bin_count bins_,
resize(technical_bins * bin_size_);
}

size_t max_bin_size(config & configuration, size_t const max_bin_elements)
size_t find_biggest_bin(config const & configuration)
{
configuration.validate_and_set_defaults();

size_t bin_id{};
size_t max_size{};
seqan::hibf::sketch::hyperloglog sketch{configuration.sketch_bits};

if (max_bin_elements == 0u)
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) firstprivate(sketch)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});
sketch.reset();
configuration.input_fn(i, insert_iterator{sketch});

size_t const estimate = sketch.estimate();
#pragma omp critical
max_size = std::max(max_size, kmers.size());
{
if (estimate > max_size)
{
max_size = estimate;
bin_id = i;
}
}
}
else

return bin_id;
}

size_t max_bin_size(config & configuration, size_t const max_bin_elements)
{
configuration.validate_and_set_defaults();

size_t const max_size = [&]()
{
max_size = max_bin_elements;
}
if (max_bin_elements != 0u)
return max_bin_elements;

// Use sketches to determine biggest bin.
size_t const max_bin_id = find_biggest_bin(configuration);
// Get exact count for biggest bin. Sketch estimate's accuracy depends on configuration.sketch_bits
robin_hood::unordered_flat_set<uint64_t> kmers{};
configuration.input_fn(max_bin_id, insert_iterator{kmers});
return kmers.size();
}();

return build::bin_size_in_bits({.fpr = configuration.maximum_fpr, //
.hash_count = configuration.number_of_hash_functions,
Expand All @@ -80,16 +106,11 @@ interleaved_bloom_filter::interleaved_bloom_filter(config & configuration, size_
{
// NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
size_t const chunk_size = std::clamp<size_t>(std::bit_ceil(bin_count() / configuration.threads), 8u, 64u);
robin_hood::unordered_flat_set<uint64_t> kmers;

#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads) private(kmers)
#pragma omp parallel for schedule(dynamic, chunk_size) num_threads(configuration.threads)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});

for (uint64_t const hash : kmers)
emplace(hash, seqan::hibf::bin_index{i});
configuration.input_fn(i, insert_iterator{*this, i});
}
}

Expand Down Expand Up @@ -118,12 +139,12 @@ inline auto interleaved_bloom_filter::emplace_impl(size_t const value, bin_index
return exists;
};

void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
[[gnu::always_inline]] void interleaved_bloom_filter::emplace(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<false>(value, bin);
}

bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
[[gnu::always_inline]] bool interleaved_bloom_filter::emplace_exists(size_t const value, bin_index const bin) noexcept
{
return emplace_impl<true>(value, bin);
}
Expand Down Expand Up @@ -178,16 +199,9 @@ void interleaved_bloom_filter::increase_bin_number_to(seqan::hibf::bin_count con
technical_bins = new_technical_bins;
}

#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wattributes"
#endif // HIBF_COMPILER_IS_GCC
[[gnu::always_inline]] bit_vector const &
interleaved_bloom_filter::membership_agent_type::bulk_contains(size_t const value) & noexcept
{
#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic pop
#endif // HIBF_COMPILER_IS_GCC
assert(ibf_ptr != nullptr);
assert(result_buffer.size() == ibf_ptr->bin_count());

Expand Down Expand Up @@ -276,4 +290,8 @@ interleaved_bloom_filter::membership_agent_type::bulk_contains(size_t const valu
return result_buffer;
}

#if HIBF_COMPILER_IS_GCC
# pragma GCC diagnostic pop
#endif // HIBF_COMPILER_IS_GCC

} // namespace seqan::hibf
21 changes: 9 additions & 12 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,18 @@ namespace seqan::hibf::sketch
void compute_sketches(config const & config, std::vector<sketch::hyperloglog> & hll_sketches)
{
// compute hll_sketches
hll_sketches.resize(config.number_of_user_bins);
hll_sketches.resize(config.number_of_user_bins, config.sketch_bits);

assert(std::ranges::all_of(hll_sketches,
[bits = config.sketch_bits](hyperloglog const & sketch)
{
return sketch.data_size() == (1ULL << bits);
}));

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
#pragma omp parallel for schedule(dynamic) num_threads(config.threads)
for (size_t i = 0; i < config.number_of_user_bins; ++i)
{
seqan::hibf::sketch::hyperloglog hll_sketch(config.sketch_bits);

kmers.clear();
config.input_fn(i, insert_iterator{kmers});

for (auto k_hash : kmers)
hll_sketch.add(k_hash);

hll_sketches[i] = std::move(hll_sketch);
config.input_fn(i, insert_iterator{hll_sketches[i]});
}
}

Expand Down
1 change: 1 addition & 0 deletions test/performance/ibf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
hibf_benchmark (bit_vector_benchmark.cpp)
hibf_benchmark (bit_vector_serialisation_benchmark.cpp)
hibf_benchmark (interleaved_bloom_filter_benchmark.cpp)
hibf_benchmark (interleaved_bloom_filter_construction_benchmark.cpp)
Loading

0 comments on commit 9e631fd

Please sign in to comment.