Skip to content

Commit

Permalink
[WIP] Extend insert_iterator
Browse files Browse the repository at this point in the history
  • Loading branch information
eseiler committed Oct 18, 2024
1 parent 28e15a2 commit 68fa7a2
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 26 deletions.
40 changes: 33 additions & 7 deletions include/hibf/misc/insert_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set, hash
#include <hibf/platform.hpp>
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

// IWYU pragma: private, include <hibf/config.hpp>

Expand All @@ -38,23 +39,39 @@ class insert_iterator

// Constructs an insert_iterator that forwards assigned values into a
// robin_hood::unordered_flat_set<uint64_t>. Only the address of `set` is stored.
// NOTE(review): the referenced set presumably has to outlive this iterator — confirm with callers.
// NOTE(review): this is a rendered diff hunk; `is_set{true}` appears to be the pre-change
// initializer and `type{data_type::unordered_set}` its replacement, so only one of the two
// exists in any single revision.
explicit constexpr insert_iterator(robin_hood::unordered_flat_set<uint64_t> & set) :
set{std::addressof(set)},
is_set{true}
type{data_type::unordered_set}
{}

// Constructs an insert_iterator that forwards assigned values into a std::vector<uint64_t>.
// Only the address of `vec` is stored.
// NOTE(review): the referenced vector presumably has to outlive this iterator — confirm with callers.
// NOTE(review): this is a rendered diff hunk; the first signature line (using `is_set{false}`)
// appears to be the pre-change version and the second (using `type{data_type::vector}`) its replacement.
explicit constexpr insert_iterator(std::vector<uint64_t> & vec) : vec{std::addressof(vec)}, is_set{false}
explicit constexpr insert_iterator(std::vector<uint64_t> & vec) : vec{std::addressof(vec)}, type{data_type::vector}
{}

// Constructs an insert_iterator that forwards assigned values into a sketch::hyperloglog.
// Only the address of `sketch` is stored, and the tag is set to data_type::sketch so that
// operator= dispatches to hyperloglog::add.
// NOTE(review): the referenced sketch presumably has to outlive this iterator — confirm with callers.
// NOTE(review): the parameter `sketch` has the same spelling as the `sketch` namespace and the
// `sketch` data member; worth checking that all supported compilers accept this.
explicit constexpr insert_iterator(sketch::hyperloglog & sketch) :
sketch{std::addressof(sketch)},
type{data_type::sketch}
{}

// Inserts `value` into whichever container this iterator was constructed with and returns *this.
// Dispatch is by the stored `type` tag:
//   - data_type::unordered_set -> set->emplace(value)
//   - data_type::vector        -> vec->emplace_back(value)
//   - data_type::sketch        -> sketch->add(value)
// Each branch asserts that the matching pointer is non-null before using it.
// NOTE(review): the function is declared noexcept; if an insertion throws (e.g. on allocation
// failure), the exception cannot propagate and std::terminate is called.
// NOTE(review): this is a rendered diff hunk; the `if (is_set)` / `else` lines appear to be the
// pre-change dispatch, and the "Check warning ..." lines are Codecov annotations from the web
// page, not part of the source file.
insert_iterator & operator=(uint64_t const value) noexcept
{
if (is_set)
switch (type)
{
case data_type::unordered_set:
assert(set != nullptr);
set->emplace(value);
}
else
{
break;
case data_type::vector:
assert(vec != nullptr);
vec->emplace_back(value);
break;
case data_type::sketch:
assert(sketch != nullptr);
sketch->add(value);
break;
// Unknown tag: fail the assertion in debug builds; in NDEBUG builds mark the path as unreachable.
default:

Check warning on line 69 in include/hibf/misc/insert_iterator.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/insert_iterator.hpp#L69

Added line #L69 was not covered by tests
#ifndef NDEBUG
assert(false);

Check warning on line 71 in include/hibf/misc/insert_iterator.hpp

View check run for this annotation

Codecov / codecov/patch

include/hibf/misc/insert_iterator.hpp#L71

Added line #L71 was not covered by tests
#else
__builtin_unreachable();
#endif
}
return *this;
}
Expand All @@ -77,7 +94,16 @@ class insert_iterator
private:
robin_hood::unordered_flat_set<uint64_t> * set{nullptr};
std::vector<uint64_t> * vec{nullptr};
bool is_set{false};
sketch::hyperloglog * sketch{nullptr};

// Tag that records which of the target pointers (set / vec / sketch) an insert_iterator uses;
// operator= switches on it to pick the insertion call. Stored as a uint8_t.
enum class data_type : uint8_t
{
unordered_set, // values go to the robin_hood::unordered_flat_set via emplace
vector, // values go to the std::vector via emplace_back
sketch // values go to the sketch::hyperloglog via add
};

data_type type{};
};

} // namespace seqan::hibf
13 changes: 7 additions & 6 deletions src/interleaved_bloom_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@

#include <hibf/build/bin_size_in_bits.hpp> // for bin_size_in_bits
#include <hibf/config.hpp> // for config, insert_iterator
#include <hibf/contrib/robin_hood.hpp> // for unordered_flat_set
#include <hibf/interleaved_bloom_filter.hpp> // for interleaved_bloom_filter, bin_count, bin_index, bin_size, hash_...
#include <hibf/misc/bit_vector.hpp> // for bit_vector
#include <hibf/misc/divide_and_ceil.hpp> // for divide_and_ceil
#include <hibf/platform.hpp> // for HIBF_COMPILER_IS_GCC
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf
{
Expand Down Expand Up @@ -51,15 +51,16 @@ size_t max_bin_size(config & configuration, size_t const max_bin_elements)

if (max_bin_elements == 0u)
{
robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(kmers)
seqan::hibf::sketch::hyperloglog sketch{15u};
#pragma omp parallel for schedule(dynamic) num_threads(configuration.threads) private(sketch)
for (size_t i = 0u; i < configuration.number_of_user_bins; ++i)
{
kmers.clear();
configuration.input_fn(i, insert_iterator{kmers});
sketch.reset();
configuration.input_fn(i, insert_iterator{sketch});

size_t const estimate = sketch.estimate();
#pragma omp critical
max_size = std::max(max_size, kmers.size());
max_size = std::max(max_size, estimate);
}
}
else
Expand Down
21 changes: 9 additions & 12 deletions src/sketch/compute_sketches.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,18 @@ namespace seqan::hibf::sketch
// Fills `hll_sketches` with one HyperLogLog sketch per user bin.
//
// config       : supplies number_of_user_bins, sketch_bits, threads and the input_fn callback.
// hll_sketches : output; resized to config.number_of_user_bins, element i receives the values
//                that config.input_fn produces for user bin i.
//
// The loop over user bins is an OpenMP parallel for with dynamic scheduling and
// config.threads threads.
// NOTE(review): this is a rendered diff hunk, so both sides of the change are shown. The lines
// using `kmers` and the local `hll_sketch` appear to be the pre-change version (collect into a
// set, then add each hash to a sketch); the replacement passes insert_iterator{hll_sketches[i]}
// to input_fn directly.
// NOTE(review): std::vector::resize(count, value) only uses `value` for newly appended elements;
// if hll_sketches is non-empty on entry, existing sketches keep their prior contents. The assert
// checks their data_size, not that they are empty — confirm callers always pass an empty vector.
// NOTE(review): resize(..., config.sketch_bits) relies on sketch_bits converting implicitly to a
// hyperloglog — confirm the constructor is not explicit.
void compute_sketches(config const & config, std::vector<sketch::hyperloglog> & hll_sketches)
{
// compute hll_sketches
hll_sketches.resize(config.number_of_user_bins);
hll_sketches.resize(config.number_of_user_bins, config.sketch_bits);

// Debug-only sanity check: every sketch must have 2^sketch_bits data entries.
assert(std::ranges::all_of(hll_sketches,
[bits = config.sketch_bits](hyperloglog const & sketch)
{
return sketch.data_size() == (1ULL << bits);
}));

robin_hood::unordered_flat_set<uint64_t> kmers;
#pragma omp parallel for schedule(dynamic) num_threads(config.threads) private(kmers)
#pragma omp parallel for schedule(dynamic) num_threads(config.threads)
for (size_t i = 0; i < config.number_of_user_bins; ++i)
{
seqan::hibf::sketch::hyperloglog hll_sketch(config.sketch_bits);

kmers.clear();
config.input_fn(i, insert_iterator{kmers});

for (auto k_hash : kmers)
hll_sketch.add(k_hash);

hll_sketches[i] = std::move(hll_sketch);
// Iteration i only touches hll_sketches[i].
config.input_fn(i, insert_iterator{hll_sketches[i]});
}
}

Expand Down
2 changes: 1 addition & 1 deletion test/unit/hibf/interleaved_bloom_filter_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ TEST(ibf_test, construction_from_config_with_max_bin_elements)

seqan::hibf::interleaved_bloom_filter only_config{ibf_config};
seqan::hibf::interleaved_bloom_filter default_num_elements{ibf_config, 0u};
seqan::hibf::interleaved_bloom_filter appropriate_num_elements{ibf_config, 10u};
seqan::hibf::interleaved_bloom_filter appropriate_num_elements{ibf_config, 11u}; // hll sketch estimate is 11
seqan::hibf::interleaved_bloom_filter larger_num_elements{ibf_config, 20u};

EXPECT_EQ(only_config, default_num_elements);
Expand Down

0 comments on commit 68fa7a2

Please sign in to comment.