Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MISC] Improve bulk_contains performance #45

Merged
merged 4 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/ci_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ jobs:
compiler: "intel"
build: unit
build_type: Release
cxx_flags: "-Xclang=-Wno-pass-failed"

steps:
- name: Checkout
Expand Down Expand Up @@ -83,7 +84,8 @@ jobs:
cd build
cmake ../test/${{ matrix.build }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DHIBF_NATIVE_BUILD=OFF \
-DHIBF_VERBOSE_TESTS=OFF
-DHIBF_VERBOSE_TESTS=OFF \
-DCMAKE_CXX_FLAGS="${{ matrix.cxx_flags }}"
make -j2 gtest_build

- name: Build tests
Expand Down
116 changes: 104 additions & 12 deletions include/hibf/interleaved_bloom_filter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,9 @@ class interleaved_bloom_filter::membership_agent_type
//!\brief A pointer to the augmented hibf::interleaved_bloom_filter.
ibf_t const * ibf_ptr{nullptr};

//!\brief Stores access positions of augmented hibf::interleaved_bloom_filter.
std::array<size_t, 5> bloom_filter_indices;

public:
/*!\name Constructors, destructor and assignment
* \{
Expand Down Expand Up @@ -701,24 +704,113 @@ class interleaved_bloom_filter::membership_agent_type
assert(ibf_ptr != nullptr);
assert(result_buffer.size() == ibf_ptr->bin_count());

std::array<size_t, 5> bloom_filter_indices;
std::memcpy(&bloom_filter_indices, &ibf_ptr->hash_seeds, sizeof(size_t) * ibf_ptr->hash_funs);
// Needed for auto-vectorization of loop. ibf_ptr->bin_words could change between loops.
size_t const bin_words = ibf_ptr->bin_words;
size_t const hash_funs = ibf_ptr->hash_funs;

#ifndef NDEBUG
assert(bin_words != 0u);
assert(hash_funs != 0u);
#else
Comment on lines +711 to +714
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auch wegen der Auto-vectorisierung? Denn eigentlich sollten asserts im release mode ja sowieso wegcompiliert werden oder?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to actually assert in Debug mode, and not just declare it as UB. No idea what happens (guaranteed) when I have both __builtin_unreachable() and an assert :)

// Removes case for bin_words == 0u. The same statement inside the switch-case wouldn't have that effect.
if (bin_words == 0u)
__builtin_unreachable();
if (hash_funs == 0u)
__builtin_unreachable();
#endif

for (size_t i = 0; i < hash_funs; ++i)
bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, ibf_ptr->hash_seeds[i]) >> 6;

for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
bloom_filter_indices[i] = ibf_ptr->hash_and_fit(value, bloom_filter_indices[i]);
uint64_t * const raw = result_buffer.raw_data().data(); // TODO: std::assume_aligned<64> once memory-aligned
uint64_t const * const ibf_data = ibf_ptr->data.data(); // TODO: std::assume_aligned<64> once memory-aligned
std::memcpy(raw, ibf_data + bloom_filter_indices[0], sizeof(uint64_t) * bin_words);

for (size_t batch = 0; batch < ibf_ptr->bin_words; ++batch)
// https://godbolt.org/z/1nbhvqeGj
// Having the loop inside is faster.
// GCOVR_EXCL_START
switch (bin_words)
{
size_t tmp{-1ULL};
for (size_t i = 0; i < ibf_ptr->hash_funs; ++i)
case 1u: // 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
assert(bloom_filter_indices[i] < ibf_ptr->data.size());
tmp &= ibf_ptr->data.get_int(bloom_filter_indices[i]);
bloom_filter_indices[i] += 64;
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
raw[0] &= ibf_raw[0];
}
break;
case 2u: // 1 SSE4 instruction (128 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 2u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 3u: // 1 SSE4 instruction (128 bit) + 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 3u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 4u: // 1 AVX2 instruction (256 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 4u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 5u: // 1 AVX2 instruction (256 bit) + 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 5u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 6u: // 1 AVX2 instruction (256 bit) + 1 SSE4 instruction (128 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 6u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 7u: // 1 AVX2 instruction (256 bit) + 1 SSE4 instruction (128 bit) + 1 AND (64 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 7u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
case 8u: // 1 AVX512 instruction (512 bit)
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < 8u; ++batch)
raw[batch] &= ibf_raw[batch];
}
break;
default: // Auto vectorize. Might create different versions.
for (size_t i = 1; i < hash_funs; ++i)
{
uint64_t const * const ibf_raw = ibf_data + bloom_filter_indices[i];
#pragma omp simd
for (size_t batch = 0; batch < bin_words; ++batch)
raw[batch] &= ibf_raw[batch];
}

result_buffer.data.set_int(batch << 6, tmp);
}
// GCOVR_EXCL_STOP

return result_buffer;
}
Expand Down
14 changes: 7 additions & 7 deletions test/cmake/hibf_require_benchmark.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,25 @@ cmake_minimum_required (VERSION 3.16)
macro (hibf_require_benchmark)
enable_testing ()

set (benchmark_version "1.8.0")
set (gbenchmark_git_tag "v${benchmark_version}")
set (HIBF_BENCHMARK_TAG "v1.8.2")

find_package (benchmark ${benchmark_version} EXACT QUIET)
find_package (benchmark QUIET)

if (NOT benchmark_FOUND)
message (STATUS "Fetching Google Benchmark ${benchmark_version}")
# Also ensure that Google Benchmark is fetched for the latest library cron, which sets the tag to "main".
if (NOT benchmark_FOUND OR "${HIBF_BENCHMARK_TAG}" STREQUAL "main")
message (STATUS "Fetching Google Benchmark ${HIBF_BENCHMARK_TAG}")

include (FetchContent)
FetchContent_Declare (
gbenchmark_fetch_content
GIT_REPOSITORY "https://github.com/google/benchmark.git"
GIT_TAG "${gbenchmark_git_tag}")
GIT_TAG "${HIBF_BENCHMARK_TAG}")
option (BENCHMARK_ENABLE_TESTING "" OFF)
option (BENCHMARK_ENABLE_WERROR "" OFF) # Does not apply to Debug builds.
option (BENCHMARK_ENABLE_INSTALL "" OFF)
FetchContent_MakeAvailable (gbenchmark_fetch_content)
else ()
message (STATUS "Found Google Benchmark ${benchmark_version}")
message (STATUS " Test dependency: Google Benchmark ${benchmark_VERSION} found.")
endif ()

# NOTE: google benchmark's CMakeLists.txt already defines Shlwapi
Expand Down
14 changes: 7 additions & 7 deletions test/cmake/hibf_require_test.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,24 @@ cmake_minimum_required (VERSION 3.16)
macro (hibf_require_test)
enable_testing ()

set (gtest_version "1.13.0")
set (gtest_git_tag "v${gtest_version}")
set (HIBF_GTEST_TAG "v1.13.0")

find_package (GTest ${gtest_version} EXACT QUIET)
find_package (GTest QUIET)

if (NOT GTest_FOUND)
message (STATUS "Fetching Google Test ${gtest_version}")
# Also ensure that Google Test is fetched for the latest library cron, which sets the tag to "main".
if (NOT GTest_FOUND OR "${HIBF_GTEST_TAG}" STREQUAL "main")
message (STATUS "Fetching Google Test ${HIBF_GTEST_TAG}")

include (FetchContent)
FetchContent_Declare (
gtest_fetch_content
GIT_REPOSITORY "https://github.com/google/googletest.git"
GIT_TAG "${gtest_git_tag}")
GIT_TAG "${HIBF_GTEST_TAG}")
option (BUILD_GMOCK "" OFF)
option (INSTALL_GTEST "" OFF)
FetchContent_MakeAvailable (gtest_fetch_content)
else ()
message (STATUS "Found Google Test ${gtest_version}")
message (STATUS " Test dependency: Google Test ${GTest_VERSION} found.")
endif ()

if (NOT TARGET gtest_build)
Expand Down
1 change: 1 addition & 0 deletions test/performance/ibf/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hibf_benchmark (interleaved_bloom_filter_benchmark.cpp)
175 changes: 175 additions & 0 deletions test/performance/ibf/interleaved_bloom_filter_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

#include <benchmark/benchmark.h>

#include <hibf/contrib/std/to.hpp>
#include <hibf/contrib/std/zip_view.hpp>
#include <hibf/interleaved_bloom_filter.hpp>

/*!\brief Builds the throughput counter that the benchmarks in this file report.
 * \param count Number of items (hashes or bins) processed per benchmark iteration.
 * \returns A benchmark::Counter holding `count`, flagged as kIsIterationInvariantRate and using 1000 as the base for
 *          unit prefixes.
 */
inline benchmark::Counter hashes_per_second(size_t const count)
{
    constexpr auto rate_flags = benchmark::Counter::kIsIterationInvariantRate;
    constexpr auto unit_base = benchmark::Counter::OneK::kIs1000;

    return benchmark::Counter(static_cast<double>(count), rate_flags, unit_base);
}

/*!\brief Registers the parameter sets every benchmark in this file is run with.
 * \param b The benchmark to which the argument sets are added.
 *
 * Each argument set is {bins, bin_size, hash_num, sequence_length}.
 * Two presets exist; they use the same bin counts and differ only in that every bin_size and the sequence_length
 * are 2^10 times larger in the big preset. The preset is selected at compile time via the `#if` below.
 */
static void arguments(benchmark::internal::Benchmark * b)
{
#if 1
    // Small preset, labelled "Total size: 1MiB" by the original author.
    constexpr int preset_shift = 0;
#else
    // Large preset, labelled "Total size: 1GiB" by the original author.
    constexpr int preset_shift = 10;
#endif

    constexpr long long sequence_length = 1LL << (17 + preset_shift);

    // bins, bin_size, hash_num, sequence_length
    b->Args({64, 1LL << (17 + preset_shift), 2, sequence_length});
    b->Args({128, 1LL << (16 + preset_shift), 2, sequence_length});
    b->Args({192, 1LL << (16 + preset_shift), 2, sequence_length});
    b->Args({256, 1LL << (15 + preset_shift), 2, sequence_length});
    b->Args({1024, 1LL << (10 + preset_shift), 2, sequence_length});
}

/*!\brief Creates the input data and an empty interleaved Bloom filter for one benchmark configuration.
 * \param state Benchmark state; range(0) = bins, range(1) = bin size in bits, range(2) = number of hash functions,
 *              range(3) = number of values to generate.
 * \returns A tuple of (bin_indices, hash_values, ibf). `bin_indices` holds range(3) values in [0, bins - 1],
 *          `hash_values` holds range(3) values in [0, max(size_t)], and `ibf` is constructed from the first three
 *          parameters.
 *
 * The data is deterministic: a single std::mt19937_64 seeded with 0 is used for all draws.
 */
auto set_up(::benchmark::State const & state)
{
    size_t const bins = state.range(0);
    size_t const bits = state.range(1);
    size_t const hash_num = state.range(2);
    size_t const sequence_length = state.range(3);

    // The engine must outlive the individual draws. Re-creating it (with the same seed) for every element, as was
    // done before, makes each draw return the very same number, i.e. all benchmarks would operate on one single
    // hash value and one single bin.
    std::mt19937_64 engine{0ULL};

    auto generate = [sequence_length, &engine](size_t const max_value = std::numeric_limits<size_t>::max())
    {
        std::uniform_int_distribution<size_t> distr{0u, max_value};
        std::vector<size_t> result(sequence_length);

        std::ranges::generate(result,
                              [&distr, &engine]()
                              {
                                  return distr(engine);
                              });
        return result;
    };

    // Separate statements: the order of the two draws from the shared engine is fixed and hence reproducible.
    std::vector<size_t> const bin_indices{generate(bins - 1)};
    std::vector<size_t> const hash_values{generate()};

    hibf::interleaved_bloom_filter ibf{hibf::bin_count{bins},
                                       hibf::bin_size{bits},
                                       hibf::hash_function_count{hash_num}};

    return std::make_tuple(bin_indices, hash_values, ibf);
}

/*!\brief Measures inserting all generated hash values into their assigned bins via `emplace`.
 * \param state Benchmark state; see `set_up` for the meaning of the ranges.
 *
 * Reports the counter "hashes/sec" based on the number of hash values handled per iteration.
 */
void emplace_benchmark(::benchmark::State & state)
{
    auto && [target_bins, values, filter] = set_up(state);
    size_t const values_per_iteration = values.size();

    for (auto _ : state)
    {
        // Pair the i-th value with the i-th bin.
        for (auto [value, target_bin] : seqan::std::views::zip(values, target_bins))
        {
            filter.emplace(value, hibf::bin_index{target_bin});
        }
    }

    state.counters["hashes/sec"] = hashes_per_second(values_per_iteration);
}

/*!\brief Measures clearing every bin of the filter one at a time via `clear(bin)`.
 * \param state Benchmark state; range(0) is the number of bins.
 *
 * Reports the counter "bins/sec" based on the number of bins cleared per iteration.
 */
void clear_benchmark(::benchmark::State & state)
{
    auto && [bin_indices, hash_values, ibf] = set_up(state);
    // The generated input data is not needed here, only the filter.
    static_cast<void>(bin_indices);
    static_cast<void>(hash_values);

    // All bin indices 0, 1, ..., bins - 1.
    size_t const number_of_bins = static_cast<size_t>(state.range(0));
    std::vector<hibf::bin_index> all_bins;
    all_bins.reserve(number_of_bins);
    for (size_t i = 0; i < number_of_bins; ++i)
        all_bins.push_back(hibf::bin_index{i});

    for (auto _ : state)
    {
        for (auto bin : all_bins)
            ibf.clear(bin);
    }

    state.counters["bins/sec"] = hashes_per_second(all_bins.size());
}

/*!\brief Measures clearing all bins of the filter with one call to the range overload of `clear`.
 * \param state Benchmark state; range(0) is the number of bins.
 *
 * Reports the counter "bins/sec" based on the number of bins cleared per iteration.
 */
void clear_range_benchmark(::benchmark::State & state)
{
    auto && [bin_indices, hash_values, ibf] = set_up(state);
    // The generated input data is not needed here, only the filter.
    static_cast<void>(bin_indices);
    static_cast<void>(hash_values);

    // All bin indices 0, 1, ..., bins - 1.
    size_t const number_of_bins = static_cast<size_t>(state.range(0));
    std::vector<hibf::bin_index> all_bins;
    all_bins.reserve(number_of_bins);
    for (size_t i = 0; i < number_of_bins; ++i)
        all_bins.push_back(hibf::bin_index{i});

    for (auto _ : state)
    {
        ibf.clear(all_bins);
    }

    state.counters["bins/sec"] = hashes_per_second(all_bins.size());
}

/*!\brief Measures `bulk_contains` of the membership agent, called once per generated hash value.
 * \param state Benchmark state; see `set_up` for the meaning of the ranges.
 *
 * The filter is filled with all generated values before timing starts.
 * Reports the counter "hashes/sec" based on the number of hash values queried per iteration.
 */
void bulk_contains_benchmark(::benchmark::State & state)
{
    auto && [target_bins, values, filter] = set_up(state);

    // Untimed: insert the i-th value into the i-th bin, stopping at the end of the shorter list.
    for (size_t i = 0; i < values.size() && i < target_bins.size(); ++i)
        filter.emplace(values[i], hibf::bin_index{target_bins[i]});

    auto agent = filter.membership_agent();
    size_t const values_per_iteration = values.size();

    for (auto _ : state)
    {
        for (auto value : values)
        {
            [[maybe_unused]] auto & res = agent.bulk_contains(value);
            benchmark::ClobberMemory();
        }
    }

    state.counters["hashes/sec"] = hashes_per_second(values_per_iteration);
}

/*!\brief Measures `bulk_count` of the counting agent, called once per iteration with all generated hash values.
 * \param state Benchmark state; see `set_up` for the meaning of the ranges.
 *
 * The filter is filled with all generated values before timing starts.
 * Reports the counter "hashes/sec" based on the number of hash values counted per iteration.
 */
void bulk_count_benchmark(::benchmark::State & state)
{
    auto && [target_bins, values, filter] = set_up(state);

    // Untimed: insert the i-th value into the i-th bin, stopping at the end of the shorter list.
    for (size_t i = 0; i < values.size() && i < target_bins.size(); ++i)
        filter.emplace(values[i], hibf::bin_index{target_bins[i]});

    auto agent = filter.counting_agent();
    size_t const values_per_iteration = values.size();

    for (auto _ : state)
    {
        [[maybe_unused]] auto & res = agent.bulk_count(values);
        benchmark::ClobberMemory();
    }

    state.counters["hashes/sec"] = hashes_per_second(values_per_iteration);
}

// Register every benchmark; each one is run once per argument set added by `arguments`.
BENCHMARK(emplace_benchmark)->Apply(arguments);
BENCHMARK(clear_benchmark)->Apply(arguments);
BENCHMARK(clear_range_benchmark)->Apply(arguments);
BENCHMARK(bulk_contains_benchmark)->Apply(arguments);
BENCHMARK(bulk_count_benchmark)->Apply(arguments);

// Expands to the program entry point provided by Google Benchmark.
BENCHMARK_MAIN();