Skip to content

Commit

Permalink
[FIX] Include relaxed FPR into the DP algorithm. (#224)
Browse files Browse the repository at this point in the history
* [FIX] Include relaxed FPR into the DP algorithm.

* [REVIEW] Put relaxed_fpr_correction into data_store

* [REVIEW] Remove maximum_bin_tracker::choose_max_bin

* [REVIEW] Fix return value of get_weight

Otherwise, returns double, and may cause integer-to-double promotion

---------

Co-authored-by: Enrico Seiler <[email protected]>
  • Loading branch information
smehringer and eseiler authored Sep 10, 2024
1 parent bdeadea commit 14026d0
Show file tree
Hide file tree
Showing 9 changed files with 125 additions and 92 deletions.
2 changes: 1 addition & 1 deletion include/hibf/layout/compute_fpr_correction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ struct fpr_correction_parameters
* \ingroup hibf_layout
* \sa https://godbolt.org/z/zTj1v9W94
*/
std::vector<double> compute_fpr_correction(fpr_correction_parameters const & params);
[[nodiscard]] std::vector<double> compute_fpr_correction(fpr_correction_parameters const & params);

} // namespace seqan::hibf::layout
30 changes: 30 additions & 0 deletions include/hibf/layout/compute_relaxed_fpr_correction.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <cstddef> // for size_t

#include <hibf/platform.hpp>

namespace seqan::hibf::layout
{

/*!\brief Contains parameters for compute_relaxed_fpr_correction.
* \ingroup hibf_layout
* \qualifier strong
*/
struct relaxed_fpr_correction_parameters
{
double fpr{};
double relaxed_fpr{};
size_t hash_count{};
};

/*!\brief Precompute size correction factor for merged bins which are allowed to have a relaxed FPR.
* \ingroup hibf_layout
*/
[[nodiscard]] double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params);

} // namespace seqan::hibf::layout
3 changes: 3 additions & 0 deletions include/hibf/layout/data_store.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ struct data_store
//!\brief The false positive correction based on fp_rate, num_hash_functions and requested_max_tb.
std::vector<double> fpr_correction{};

//!\brief The correction factor for merged bins which are allowed to have a relaxed FPR.
double relaxed_fpr_correction{};

//!\brief Information about previous levels of the IBF if the algorithm is called recursively.
previous_level previous{};

Expand Down
52 changes: 2 additions & 50 deletions include/hibf/layout/hierarchical_binning.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,8 @@ class hierarchical_binning
//!\brief Simplifies passing the parameters needed for tracking the maximum technical bin.
struct maximum_bin_tracker
{
size_t max_id{}; //!< The ID of the technical bin with maximal size.
size_t max_size{}; //!< The maximum technical bin size seen so far.
size_t max_split_id{}; //!< The ID of the split bin with maximal size (if any).
size_t max_split_size{}; //!< The maximum split bin size seen so far.
size_t max_id{}; //!< The ID of the technical bin with maximal size.
size_t max_size{}; //!< The maximum technical bin size seen so far.

void update_max(size_t const new_id, size_t const new_size)
{
Expand All @@ -50,52 +48,6 @@ class hierarchical_binning
max_size = new_size;
}
}

//!\brief Split cardinality `new_size` must already account for fpr-correction.
void update_split_max(size_t const new_id, size_t const new_size)
{
if (new_size > max_split_size)
{
max_split_id = new_id;
max_split_size = new_size;
}
}

/*!\brief Decides which bin is reported as the maximum bin.
*\param config The HIBF configuration.
*\return The chosen max bin id.
*
* As a HIBF feature, the merged bin FPR can differ from the overall maximum FPR. Merged bins in an HIBF layout
* will always be followed by one or more lower-level IBFs that will have split bins or single bins (split = 1)
* to recover the original user bins.
*
* We need to make sure, though, that downsizing merged bins does not affect split bins.
* Therefore, we check if choosing a merged bin as the max bin violates the minimum_bits needed for split bins.
* If so, we can report the largest split bin as the max bin as it will choose the correct size and downsize
* larger merged bins only a little.
*/
size_t choose_max_bin(seqan::hibf::config const & config)
{
if (max_id == max_split_id) // Overall max bin is a split bin.
return max_id;

// Split cardinality `max_split_size` already accounts for fpr correction.
// The minimum size of the TBs of this IBF to ensure the maximum_false_positive_rate for split bins.
size_t const minimum_bits{build::bin_size_in_bits({.fpr = config.maximum_fpr,
.hash_count = config.number_of_hash_functions,
.elements = max_split_size})};

// The potential size of the TBs of this IBF given the allowed merged bin FPR.
size_t const merged_bits{build::bin_size_in_bits({.fpr = config.relaxed_fpr, //
.hash_count = config.number_of_hash_functions,
.elements = max_size})};

// If split and merged bits are the same, we prefer merged bins. Better for build parallelisation.
if ((minimum_bits > merged_bits))
return max_split_id;

return max_id;
}
};

public:
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set (HIBF_SOURCE_FILES
layout/layout.cpp
layout/compute_fpr_correction.cpp
layout/compute_layout.cpp
layout/compute_relaxed_fpr_correction.cpp
sketch/compute_sketches.cpp
layout/graph.cpp
layout/hierarchical_binning.cpp
Expand Down
23 changes: 14 additions & 9 deletions src/layout/compute_layout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#include <utility> // for move
#include <vector> // for vector

#include <hibf/config.hpp> // for config
#include <hibf/layout/compute_fpr_correction.hpp> // for compute_fpr_correction
#include <hibf/layout/compute_layout.hpp> // for compute_layout
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/layout/hierarchical_binning.hpp> // for hierarchical_binning
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/misc/iota_vector.hpp> // for iota_vector
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog
#include <hibf/config.hpp> // for config
#include <hibf/layout/compute_fpr_correction.hpp> // for compute_fpr_correction
#include <hibf/layout/compute_layout.hpp> // for compute_layout
#include <hibf/layout/compute_relaxed_fpr_correction.hpp> // for compute_relaxed_fpr_correction
#include <hibf/layout/data_store.hpp> // for data_store
#include <hibf/layout/hierarchical_binning.hpp> // for hierarchical_binning
#include <hibf/layout/layout.hpp> // for layout
#include <hibf/misc/iota_vector.hpp> // for iota_vector
#include <hibf/misc/timer.hpp> // for concurrent_timer
#include <hibf/sketch/hyperloglog.hpp> // for hyperloglog

namespace seqan::hibf::layout
{
Expand Down Expand Up @@ -46,6 +47,10 @@ layout compute_layout(config const & config,
.hash_count = config.number_of_hash_functions,
.t_max = config.tmax});

store.relaxed_fpr_correction = compute_relaxed_fpr_correction({.fpr = config.maximum_fpr, //
.relaxed_fpr = config.relaxed_fpr,
.hash_count = config.number_of_hash_functions});

store.hibf_layout->top_level_max_bin_id = seqan::hibf::layout::hierarchical_binning{store, config}.execute();
union_estimation_timer = store.union_estimation_timer;
rearrangement_timer = store.rearrangement_timer;
Expand Down
28 changes: 28 additions & 0 deletions src/layout/compute_relaxed_fpr_correction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#include <cassert> // for assert
#include <cmath> // for log1p, exp, log

#include <hibf/layout/compute_relaxed_fpr_correction.hpp> // for compute_fpr_correction

namespace seqan::hibf::layout
{

double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params)
{
assert(params.fpr > 0.0 && params.fpr <= 1.0);
assert(params.relaxed_fpr > 0.0 && params.relaxed_fpr <= 1.0);
assert(params.hash_count > 0u);
assert(params.fpr <= params.relaxed_fpr);

double const numerator = std::log1p(-std::exp(std::log(params.fpr) / params.hash_count));
double const denominator = std::log1p(-std::exp(std::log(params.relaxed_fpr) / params.hash_count));
double const relaxed_fpr_correction = numerator / denominator;

assert(relaxed_fpr_correction > 0.0 && relaxed_fpr_correction <= 1.0);
return relaxed_fpr_correction;
}

} // namespace seqan::hibf::layout
17 changes: 9 additions & 8 deletions src/layout/hierarchical_binning.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ void hierarchical_binning::initialization(std::vector<std::vector<size_t>> & mat
for (size_t j = 1; j < num_user_bins; ++j)
{
sum += (*data->kmer_counts)[data->positions[j]];
matrix[0][j] = data->union_estimates[j];
matrix[0][j] = data->union_estimates[j] * data->relaxed_fpr_correction;
ll_matrix[0][j] = max_merge_levels(j + 1) * sum;
trace[0][j] = {0u, j - 1}; // unnecessary?
}
Expand All @@ -130,7 +130,7 @@ void hierarchical_binning::initialization(std::vector<std::vector<size_t>> & mat
assert(j < data->positions.size());
assert(data->positions[j] < data->kmer_counts->size());
sum += (*data->kmer_counts)[data->positions[j]];
matrix[0][j] = sum;
matrix[0][j] = sum * data->relaxed_fpr_correction;
ll_matrix[0][j] = max_merge_levels(j + 1) * sum;
trace[0][j] = {0u, j - 1}; // unnecessary?
}
Expand Down Expand Up @@ -206,12 +206,13 @@ void hierarchical_binning::recursion(std::vector<std::vector<size_t>> & matrix,
size_t j_prime{j - 1};
size_t weight{current_weight};

auto get_weight = [&]()
auto get_weight = [&]() -> size_t
{
// if we use the union estimate we plug in that value instead of the sum (weight)
// union_estimates[j_prime] is the union of {j_prime, ..., j}
// the + 1 is necessary because j_prime is decremented directly after weight is updated
return config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1];
size_t const uncorrected = config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1];
return data->relaxed_fpr_correction * uncorrected;
};

// if the user bin j-1 was not split into multiple technical bins!
Expand Down Expand Up @@ -278,7 +279,7 @@ void hierarchical_binning::backtrack_merged_bin(size_t trace_j,
if (!config.disable_estimate_union)
kmer_count = sketch.estimate(); // overwrite kmer_count high_level_max_id/size bin

max_tracker.update_max(bin_id, kmer_count);
max_tracker.update_max(bin_id, kmer_count * data->relaxed_fpr_correction);
// std::cout << "]: " << kmer_count << std::endl;
}

Expand All @@ -302,7 +303,6 @@ void hierarchical_binning::backtrack_split_bin(size_t trace_j,
size_t const cardinality_per_bin = divide_and_ceil(corrected_cardinality, number_of_bins);

max_tracker.update_max(bin_id, cardinality_per_bin);
max_tracker.update_split_max(bin_id, cardinality_per_bin);

// std::cout << "split " << trace_j << " into " << number_of_bins << ": " << cardinality_per_bin << std::endl;
}
Expand Down Expand Up @@ -359,7 +359,7 @@ size_t hierarchical_binning::backtracking(std::vector<std::vector<std::pair<size
backtrack_split_bin(trace_j, trace_i + 1, bin_id, max_tracker);
}

return max_tracker.choose_max_bin(config);
return max_tracker.max_id;
}

data_store hierarchical_binning::initialise_libf_data(size_t const trace_j) const
Expand All @@ -369,7 +369,8 @@ data_store hierarchical_binning::initialise_libf_data(size_t const trace_j) cons
.kmer_counts = data->kmer_counts,
.sketches = data->sketches,
.positions = {data->positions[trace_j]},
.fpr_correction = data->fpr_correction};
.fpr_correction = data->fpr_correction,
.relaxed_fpr_correction = data->relaxed_fpr_correction};

return libf_data;
}
Expand Down
Loading

0 comments on commit 14026d0

Please sign in to comment.