[FIX] Include relaxed FPR into the DP algorithm. (#224)

* [FIX] Include relaxed FPR into the DP algorithm. * [REVIEW] Put relaxed_fpr_correction into data_store * [REVIEW] Remove maximum_bin_tracker::choose_max_bin * [REVIEW] Fix return value of get_weight Otherwise, returns double, and may cause integer-to-double promotion --------- Co-authored-by: Enrico Seiler <[email protected]>
seqan · Sep 10, 2024 · 14026d0 · 14026d0
1 parent bdeadea
commit 14026d0
Show file tree

Hide file tree

Showing 9 changed files with 125 additions and 92 deletions.
diff --git a/include/hibf/layout/compute_fpr_correction.hpp b/include/hibf/layout/compute_fpr_correction.hpp
@@ -27,6 +27,6 @@ struct fpr_correction_parameters
  * \ingroup hibf_layout
  * \sa https://godbolt.org/z/zTj1v9W94
  */
-std::vector<double> compute_fpr_correction(fpr_correction_parameters const & params);
+[[nodiscard]] std::vector<double> compute_fpr_correction(fpr_correction_parameters const & params);
 
 } // namespace seqan::hibf::layout
diff --git a/include/hibf/layout/compute_relaxed_fpr_correction.hpp b/include/hibf/layout/compute_relaxed_fpr_correction.hpp
@@ -0,0 +1,30 @@
+// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: BSD-3-Clause
+
+#pragma once
+
+#include <cstddef> // for size_t
+
+#include <hibf/platform.hpp>
+
+namespace seqan::hibf::layout
+{
+
+/*!\brief Contains parameters for compute_relaxed_fpr_correction.
+ * \ingroup hibf_layout
+ * \qualifier strong
+ */
+struct relaxed_fpr_correction_parameters
+{
+    double fpr{};
+    double relaxed_fpr{};
+    size_t hash_count{};
+};
+
+/*!\brief Precompute size correction factor for merged bins which are allowed to have a relaxed FPR.
+ * \ingroup hibf_layout
+ */
+[[nodiscard]] double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params);
+
+} // namespace seqan::hibf::layout
diff --git a/include/hibf/layout/data_store.hpp b/include/hibf/layout/data_store.hpp
@@ -76,6 +76,9 @@ struct data_store
     //!\brief The false positive correction based on fp_rate, num_hash_functions and requested_max_tb.
     std::vector<double> fpr_correction{};
 
+    //!\brief The correction factor for merged bins which are allowed to have a relaxed FPR.
+    double relaxed_fpr_correction{};
+
     //!\brief Information about previous levels of the IBF if the algorithm is called recursively.
     previous_level previous{};
 

diff --git a/include/hibf/layout/hierarchical_binning.hpp b/include/hibf/layout/hierarchical_binning.hpp
@@ -37,10 +37,8 @@ class hierarchical_binning
     //!\brief Simplifies passing the parameters needed for tracking the maximum technical bin.
     struct maximum_bin_tracker
     {
-        size_t max_id{};         //!< The ID of the technical bin with maximal size.
-        size_t max_size{};       //!< The maximum technical bin size seen so far.
-        size_t max_split_id{};   //!< The ID of the split bin with maximal size (if any).
-        size_t max_split_size{}; //!< The maximum split bin size seen so far.
+        size_t max_id{};   //!< The ID of the technical bin with maximal size.
+        size_t max_size{}; //!< The maximum technical bin size seen so far.
 
         void update_max(size_t const new_id, size_t const new_size)
         {
@@ -50,52 +48,6 @@ class hierarchical_binning
                 max_size = new_size;
             }
         }
-
-        //!\brief Split cardinality `new_size` must already account for fpr-correction.
-        void update_split_max(size_t const new_id, size_t const new_size)
-        {
-            if (new_size > max_split_size)
-            {
-                max_split_id = new_id;
-                max_split_size = new_size;
-            }
-        }
-
-        /*!\brief Decides which bin is reported as the maximum bin.
-         *\param config The HIBF configuration.
-         *\return The chosen max bin id.
-         *
-         * As a HIBF feature, the merged bin FPR can differ from the overall maximum FPR. Merged bins in an HIBF layout
-         * will always be followed by one or more lower-level IBFs that will have split bins or single bins (split = 1)
-         * to recover the original user bins.
-         *
-         * We need to make sure, though, that downsizing merged bins does not affect split bins.
-         * Therefore, we check if choosing a merged bin as the max bin violates the minimum_bits needed for split bins.
-         * If so, we can report the largest split bin as the max bin as it will choose the correct size and downsize
-         * larger merged bins only a little.
-         */
-        size_t choose_max_bin(seqan::hibf::config const & config)
-        {
-            if (max_id == max_split_id) // Overall max bin is a split bin.
-                return max_id;
-
-            // Split cardinality `max_split_size` already accounts for fpr correction.
-            // The minimum size of the TBs of this IBF to ensure the maximum_false_positive_rate for split bins.
-            size_t const minimum_bits{build::bin_size_in_bits({.fpr = config.maximum_fpr,
-                                                               .hash_count = config.number_of_hash_functions,
-                                                               .elements = max_split_size})};
-
-            // The potential size of the TBs of this IBF given the allowed merged bin FPR.
-            size_t const merged_bits{build::bin_size_in_bits({.fpr = config.relaxed_fpr, //
-                                                              .hash_count = config.number_of_hash_functions,
-                                                              .elements = max_size})};
-
-            // If split and merged bits are the same, we prefer merged bins. Better for build parallelisation.
-            if ((minimum_bits > merged_bits))
-                return max_split_id;
-
-            return max_id;
-        }
     };
 
 public:

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -10,6 +10,7 @@ set (HIBF_SOURCE_FILES
      layout/layout.cpp
      layout/compute_fpr_correction.cpp
      layout/compute_layout.cpp
+     layout/compute_relaxed_fpr_correction.cpp
      sketch/compute_sketches.cpp
      layout/graph.cpp
      layout/hierarchical_binning.cpp

diff --git a/src/layout/compute_layout.cpp b/src/layout/compute_layout.cpp
@@ -9,15 +9,16 @@
 #include <utility>   // for move
 #include <vector>    // for vector
 
-#include <hibf/config.hpp>                        // for config
-#include <hibf/layout/compute_fpr_correction.hpp> // for compute_fpr_correction
-#include <hibf/layout/compute_layout.hpp>         // for compute_layout
-#include <hibf/layout/data_store.hpp>             // for data_store
-#include <hibf/layout/hierarchical_binning.hpp>   // for hierarchical_binning
-#include <hibf/layout/layout.hpp>                 // for layout
-#include <hibf/misc/iota_vector.hpp>              // for iota_vector
-#include <hibf/misc/timer.hpp>                    // for concurrent_timer
-#include <hibf/sketch/hyperloglog.hpp>            // for hyperloglog
+#include <hibf/config.hpp>                                // for config
+#include <hibf/layout/compute_fpr_correction.hpp>         // for compute_fpr_correction
+#include <hibf/layout/compute_layout.hpp>                 // for compute_layout
+#include <hibf/layout/compute_relaxed_fpr_correction.hpp> // for compute_relaxed_fpr_correction
+#include <hibf/layout/data_store.hpp>                     // for data_store
+#include <hibf/layout/hierarchical_binning.hpp>           // for hierarchical_binning
+#include <hibf/layout/layout.hpp>                         // for layout
+#include <hibf/misc/iota_vector.hpp>                      // for iota_vector
+#include <hibf/misc/timer.hpp>                            // for concurrent_timer
+#include <hibf/sketch/hyperloglog.hpp>                    // for hyperloglog
 
 namespace seqan::hibf::layout
 {
@@ -46,6 +47,10 @@ layout compute_layout(config const & config,
                                                    .hash_count = config.number_of_hash_functions,
                                                    .t_max = config.tmax});
 
+    store.relaxed_fpr_correction = compute_relaxed_fpr_correction({.fpr = config.maximum_fpr, //
+                                                                   .relaxed_fpr = config.relaxed_fpr,
+                                                                   .hash_count = config.number_of_hash_functions});
+
     store.hibf_layout->top_level_max_bin_id = seqan::hibf::layout::hierarchical_binning{store, config}.execute();
     union_estimation_timer = store.union_estimation_timer;
     rearrangement_timer = store.rearrangement_timer;

diff --git a/src/layout/compute_relaxed_fpr_correction.cpp b/src/layout/compute_relaxed_fpr_correction.cpp
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: 2006-2024, Knut Reinert & Freie Universität Berlin
+// SPDX-FileCopyrightText: 2016-2024, Knut Reinert & MPI für molekulare Genetik
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <cassert> // for assert
+#include <cmath>   // for log1p, exp, log
+
+#include <hibf/layout/compute_relaxed_fpr_correction.hpp> // for compute_fpr_correction
+
+namespace seqan::hibf::layout
+{
+
+double compute_relaxed_fpr_correction(relaxed_fpr_correction_parameters const & params)
+{
+    assert(params.fpr > 0.0 && params.fpr <= 1.0);
+    assert(params.relaxed_fpr > 0.0 && params.relaxed_fpr <= 1.0);
+    assert(params.hash_count > 0u);
+    assert(params.fpr <= params.relaxed_fpr);
+
+    double const numerator = std::log1p(-std::exp(std::log(params.fpr) / params.hash_count));
+    double const denominator = std::log1p(-std::exp(std::log(params.relaxed_fpr) / params.hash_count));
+    double const relaxed_fpr_correction = numerator / denominator;
+
+    assert(relaxed_fpr_correction > 0.0 && relaxed_fpr_correction <= 1.0);
+    return relaxed_fpr_correction;
+}
+
+} // namespace seqan::hibf::layout
diff --git a/src/layout/hierarchical_binning.cpp b/src/layout/hierarchical_binning.cpp
@@ -118,7 +118,7 @@ void hierarchical_binning::initialization(std::vector<std::vector<size_t>> & mat
         for (size_t j = 1; j < num_user_bins; ++j)
         {
             sum += (*data->kmer_counts)[data->positions[j]];
-            matrix[0][j] = data->union_estimates[j];
+            matrix[0][j] = data->union_estimates[j] * data->relaxed_fpr_correction;
             ll_matrix[0][j] = max_merge_levels(j + 1) * sum;
             trace[0][j] = {0u, j - 1}; // unnecessary?
         }
@@ -130,7 +130,7 @@ void hierarchical_binning::initialization(std::vector<std::vector<size_t>> & mat
             assert(j < data->positions.size());
             assert(data->positions[j] < data->kmer_counts->size());
             sum += (*data->kmer_counts)[data->positions[j]];
-            matrix[0][j] = sum;
+            matrix[0][j] = sum * data->relaxed_fpr_correction;
             ll_matrix[0][j] = max_merge_levels(j + 1) * sum;
             trace[0][j] = {0u, j - 1}; // unnecessary?
         }
@@ -206,12 +206,13 @@ void hierarchical_binning::recursion(std::vector<std::vector<size_t>> & matrix,
             size_t j_prime{j - 1};
             size_t weight{current_weight};
 
-            auto get_weight = [&]()
+            auto get_weight = [&]() -> size_t
             {
                 // if we use the union estimate we plug in that value instead of the sum (weight)
                 // union_estimates[j_prime] is the union of {j_prime, ..., j}
                 // the + 1 is necessary because j_prime is decremented directly after weight is updated
-                return config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1];
+                size_t const uncorrected = config.disable_estimate_union ? weight : data->union_estimates[j_prime + 1];
+                return data->relaxed_fpr_correction * uncorrected;
             };
 
             // if the user bin j-1 was not split into multiple technical bins!
@@ -278,7 +279,7 @@ void hierarchical_binning::backtrack_merged_bin(size_t trace_j,
     if (!config.disable_estimate_union)
         kmer_count = sketch.estimate(); // overwrite kmer_count high_level_max_id/size bin
 
-    max_tracker.update_max(bin_id, kmer_count);
+    max_tracker.update_max(bin_id, kmer_count * data->relaxed_fpr_correction);
     // std::cout << "]: " << kmer_count << std::endl;
 }
 
@@ -302,7 +303,6 @@ void hierarchical_binning::backtrack_split_bin(size_t trace_j,
     size_t const cardinality_per_bin = divide_and_ceil(corrected_cardinality, number_of_bins);
 
     max_tracker.update_max(bin_id, cardinality_per_bin);
-    max_tracker.update_split_max(bin_id, cardinality_per_bin);
 
     // std::cout << "split " << trace_j << " into " << number_of_bins << ": " << cardinality_per_bin << std::endl;
 }
@@ -359,7 +359,7 @@ size_t hierarchical_binning::backtracking(std::vector<std::vector<std::pair<size
         backtrack_split_bin(trace_j, trace_i + 1, bin_id, max_tracker);
     }
 
-    return max_tracker.choose_max_bin(config);
+    return max_tracker.max_id;
 }
 
 data_store hierarchical_binning::initialise_libf_data(size_t const trace_j) const
@@ -369,7 +369,8 @@ data_store hierarchical_binning::initialise_libf_data(size_t const trace_j) cons
                          .kmer_counts = data->kmer_counts,
                          .sketches = data->sketches,
                          .positions = {data->positions[trace_j]},
-                         .fpr_correction = data->fpr_correction};
+                         .fpr_correction = data->fpr_correction,
+                         .relaxed_fpr_correction = data->relaxed_fpr_correction};
 
     return libf_data;
 }