From 5f079069de30414bb3f1117575d6911bc5ad17b9 Mon Sep 17 00:00:00 2001 From: LTLA Date: Mon, 18 Nov 2024 08:44:59 -0800 Subject: [PATCH] Just use two separate parameters to control the HVG bound. There's no point trying to force them into a single parameter. Also improved the documentation of all options. --- .../choose_highly_variable_genes.hpp | 41 +++++++++++-------- tests/src/choose_highly_variable_genes.cpp | 13 +++--- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/include/scran_variances/choose_highly_variable_genes.hpp b/include/scran_variances/choose_highly_variable_genes.hpp index f745719..a4cb830 100644 --- a/include/scran_variances/choose_highly_variable_genes.hpp +++ b/include/scran_variances/choose_highly_variable_genes.hpp @@ -19,8 +19,11 @@ namespace scran_variances { struct ChooseHighlyVariableGenesOptions { /** * Number of top genes to choose. - * The actual number of genes may be smaller, if `top` is greater than the number of genes in the dataset with statistics greater than `threshold`; - * or larger, if `ChooseHighlyVariableGenesOptions::keep_ties = true`. + * Note that the actual number of chosen genes may be: + * + * - smaller than `top`, if the latter is greater than the total number of genes in the dataset. + * - smaller than `top`, if `ChooseHighlyVariableGenesOptions::use_bound = true` and `top` is greater than the total number of genes in the dataset with statistics greater than `ChooseHighlyVariableGenesOptions::bound`. + * - larger than `top`, if `ChooseHighlyVariableGenesOptions::keep_ties = true` and there are multiple ties at the `top`-th chosen gene. */ size_t top = 4000; @@ -30,15 +33,21 @@ struct ChooseHighlyVariableGenesOptions { bool larger = true; /** - * The first value specifies whether to consider an absolute bound on the statistic when choosing HVGs. - * The second value is a lower bound for the statistic, at or below which a gene will not be considered as highly variable even if it is among the top `top` genes. + * Whether to consider an absolute bound on the statistic when choosing HVGs. + * The value of the bound is determined by `ChooseHighlyVariableGenesOptions::bound`. + */ + bool use_bound = false; + + /** + * A lower bound for the statistic, at or below which a gene will not be considered as highly variable even if it is among the top `top` genes. * If `ChooseHighlyVariableGenesOptions::larger = false`, this is an upper bound instead. + * Only used if `ChooseHighlyVariableGenesOptions::use_bound = true`. */ - std::pair bound = std::make_pair(false, 0); + double bound = 0; /** - * Whether to keep all genes with statistics that are tied with the `top`-th gene. - * If `false`, ties are arbitrarily broken but the number of retained genes will not be greater than `top`. + * Whether to keep all genes with statistics that are tied with the `ChooseHighlyVariableGenesOptions::top`-th gene. + * If `false`, ties are arbitrarily broken but the number of retained genes will not be greater than `ChooseHighlyVariableGenesOptions::top`. */ bool keep_ties = true; }; @@ -71,9 +80,9 @@ void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* out return; } - Stat_ bound = options.bound.second; + Stat_ bound = options.bound; if (options.top >= n) { - if (options.bound.first) { + if (options.use_bound) { for (size_t i = 0; i < n; ++i) { output[i] = cmp(statistic[i], bound); } @@ -87,7 +96,7 @@ void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* out Stat_ threshold = statistic[collected[options.top - 1]]; if (options.keep_ties) { - if (options.bound.first && !cmp(threshold, bound)) { + if (options.use_bound && !cmp(threshold, bound)) { for (size_t i = 0; i < n; ++i) { output[i] = cmp(statistic[i], bound); } @@ -101,7 +110,7 @@ void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* out std::fill_n(output, n, false); size_t counter = options.top; - if (options.bound.first && !cmp(threshold, bound)) { + if (options.use_bound && !cmp(threshold, bound)) { --counter; while (counter > 0) { --counter; @@ -124,11 +133,11 @@ std::vector choose_highly_variable_genes_index(size_t n, const Stat_* st return output; } - Stat_ bound = options.bound.second; + Stat_ bound = options.bound; if (options.top >= n) { - if (options.bound.first) { + if (options.use_bound) { for (size_t i = 0; i < n; ++i) { - if (options.bound.first && cmp(statistic[i], bound)) { + if (options.use_bound && cmp(statistic[i], bound)) { output.push_back(i); } } @@ -144,7 +153,7 @@ std::vector choose_highly_variable_genes_index(size_t n, const Stat_* st if (options.keep_ties) { output.clear(); - if (options.bound.first && !cmp(threshold, bound)) { + if (options.use_bound && !cmp(threshold, bound)) { for (size_t i = 0; i < n; ++i) { if (cmp(statistic[i], bound)) { output.push_back(i); @@ -161,7 +170,7 @@ std::vector choose_highly_variable_genes_index(size_t n, const Stat_* st } size_t counter = options.top; - if (options.bound.first && !cmp(threshold, bound)) { + if (options.use_bound && !cmp(threshold, bound)) { --counter; while (counter > 0) { --counter; diff --git a/tests/src/choose_highly_variable_genes.cpp b/tests/src/choose_highly_variable_genes.cpp index c7de6b8..6679a03 100644 --- a/tests/src/choose_highly_variable_genes.cpp +++ b/tests/src/choose_highly_variable_genes.cpp @@ -32,7 +32,8 @@ TEST_P(ChooseHvgsTest, Basic) { scran_variances::ChooseHighlyVariableGenesOptions opt; opt.top = ntop; - opt.bound = bound; + opt.use_bound = bound.first; + opt.bound = bound.second; auto output = scran_variances::choose_highly_variable_genes(ngenes, x.data(), opt); // Checking that everything is in order. @@ -127,7 +128,8 @@ TEST(ChooseHvgs, Ties) { compare_bool_with_index(output, ioutput); // Introducing a bound. - opt.bound = { true, 3.3 }; + opt.use_bound = true; + opt.bound = 3.3; auto outputb = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt); EXPECT_EQ(2, std::accumulate(outputb.begin(), outputb.end(), 0)); @@ -136,7 +138,7 @@ TEST(ChooseHvgs, Ties) { // Keeping all ties. opt.keep_ties = true; - opt.bound.first = false; + opt.use_bound = false; auto output2 = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt); EXPECT_LT(opt.top, std::accumulate(output2.begin(), output2.end(), 0)); @@ -157,7 +159,8 @@ TEST(ChooseHvgs, Ties) { compare_bool_with_index(output, ioutput); // Introducing a bound. - opt.bound = { true, 1.1 }; + opt.use_bound = true; + opt.bound = 1.1; auto outputb = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt); EXPECT_EQ(1, std::accumulate(outputb.begin(), outputb.end(), 0)); @@ -166,7 +169,7 @@ TEST(ChooseHvgs, Ties) { // Keeping all ties. opt.keep_ties = true; - opt.bound.first = false; + opt.use_bound = false; auto output2 = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt); EXPECT_LT(opt.top, std::accumulate(output2.begin(), output2.end(), 0));