Skip to content

Commit

Permalink
Just use two separate parameters to control the HVG bound.
Browse files Browse the repository at this point in the history
There's no point trying to force them into a single parameter. Also
improved the documentation of all options.
  • Loading branch information
LTLA committed Nov 18, 2024
1 parent 6787665 commit 5f07906
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 21 deletions.
41 changes: 25 additions & 16 deletions include/scran_variances/choose_highly_variable_genes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ namespace scran_variances {
struct ChooseHighlyVariableGenesOptions {
/**
* Number of top genes to choose.
* The actual number of genes may be smaller, if `top` is greater than the number of genes in the dataset with statistics greater than `threshold`;
* or larger, if `ChooseHighlyVariableGenesOptions::keep_ties = true`.
* Note that the actual number of chosen genes may be:
*
* - smaller than `top`, if the latter is greater than the total number of genes in the dataset.
* - smaller than `top`, if `ChooseHighlyVariableGenesOptions::use_bound = true` and `top` is greater than the total number of genes in the dataset with statistics greater than `ChooseHighlyVariableGenesOptions::bound`.
* - larger than `top`, if `ChooseHighlyVariableGenesOptions::keep_ties = true` and there are multiple ties at the `top`-th chosen gene.
*/
size_t top = 4000;

Expand All @@ -30,15 +33,21 @@ struct ChooseHighlyVariableGenesOptions {
bool larger = true;

/**
* The first value specifies whether to consider an absolute bound on the statistic when choosing HVGs.
* The second value is a lower bound for the statistic, at or below which a gene will not be considered as highly variable even if it is among the top `top` genes.
* Whether to consider an absolute bound on the statistic when choosing HVGs.
* The value of the bound is determined by `ChooseHighlyVariableGenesOptions::bound`.
*/
bool use_bound = false;

/**
* A lower bound for the statistic, at or below which a gene will not be considered as highly variable even if it is among the top `top` genes.
* If `ChooseHighlyVariableGenesOptions::larger = false`, this is an upper bound instead.
* Only used if `ChooseHighlyVariableGenesOptions::use_bound = true`.
*/
std::pair<bool, double> bound = std::make_pair<bool, double>(false, 0);
double bound = 0;

/**
* Whether to keep all genes with statistics that are tied with the `top`-th gene.
* If `false`, ties are arbitrarily broken but the number of retained genes will not be greater than `top`.
* Whether to keep all genes with statistics that are tied with the `ChooseHighlyVariableGenesOptions::top`-th gene.
* If `false`, ties are arbitrarily broken but the number of retained genes will not be greater than `ChooseHighlyVariableGenesOptions::top`.
*/
bool keep_ties = true;
};
Expand Down Expand Up @@ -71,9 +80,9 @@ void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* out
return;
}

Stat_ bound = options.bound.second;
Stat_ bound = options.bound;
if (options.top >= n) {
if (options.bound.first) {
if (options.use_bound) {
for (size_t i = 0; i < n; ++i) {
output[i] = cmp(statistic[i], bound);
}
Expand All @@ -87,7 +96,7 @@ void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* out
Stat_ threshold = statistic[collected[options.top - 1]];

if (options.keep_ties) {
if (options.bound.first && !cmp(threshold, bound)) {
if (options.use_bound && !cmp(threshold, bound)) {
for (size_t i = 0; i < n; ++i) {
output[i] = cmp(statistic[i], bound);
}
Expand All @@ -101,7 +110,7 @@ void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* out

std::fill_n(output, n, false);
size_t counter = options.top;
if (options.bound.first && !cmp(threshold, bound)) {
if (options.use_bound && !cmp(threshold, bound)) {
--counter;
while (counter > 0) {
--counter;
Expand All @@ -124,11 +133,11 @@ std::vector<Index_> choose_highly_variable_genes_index(size_t n, const Stat_* st
return output;
}

Stat_ bound = options.bound.second;
Stat_ bound = options.bound;
if (options.top >= n) {
if (options.bound.first) {
if (options.use_bound) {
for (size_t i = 0; i < n; ++i) {
if (options.bound.first && cmp(statistic[i], bound)) {
if (options.use_bound && cmp(statistic[i], bound)) {
output.push_back(i);
}
}
Expand All @@ -144,7 +153,7 @@ std::vector<Index_> choose_highly_variable_genes_index(size_t n, const Stat_* st

if (options.keep_ties) {
output.clear();
if (options.bound.first && !cmp(threshold, bound)) {
if (options.use_bound && !cmp(threshold, bound)) {
for (size_t i = 0; i < n; ++i) {
if (cmp(statistic[i], bound)) {
output.push_back(i);
Expand All @@ -161,7 +170,7 @@ std::vector<Index_> choose_highly_variable_genes_index(size_t n, const Stat_* st
}

size_t counter = options.top;
if (options.bound.first && !cmp(threshold, bound)) {
if (options.use_bound && !cmp(threshold, bound)) {
--counter;
while (counter > 0) {
--counter;
Expand Down
13 changes: 8 additions & 5 deletions tests/src/choose_highly_variable_genes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ TEST_P(ChooseHvgsTest, Basic) {

scran_variances::ChooseHighlyVariableGenesOptions opt;
opt.top = ntop;
opt.bound = bound;
opt.use_bound = bound.first;
opt.bound = bound.second;
auto output = scran_variances::choose_highly_variable_genes(ngenes, x.data(), opt);

// Checking that everything is in order.
Expand Down Expand Up @@ -127,7 +128,8 @@ TEST(ChooseHvgs, Ties) {
compare_bool_with_index(output, ioutput);

// Introducing a bound.
opt.bound = { true, 3.3 };
opt.use_bound = true;
opt.bound = 3.3;
auto outputb = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt);
EXPECT_EQ(2, std::accumulate(outputb.begin(), outputb.end(), 0));

Expand All @@ -136,7 +138,7 @@ TEST(ChooseHvgs, Ties) {

// Keeping all ties.
opt.keep_ties = true;
opt.bound.first = false;
opt.use_bound = false;
auto output2 = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt);
EXPECT_LT(opt.top, std::accumulate(output2.begin(), output2.end(), 0));

Expand All @@ -157,7 +159,8 @@ TEST(ChooseHvgs, Ties) {
compare_bool_with_index(output, ioutput);

// Introducing a bound.
opt.bound = { true, 1.1 };
opt.use_bound = true;
opt.bound = 1.1;
auto outputb = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt);
EXPECT_EQ(1, std::accumulate(outputb.begin(), outputb.end(), 0));

Expand All @@ -166,7 +169,7 @@ TEST(ChooseHvgs, Ties) {

// Keeping all ties.
opt.keep_ties = true;
opt.bound.first = false;
opt.use_bound = false;
auto output2 = scran_variances::choose_highly_variable_genes(x.size(), x.data(), opt);
EXPECT_LT(opt.top, std::accumulate(output2.begin(), output2.end(), 0));

Expand Down

0 comments on commit 5f07906

Please sign in to comment.