Skip to content

Commit

Permalink
Set an absolute lower bound on the statistic when choosing HVGs. (#1)
Browse files Browse the repository at this point in the history
This ensures that we don't choose genes with a negative residual, even if they
lie within the top set of genes because the residuals are relatively highly ranked.
  • Loading branch information
LTLA authored Oct 7, 2024
1 parent da3b9cb commit 6787665
Show file tree
Hide file tree
Showing 3 changed files with 235 additions and 78 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.14)

project(scran_variances
VERSION 0.1.2
VERSION 0.1.3
DESCRIPTION "Model per-gene variances in single-cell expression"
LANGUAGES CXX)

Expand Down
242 changes: 175 additions & 67 deletions include/scran_variances/choose_highly_variable_genes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ namespace scran_variances {
struct ChooseHighlyVariableGenesOptions {
/**
* Number of top genes to choose.
* This should be positive.
* The actual number of genes may be smaller, if there are fewer genes in the dataset than `top`;
* The actual number of genes may be smaller, if `top` is greater than the number of genes in the dataset with statistics greater than `threshold`;
* or larger, if `ChooseHighlyVariableGenesOptions::keep_ties = true`.
*/
size_t top = 4000;
Expand All @@ -30,6 +29,13 @@ struct ChooseHighlyVariableGenesOptions {
*/
bool larger = true;

/**
* The first value specifies whether to consider an absolute bound on the statistic when choosing HVGs.
* The second value is a lower bound for the statistic, at or below which a gene will not be considered as highly variable even if it is among the top `top` genes.
* If `ChooseHighlyVariableGenesOptions::larger = false`, this is an upper bound instead.
*/
std::pair<bool, double> bound = std::make_pair<bool, double>(false, 0);

/**
* Whether to keep all genes with statistics that are tied with the `top`-th gene.
* If `false`, ties are arbitrarily broken but the number of retained genes will not be greater than `top`.
Expand All @@ -38,51 +44,175 @@ struct ChooseHighlyVariableGenesOptions {
};

/**
* @tparam Stat_ Type of the variance statistic.
* @tparam Bool_ Type to be used as a boolean.
*
* @param n Number of genes.
* @param[in] statistic Pointer to an array of length `n` containing the per-gene variance statistics.
* @param[out] output Pointer to an array of length `n`.
* On output, this is filled with `true` if the gene is to be retained and `false` otherwise.
* @param options Further options.
* @cond
*/
template<typename Stat_, typename Bool_>
void choose_highly_variable_genes(size_t n, const Stat_* statistic, Bool_* output, const ChooseHighlyVariableGenesOptions& options) {
namespace internal {

template<typename Index_, typename Stat_, class Cmp_>
std::vector<Index_> create_semisorted_indices(size_t n, const Stat_* statistic, Cmp_ cmp, size_t top) {
std::vector<Index_> collected(n);
std::iota(collected.begin(), collected.end(), static_cast<Index_>(0));
auto cBegin = collected.begin(), cMid = cBegin + top - 1, cEnd = collected.end();
std::nth_element(cBegin, cMid, cEnd, [&](Index_ l, Index_ r) -> bool {
auto L = statistic[l], R = statistic[r];
if (L == R) {
return l < r; // always favor the earlier index for a stable sort, even if options.larger = false.
} else {
return cmp(L, R);
}
});
return collected;
}

template<typename Stat_, class Output_, class Cmp_, class CmpEqual_>
void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* output, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
if (options.top == 0) {
std::fill_n(output, n, false);
return;
}

Stat_ bound = options.bound.second;
if (options.top >= n) {
std::fill_n(output, n, true);
if (options.bound.first) {
for (size_t i = 0; i < n; ++i) {
output[i] = cmp(statistic[i], bound);
}
} else {
std::fill_n(output, n, true);
}
return;
} else if (options.top == 0) {
std::fill_n(output, n, false);
}

auto collected = create_semisorted_indices<size_t>(n, statistic, cmp, options.top);
Stat_ threshold = statistic[collected[options.top - 1]];

if (options.keep_ties) {
if (options.bound.first && !cmp(threshold, bound)) {
for (size_t i = 0; i < n; ++i) {
output[i] = cmp(statistic[i], bound);
}
} else {
for (size_t i = 0; i < n; ++i) {
output[i] = cmpeq(statistic[i], threshold);
}
}
return;
}

std::vector<size_t> collected(n);
std::iota(collected.begin(), collected.end(), static_cast<size_t>(0));
auto cBegin = collected.begin(), cMid = cBegin + options.top - 1, cEnd = collected.end();
if (options.larger) {
std::nth_element(cBegin, cMid, cEnd, [&](size_t l, size_t r) -> bool { return statistic[l] > statistic[r]; });
} else {
std::nth_element(cBegin, cMid, cEnd, [&](size_t l, size_t r) -> bool { return statistic[l] < statistic[r]; });
std::fill_n(output, n, false);
size_t counter = options.top;
if (options.bound.first && !cmp(threshold, bound)) {
--counter;
while (counter > 0) {
--counter;
if (cmp(statistic[collected[counter]], bound)) {
++counter;
break;
}
}
}

if (!options.keep_ties) {
std::fill_n(output, n, false);
for (size_t i = 0; i < options.top; ++i) {
output[collected[i]] = true;
for (size_t i = 0; i < counter; ++i) {
output[collected[i]] = true;
}
}

template<typename Index_, typename Stat_, class Cmp_, class CmpEqual_>
std::vector<Index_> choose_highly_variable_genes_index(size_t n, const Stat_* statistic, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
std::vector<Index_> output;
if (options.top == 0) {
return output;
}

Stat_ bound = options.bound.second;
if (options.top >= n) {
if (options.bound.first) {
for (size_t i = 0; i < n; ++i) {
if (options.bound.first && cmp(statistic[i], bound)) {
output.push_back(i);
}
}
} else {
output.resize(n);
std::iota(output.begin(), output.end(), static_cast<Index_>(0));
}
} else {
auto threshold = statistic[collected[options.top - 1]];
if (options.larger) {
return output;
}

output = create_semisorted_indices<Index_>(n, statistic, cmp, options.top);
Stat_ threshold = statistic[output[options.top - 1]];

if (options.keep_ties) {
output.clear();
if (options.bound.first && !cmp(threshold, bound)) {
for (size_t i = 0; i < n; ++i) {
output[i] = statistic[i] >= threshold;
if (cmp(statistic[i], bound)) {
output.push_back(i);
}
}
} else {
for (size_t i = 0; i < n; ++i) {
output[i] = statistic[i] <= threshold;
if (cmpeq(statistic[i], threshold)) {
output.push_back(i);
}
}
}
return output;
}

size_t counter = options.top;
if (options.bound.first && !cmp(threshold, bound)) {
--counter;
while (counter > 0) {
--counter;
if (cmp(statistic[output[counter]], bound)) {
++counter;
break;
}
}
}

output.resize(counter);
std::sort(output.begin(), output.end());
return output;
}

}
/**
* @endcond
*/

/**
* @tparam Stat_ Type of the variance statistic.
* @tparam Bool_ Type to be used as a boolean.
*
* @param n Number of genes.
* @param[in] statistic Pointer to an array of length `n` containing the per-gene variance statistics.
* @param[out] output Pointer to an array of length `n`.
* On output, this is filled with `true` if the gene is to be retained and `false` otherwise.
* @param options Further options.
*/
template<typename Stat_, typename Bool_>
void choose_highly_variable_genes(size_t n, const Stat_* statistic, Bool_* output, const ChooseHighlyVariableGenesOptions& options) {
if (options.larger) {
internal::choose_highly_variable_genes(
n,
statistic,
output,
[](Stat_ l, Stat_ r) -> bool { return l > r; },
[](Stat_ l, Stat_ r) -> bool { return l >= r; },
options
);
} else {
internal::choose_highly_variable_genes(
n,
statistic,
output,
[](Stat_ l, Stat_ r) -> bool { return l < r; },
[](Stat_ l, Stat_ r) -> bool { return l <= r; },
options
);
}
}

/**
Expand Down Expand Up @@ -115,45 +245,23 @@ std::vector<Bool_> choose_highly_variable_genes(size_t n, const Stat_* statistic
*/
template<typename Index_, typename Stat_>
std::vector<Index_> choose_highly_variable_genes_index(Index_ n, const Stat_* statistic, const ChooseHighlyVariableGenesOptions& options) {
if (options.top >= static_cast<size_t>(n)) {
std::vector<Index_> output(n);
std::iota(output.begin(), output.end(), static_cast<Index_>(0));
return output;
} else if (options.top == 0) {
return std::vector<Index_>();
}

std::vector<Index_> collected(n);
std::iota(collected.begin(), collected.end(), static_cast<Index_>(0));
auto cBegin = collected.begin(), cMid = cBegin + options.top - 1, cEnd = collected.end();
if (options.larger) {
std::nth_element(cBegin, cMid, cEnd, [&](size_t l, size_t r) -> bool { return statistic[l] > statistic[r]; });
return internal::choose_highly_variable_genes_index<Index_>(
n,
statistic,
[](Stat_ l, Stat_ r) -> bool { return l > r; },
[](Stat_ l, Stat_ r) -> bool { return l >= r; },
options
);
} else {
std::nth_element(cBegin, cMid, cEnd, [&](size_t l, size_t r) -> bool { return statistic[l] < statistic[r]; });
return internal::choose_highly_variable_genes_index<Index_>(
n,
statistic,
[](Stat_ l, Stat_ r) -> bool { return l < r; },
[](Stat_ l, Stat_ r) -> bool { return l <= r; },
options
);
}

if (!options.keep_ties) {
collected.resize(options.top);
std::sort(collected.begin(), collected.end());
} else {
auto threshold = statistic[collected[options.top - 1]];
collected.clear();
if (options.larger) {
for (Index_ i = 0; i < n; ++i) {
if (statistic[i] >= threshold) {
collected.push_back(i);
}
}
} else {
for (Index_ i = 0; i < n; ++i) {
if (statistic[i] <= threshold) {
collected.push_back(i);
}
}
}
}

return collected;
}

}
Expand Down
Loading

0 comments on commit 6787665

Please sign in to comment.