From d8d9ed20160c8772007689ed2c696108f7b5a539 Mon Sep 17 00:00:00 2001
From: immanuelazn <immanuelazn@gmail.com>
Date: Sat, 14 Dec 2024 19:20:10 -0800
Subject: [PATCH] [r] add feature selection methods

---
 r/NAMESPACE                              |   2 +
 r/NEWS.md                                |   1 +
 r/R/singlecell_utils.R                   | 113 +++++++++++++++++++++++
 r/man/select_features_by_dispersion.Rd   |  42 +++++++++
 r/man/select_features_by_mean.Rd         |  34 +++++++
 r/man/select_features_by_variance.Rd     |  41 ++++++++
 r/pkgdown/_pkgdown.yml                   |   3 +
 r/tests/testthat/test-singlecell_utils.R |  21 ++++-
 8 files changed, 256 insertions(+), 1 deletion(-)
 create mode 100644 r/man/select_features_by_dispersion.Rd
 create mode 100644 r/man/select_features_by_mean.Rd
 create mode 100644 r/man/select_features_by_variance.Rd

diff --git a/r/NAMESPACE b/r/NAMESPACE
index 622d3a88..c90e1a15 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -110,6 +110,8 @@ export(rowVars.default)
 export(sctransform_pearson)
 export(select_cells)
 export(select_chromosomes)
+export(select_features_by_mean)
+export(select_features_by_variance)
 export(select_regions)
 export(set_trackplot_height)
 export(set_trackplot_label)
diff --git a/r/NEWS.md b/r/NEWS.md
index 890f2755..73ca847d 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -23,6 +23,7 @@ Contributions welcome :)
   If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
 - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
 - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)
+- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169)
 
 ## Improvements
 - `trackplot_loop()` now accepts discrete color scales
diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R
index 0fa1711d..cd7bc554 100644
--- a/r/R/singlecell_utils.R
+++ b/r/R/singlecell_utils.R
@@ -6,6 +6,118 @@
 # option. This file may not be copied, modified, or distributed
 # except according to those terms.
 
+
+#################
+# Feature selection
+#################
+
+#' Get the most variable features within a matrix.
+#' @param mat (IterableMatrix) dimensions features x cells
+#' @param num_feats (integer) Number of features to return.  If the number is higher than the number of features in the matrix, 
+#' all features will be returned.
+#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed.
+#' @param threads (integer) Number of threads to use.
+#' @returns
+#' Return a dataframe with the following columns, sorted descending by variance:
+#' - `names`: Feature name.
+#' - `score`: Variance of the feature.
+#' - `highly_variable`: Logical vector of whether the feature is highly variable.
+#' @details 
+#' Calculate using the following process:
+#'  1. Perform an optional term frequency + log normalization, for each feature.
+#'  2. Find `num_feats` features with the highest variance across clusters.
+#' @export
+select_features_by_variance <- function(
+  mat, num_feats = 25000, 
+  normalize = normalize_log,
+  threads = 1L
+) {
+  assert_is(mat, "IterableMatrix")
+  assert_greater_than_zero(num_feats)
+  assert_is_wholenumber(num_feats)
+  assert_len(num_feats, 1)
+  assert_is(num_feats, "numeric")
+  num_feats <- min(max(num_feats, 0), nrow(mat))
+
+  if (!is.null(normalize)) mat <- normalize(mat, threads = threads)
+  features_df <- tibble::tibble(
+    names = rownames(mat),
+    score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",]
+  ) %>% 
+    dplyr::arrange(desc(score)) %>% 
+    dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
+  return(features_df)
+}
+
+
+#' Get the features with the highest dispersion within a matrix.
+#' @returns
+#' Return a dataframe with the following columns, sorted descending by dispersion:
+#'   - `names`: Feature name.
+#'   - `score`: Variance of the feature.
+#'   - `highly_variable`: Logical vector of whether the feature is highly variable.
+#' @inheritParams select_features_by_variance
+#' @details 
+#' Calculate using the following process:
+#'  1. Perform an optional term frequency + log normalization, for each feature.
+#'  2. Find the dispersion (variance/mean) of each feature.
+#'  3. Find `num_feats` features with the highest dispersion.
+select_features_by_dispersion <- function(
+  mat, num_feats = 25000, 
+  normalize = normalize_log,
+  threads = 1L
+) {
+  assert_is(mat, "IterableMatrix")
+  assert_greater_than_zero(num_feats)
+  assert_is_wholenumber(num_feats)
+  assert_len(num_feats, 1)
+  assert_is(num_feats, "numeric")
+  num_feats <- min(max(num_feats, 0), nrow(mat))
+
+  if (!is.null(normalize)) mat <- normalize(mat, threads = threads)
+  mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads)
+  features_df <- tibble::tibble(
+    names = rownames(mat),
+    score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ]
+  ) %>% 
+    dplyr::arrange(desc(score)) %>% 
+    dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
+  return(features_df)
+}
+
+
+#' Get the top features from a matrix, based on the mean accessibility of each feature.
+#' @param num_feats Number of features to deem as highly accessible.  If the number is higher than the number of features in the matrix,
+#' all features will be returned.
+#' @inheritParams select_features_by_variance
+#' @returns
+#' Return a dataframe with the following columns, sorted descending by mean accessibility:
+#'   - `names`: Feature name.
+#'   - `score`: Binarize sum of each feature.
+#'   - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility.
+#' @details 
+#' Calculate using the following process:
+#' 1. Get the sum of each binarized feature.
+#' 2. Find `num_feats` features with the highest accessibility.
+#' @export
+select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) {
+  assert_is(mat, "IterableMatrix")
+  assert_is_wholenumber(num_feats)
+  assert_greater_than_zero(num_feats)
+  assert_is(num_feats, "numeric")
+  num_feats <- min(max(num_feats, 0), nrow(mat))
+  # get the sum of each feature, binarized
+  # get the top features
+  features_df <- tibble::tibble(
+    names = rownames(mat),
+    score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ]
+  ) %>%
+    dplyr::arrange(desc(score)) %>%
+    dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
+  return(features_df)
+}
+
+
 #' Test for marker features
 #'
 #' Given a features x cells matrix, perform one-vs-all differential
@@ -70,6 +182,7 @@ marker_features <- function(mat, groups, method="wilcoxon") {
     )
 }
 
+
 #' Aggregate counts matrices by cell group or feature.
 #'
 #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each
diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd
new file mode 100644
index 00000000..5d0bc177
--- /dev/null
+++ b/r/man/select_features_by_dispersion.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/singlecell_utils.R
+\name{select_features_by_dispersion}
+\alias{select_features_by_dispersion}
+\title{Get the features with the highest dispersion within a matrix.}
+\usage{
+select_features_by_dispersion(
+  mat,
+  num_feats = 25000,
+  normalize = normalize_log,
+  threads = 1L
+)
+}
+\arguments{
+\item{mat}{(IterableMatrix) dimensions features x cells}
+
+\item{num_feats}{(integer) Number of features to return.  If the number is higher than the number of features in the matrix,
+all features will be returned.}
+
+\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.}
+
+\item{threads}{(integer) Number of threads to use.}
+}
+\value{
+Return a dataframe with the following columns, sorted descending by dispersion:
+\itemize{
+\item \code{names}: Feature name.
+\item \code{score}: Variance of the feature.
+\item \code{highly_variable}: Logical vector of whether the feature is highly variable.
+}
+}
+\description{
+Get the features with the highest dispersion within a matrix.
+}
+\details{
+Calculate using the following process:
+\enumerate{
+\item Perform an optional term frequency + log normalization, for each feature.
+\item Find the dispersion (variance/mean) of each feature.
+\item Find \code{num_feats} features with the highest dispersion.
+}
+}
diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd
new file mode 100644
index 00000000..c05b0acb
--- /dev/null
+++ b/r/man/select_features_by_mean.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/singlecell_utils.R
+\name{select_features_by_mean}
+\alias{select_features_by_mean}
+\title{Get the top features from a matrix, based on the mean accessibility of each feature.}
+\usage{
+select_features_by_mean(mat, num_feats = 25000, threads = 1L)
+}
+\arguments{
+\item{mat}{(IterableMatrix) dimensions features x cells}
+
+\item{num_feats}{Number of features to deem as highly accessible.  If the number is higher than the number of features in the matrix,
+all features will be returned.}
+
+\item{threads}{(integer) Number of threads to use.}
+}
+\value{
+Return a dataframe with the following columns, sorted descending by mean accessibility:
+\itemize{
+\item \code{names}: Feature name.
+\item \code{score}: Binarize sum of each feature.
+\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility.
+}
+}
+\description{
+Get the top features from a matrix, based on the mean accessibility of each feature.
+}
+\details{
+Calculate using the following process:
+\enumerate{
+\item Get the sum of each binarized feature.
+\item Find \code{num_feats} features with the highest accessibility.
+}
+}
diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd
new file mode 100644
index 00000000..b7cc375f
--- /dev/null
+++ b/r/man/select_features_by_variance.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/singlecell_utils.R
+\name{select_features_by_variance}
+\alias{select_features_by_variance}
+\title{Get the most variable features within a matrix.}
+\usage{
+select_features_by_variance(
+  mat,
+  num_feats = 25000,
+  normalize = normalize_log,
+  threads = 1L
+)
+}
+\arguments{
+\item{mat}{(IterableMatrix) dimensions features x cells}
+
+\item{num_feats}{(integer) Number of features to return.  If the number is higher than the number of features in the matrix,
+all features will be returned.}
+
+\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.}
+
+\item{threads}{(integer) Number of threads to use.}
+}
+\value{
+Return a dataframe with the following columns, sorted descending by variance:
+\itemize{
+\item \code{names}: Feature name.
+\item \code{score}: Variance of the feature.
+\item \code{highly_variable}: Logical vector of whether the feature is highly variable.
+}
+}
+\description{
+Get the most variable features within a matrix.
+}
+\details{
+Calculate using the following process:
+\enumerate{
+\item Perform an optional term frequency + log normalization, for each feature.
+\item Find \code{num_feats} features with the highest variance across clusters.
+}
+}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index ae2772eb..526018e4 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -133,6 +133,9 @@ reference:
   - regress_out
   - normalize_log
   - normalize_tfidf
+  - select_features_by_variance
+  - select_features_by_dispersion
+  - select_features_by_mean
   - IterableMatrix-methods
   - pseudobulk_matrix
 
diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R
index 8bae8966..8117ed02 100644
--- a/r/tests/testthat/test-singlecell_utils.R
+++ b/r/tests/testthat/test-singlecell_utils.R
@@ -13,6 +13,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val =
   as(m, "dgCMatrix")
 }
 
+test_that("select_features works general case", {
+  m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix")
+  for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) {
+    res <- do.call(fn, list(m1, num_feats = 10))
+    expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting
+    expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable
+    expect_setequal(res$names, rownames(m1))
+    res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows
+    res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) 
+    expect_identical(res_more_feats_than_rows, res_feats_equal_rows)
+    if (fn != "select_features_by_mean") {
+      # Check that normalization actually does something
+      res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL))
+      expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score))
+    }
+  }
+})
+
 test_that("Wilcoxon rank sum works (small)", {
     x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46)
     y <- c(1.15, 0.88, 0.90, 0.74, 1.21)
@@ -160,4 +178,5 @@ test_that("Pseudobulk aggregation works with multiple return types", {
       }
     }
   }
-})
\ No newline at end of file
+})
+