From d8d9ed20160c8772007689ed2c696108f7b5a539 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Sat, 14 Dec 2024 19:20:10 -0800 Subject: [PATCH] [r] add feature selection methods --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/singlecell_utils.R | 113 +++++++++++++++++++++++ r/man/select_features_by_dispersion.Rd | 42 +++++++++ r/man/select_features_by_mean.Rd | 34 +++++++ r/man/select_features_by_variance.Rd | 41 ++++++++ r/pkgdown/_pkgdown.yml | 3 + r/tests/testthat/test-singlecell_utils.R | 21 ++++- 8 files changed, 256 insertions(+), 1 deletion(-) create mode 100644 r/man/select_features_by_dispersion.Rd create mode 100644 r/man/select_features_by_mean.Rd create mode 100644 r/man/select_features_by_variance.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 622d3a88..c90e1a15 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -110,6 +110,8 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_by_mean) +export(select_features_by_variance) export(select_regions) export(set_trackplot_height) export(set_trackplot_label) diff --git a/r/NEWS.md b/r/NEWS.md index 890f2755..73ca847d 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ Contributions welcome :) If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) +- Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 0fa1711d..cd7bc554 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -6,6 +6,118 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. + +################# +# Feature selection +################# + +#' Get the most variable features within a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns +#' Return a dataframe with the following columns, sorted descending by variance: +#' - `names`: Feature name. +#' - `score`: Variance of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @details +#' Calculate using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find `num_feats` features with the highest variance across clusters. +#' @export +select_features_by_variance <- function( + mat, num_feats = 25000, + normalize = normalize_log, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' Get the features with the highest dispersion within a matrix. +#' @returns +#' Return a dataframe with the following columns, sorted descending by dispersion: +#' - `names`: Feature name. +#' - `score`: Variance of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @inheritParams select_features_by_variance +#' @details +#' Calculate using the following process: +#' 1. Perform an optional term frequency + log normalization, for each feature. +#' 2. Find the dispersion (variance/mean) of each feature. +#' 3. Find `num_feats` features with the highest dispersion. +select_features_by_dispersion <- function( + mat, num_feats = 25000, + normalize = normalize_log, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads) + features_df <- tibble::tibble( + names = rownames(mat), + score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + +#' Get the top features from a matrix, based on the mean accessibility of each feature. +#' @param num_feats Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @inheritParams select_features_by_variance +#' @returns +#' Return a dataframe with the following columns, sorted descending by mean accessibility: +#' - `names`: Feature name. +#' - `score`: Binarize sum of each feature. +#' - `highly_variable`: Logical vector of whether the feature is highly accessible by mean accessibility. +#' @details +#' Calculate using the following process: +#' 1. Get the sum of each binarized feature. +#' 2. Find `num_feats` features with the highest accessibility. +#' @export +select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(num_feats) + assert_greater_than_zero(num_feats) + assert_is(num_feats, "numeric") + num_feats <- min(max(num_feats, 0), nrow(mat)) + # get the sum of each feature, binarized + # get the top features + features_df <- tibble::tibble( + names = rownames(mat), + score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ] + ) %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) + return(features_df) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -70,6 +182,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each diff --git a/r/man/select_features_by_dispersion.Rd b/r/man/select_features_by_dispersion.Rd new file mode 100644 index 00000000..5d0bc177 --- /dev/null +++ b/r/man/select_features_by_dispersion.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_dispersion} +\alias{select_features_by_dispersion} +\title{Get the features with the highest dispersion within a matrix.} +\usage{ +select_features_by_dispersion( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by dispersion: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the features with the highest dispersion within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find the dispersion (variance/mean) of each feature. +\item Find \code{num_feats} features with the highest dispersion. +} +} diff --git a/r/man/select_features_by_mean.Rd b/r/man/select_features_by_mean.Rd new file mode 100644 index 00000000..c05b0acb --- /dev/null +++ b/r/man/select_features_by_mean.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_mean} +\alias{select_features_by_mean} +\title{Get the top features from a matrix, based on the mean accessibility of each feature.} +\usage{ +select_features_by_mean(mat, num_feats = 25000, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{Number of features to deem as highly accessible. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by mean accessibility: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Binarize sum of each feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly accessible by mean accessibility. +} +} +\description{ +Get the top features from a matrix, based on the mean accessibility of each feature. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Get the sum of each binarized feature. +\item Find \code{num_feats} features with the highest accessibility. +} +} diff --git a/r/man/select_features_by_variance.Rd b/r/man/select_features_by_variance.Rd new file mode 100644 index 00000000..b7cc375f --- /dev/null +++ b/r/man/select_features_by_variance.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{select_features_by_variance} +\alias{select_features_by_variance} +\title{Get the most variable features within a matrix.} +\usage{ +select_features_by_variance( + mat, + num_feats = 25000, + normalize = normalize_log, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +all features will be returned.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +Return a dataframe with the following columns, sorted descending by variance: +\itemize{ +\item \code{names}: Feature name. +\item \code{score}: Variance of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. +} +} +\description{ +Get the most variable features within a matrix. +} +\details{ +Calculate using the following process: +\enumerate{ +\item Perform an optional term frequency + log normalization, for each feature. +\item Find \code{num_feats} features with the highest variance across clusters. +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ae2772eb..526018e4 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -133,6 +133,9 @@ reference: - regress_out - normalize_log - normalize_tfidf + - select_features_by_variance + - select_features_by_dispersion + - select_features_by_mean - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 8bae8966..8117ed02 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,24 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -160,4 +178,5 @@ test_that("Pseudobulk aggregation works with multiple return types", { } } } -}) \ No newline at end of file +}) +