Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to the latest version of the singlepp library. #274

Merged
merged 11 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SingleR
Title: Reference-Based Single-Cell RNA-Seq Annotation
Version: 2.7.1
Date: 2024-05-22
Version: 2.7.2
Date: 2024-09-06
Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
person("Daniel", "Bunis", role="ctb"),
Expand All @@ -20,6 +20,7 @@ Imports:
DelayedMatrixStats,
BiocParallel,
BiocSingular,
BiocNeighbors,
stats,
utils,
Rcpp,
Expand All @@ -28,6 +29,7 @@ Imports:
LinkingTo:
Rcpp,
beachmat,
assorthead,
BiocNeighbors
Suggests:
testthat,
Expand Down Expand Up @@ -57,6 +59,6 @@ biocViews:
SystemRequirements: C++17
VignetteBuilder: knitr
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
URL: https://github.com/LTLA/SingleR
BugReports: https://support.bioconductor.org/
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(S4Vectors,DataFrame)
importClassesFrom(S4Vectors,List)
importClassesFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(BiocNeighbors,AnnoyParam)
importFrom(BiocNeighbors,KmknnParam)
importFrom(BiocNeighbors,defineBuilder)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpisup)
importFrom(BiocParallel,bpmapply)
Expand Down
32 changes: 16 additions & 16 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

classify_integrated <- function(test, results, integrated_build, quantile, nthreads) {
.Call('_SingleR_classify_integrated', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
}

#' @importFrom Rcpp sourceCpp
#' @useDynLib SingleR
classify_single <- function(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
.Call('_SingleR_classify_single', PACKAGE = 'SingleR', test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
}

find_classic_markers <- function(nlabels, ngenes, labels, ref, de_n, nthreads) {
.Call('_SingleR_find_classic_markers', PACKAGE = 'SingleR', nlabels, ngenes, labels, ref, de_n, nthreads)
}
Expand All @@ -9,31 +19,21 @@ grouped_medians <- function(ref, groups, ngroups, nthreads) {
.Call('_SingleR_grouped_medians', PACKAGE = 'SingleR', ref, groups, ngroups, nthreads)
}

integrate_build <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
.Call('_SingleR_integrate_build', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
}

integrate_run <- function(test, results, integrated_build, quantile, nthreads) {
.Call('_SingleR_integrate_run', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
train_integrated <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
.Call('_SingleR_train_integrated', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
}

#' @importFrom Rcpp sourceCpp
#' @useDynLib SingleR
prebuild <- function(ref, labels, markers, approximate, nthreads) {
.Call('_SingleR_prebuild', PACKAGE = 'SingleR', ref, labels, markers, approximate, nthreads)
train_single <- function(test_features, ref, ref_features, labels, markers, builder, nthreads) {
.Call('_SingleR_train_single', PACKAGE = 'SingleR', test_features, ref, ref_features, labels, markers, builder, nthreads)
}

get_subset <- function(built) {
.Call('_SingleR_get_subset', PACKAGE = 'SingleR', built)
get_ref_subset <- function(built) {
.Call('_SingleR_get_ref_subset', PACKAGE = 'SingleR', built)
}

is_valid_built <- function(built) {
.Call('_SingleR_is_valid_built', PACKAGE = 'SingleR', built)
}

#' @importFrom Rcpp sourceCpp
#' @useDynLib SingleR
run <- function(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
.Call('_SingleR_run', PACKAGE = 'SingleR', test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
}

37 changes: 14 additions & 23 deletions R/SingleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,33 +102,23 @@ SingleR <- function(
bpstart(BPPARAM)
on.exit(bpstop(BPPARAM))
}
test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)

# Converting to a common list format for ease of data munging.
if (single.ref <- !.is_list(ref)) {
ref <- list(ref)
}

ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type.ref,
check.missing=check.missing, msg="ref", BPPARAM=BPPARAM)
refnames <- Reduce(intersect, lapply(ref, rownames))
# We have to do all this row-subsetting at the start before trainSingleR,
# otherwise 'test.genes' won't match up to the filtered 'test'.
test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)

keep <- intersect(rownames(test), refnames)
if (length(keep) == 0) {
stop("no common genes between 'test' and 'ref'")
tmp.ref <- ref
if (!is.list(tmp.ref) || is.data.frame(tmp.ref)) {
tmp.ref <- list(ref)
}
if (!identical(keep, rownames(test))) {
test <- test[keep,]
}
for (i in seq_along(ref)) {
if (!identical(keep, rownames(ref[[i]]))) {
ref[[i]] <- ref[[i]][keep,,drop=FALSE]
for (rr in tmp.ref) {
keep <- rownames(test) %in% rownames(rr)
if (!all(keep)) {
test <- DelayedArray(test)[keep,,drop=FALSE] # only keeping the intersection, for safety's sake - see ?combineRecomputedResults.
}
}

# Converting back.
if (single.ref) {
ref <- ref[[1]]
if (nrow(test) == 0) {
stop("no common genes between 'test' and 'ref")
}

trained <- trainSingleR(
Expand All @@ -143,7 +133,8 @@ SingleR <- function(
aggr.args = aggr.args,
recompute=recompute,
restrict = restrict,
check.missing=FALSE,
test.genes=rownames(test),
check.missing=check.missing,
BNPARAM=BNPARAM,
num.threads = num.threads,
BPPARAM=BPPARAM
Expand Down
54 changes: 35 additions & 19 deletions R/classifySingleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@
#' Assign labels to each cell in a test dataset, using a pre-trained classifier combined with an iterative fine-tuning approach.
#'
#' @param test A numeric matrix of single-cell expression values where rows are genes and columns are cells.
#' Each row should be named with the gene name.
#'
#' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
#' @param trained A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function.
#' If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}.
#'
#' Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references.
#' @param quantile A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label.
#' @param fine.tune A logical scalar indicating whether fine-tuning should be performed.
#' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning.
#' @param sd.thresh Deprecated and ignored.
#' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
#' @param check.missing Logical scalar indicating whether rows should be checked for missing values.
#' If true and any missing values are found, the rows containing these values are silently removed.
#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
#' @param prune A logical scalar indicating whether label pruning should be performed.
#' @param num.threads Integer scalar specifying the number of threads to use for classification.
#' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}.
Expand Down Expand Up @@ -98,25 +100,30 @@ classifySingleR <- function(
sd.thresh=NULL,
prune=TRUE,
assay.type="logcounts",
check.missing=TRUE,
check.missing=FALSE,
num.threads = bpnworkers(BPPARAM),
BPPARAM=SerialParam())
{
test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM)
test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)

solo <- .is_solo(trained)
if (solo) {
trained <- list(trained)
}

results <- lapply(trained, FUN=.classify_internals,
test=test,
quantile=quantile,
fine.tune=fine.tune,
tune.thresh=tune.thresh,
prune=prune,
num.threads=num.threads
)
results <- vector("list", length(trained))
names(results) <- names(trained)
for (l in seq_along(results)) {
results[[l]] <- .classify_internals(
test=test,
trained=trained[[l]],
quantile=quantile,
fine.tune=fine.tune,
tune.thresh=tune.thresh,
prune=prune,
num.threads=num.threads
)
}

if (solo) {
results[[1]]
Expand All @@ -131,21 +138,30 @@ classifySingleR <- function(
}
}

#' @importFrom S4Vectors DataFrame metadata metadata<- I
.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
m <- match(trained$markers$unique, rownames(test))
if (anyNA(m)) {
stop("'rownames(test)' does not contain all genes used in 'trained'")
.check_test_genes <- function(test, trained) {
if (!is.null(trained$options$test.genes)) {
if (!identical(trained$options$test.genes, rownames(test))) {
stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'")
}
} else if (!identical(rownames(trained$ref), rownames(test))) {
stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'")
}
}

#' @importFrom S4Vectors DataFrame metadata metadata<- I
.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
.check_test_genes(test, trained)
trained <- rebuildIndex(trained, num.threads = num.threads)

parsed <- initializeCpp(test)
out <- run(parsed, m - 1L, trained$built,
out <- classify_single(
test = parsed,
prebuilt = trained$built,
quantile = quantile,
use_fine_tune = fine.tune,
fine_tune_threshold = tune.thresh,
nthreads = num.threads)
nthreads = num.threads
)

colnames(out$scores) <- trained$labels$unique
output <- DataFrame(
Expand Down
75 changes: 46 additions & 29 deletions R/combineRecomputedResults.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
#'
#' @param results A list of \linkS4class{DataFrame} prediction results as returned by \code{\link{classifySingleR}} when run on each reference separately.
#' @inheritParams SingleR
#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
#' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references,
#' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE},
#' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs.
#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.
#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.
#' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}.
#' @param allow.lost Deprecated.
#'
Expand All @@ -20,7 +21,7 @@
#' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
#' scores are not recomputed for the other labels.
#' \item \code{labels}, a character vector containing the per-cell combined label across references.
#' \item \code{references}, an integer vector specifying the reference from which the combined label was derived.
#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
#' \item \code{orig.results}, a DataFrame containing \code{results}.
#' }
#' It may also contain \code{pruned.labels} if these were also present in \code{results}.
Expand Down Expand Up @@ -100,66 +101,82 @@
#'
#' @export
#' @importFrom S4Vectors DataFrame metadata<-
#' @importFrom beachmat initializeCpp
combineRecomputedResults <- function(
results,
test,
trained,
quantile=0.8,
assay.type.test="logcounts",
check.missing=TRUE,
allow.lost=FALSE,
check.missing=FALSE,
warn.lost=TRUE,
allow.lost=FALSE,
num.threads = bpnworkers(BPPARAM),
BPPARAM=SerialParam())
{
all.names <- c(list(colnames(test)), lapply(results, rownames))
if (length(unique(all.names)) != 1) {
stop("cell/cluster names in 'results' are not identical")
}
all.nrow <- c(ncol(test), vapply(results, nrow, 0L))
if (length(unique(all.nrow)) != 1) {
stop("numbers of cells/clusters in 'results' are not identical")
test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)

# Applying the sanity checks.
stopifnot(length(results) == length(trained))
for (i in seq_along(results)) {
curres <- results[[i]]
if (ncol(test) != nrow(curres)) {
stop("numbers of cells/clusters in 'results' are not identical")
}
if (!identical(rownames(curres), colnames(test))) {
stop("cell/cluster names in 'results' are not identical")
}

curtrain <- trained[[i]]
if (!all(curres$labels %in% curtrain$labels$unique)) {
stop("not all labels in 'results' are present in 'trained'")
}
.check_test_genes(test, curtrain)
}

# Checking the marker consistency.
# Checking the genes.
all.refnames <- lapply(trained, function(x) rownames(x$ref))
intersected <- Reduce(intersect, all.refnames)
for (i in seq_along(trained)) {
if (!all(trained[[i]]$markers$unique %in% rownames(test))) {
stop("all markers stored in 'results' should be present in 'test'")
} else if (warn.lost && !all(trained[[i]]$markers$unique %in% intersected)) {
warning("entries of 'trained' differ in the universe of available markers")
if (warn.lost) {
intersected <- Reduce(intersect, all.refnames)
for (i in seq_along(trained)) {
if (!all(trained[[i]]$markers$unique %in% intersected)) {
warning("not all markers in 'trained' are available in each reference")
}
}
}

# Applying the integration.
universe <- Reduce(union, c(list(rownames(test)), all.refnames))
ibuilt <- integrate_build(
match(rownames(test), universe) - 1L,
lapply(trained, function(x) initializeCpp(x$ref)),
lapply(trained, function(x) match(rownames(x$ref), universe) - 1L),
lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
lapply(trained, function(x) x$built),
ibuilt <- train_integrated(
test_features=match(rownames(test), universe) - 1L,
references=lapply(trained, function(x) initializeCpp(x$ref)),
ref_ids=lapply(all.refnames, function(x) match(x, universe) - 1L),
labels=lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
prebuilt=lapply(trained, function(x) rebuildIndex(x)$built),
nthreads = num.threads
)

test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=check.missing, msg="test", BPPARAM=BPPARAM)
collated <- vector("list", length(trained))
for (i in seq_along(collated)) {
collated[[i]] <- match(results[[i]]$labels, trained[[i]]$labels$unique) - 1L
}

parsed <- initializeCpp(test)
irun <- integrate_run(parsed, collated, ibuilt, quantile = quantile, nthreads = num.threads)
scores <- irun$scores
irun <- classify_integrated(
test=parsed,
results=collated,
integrated_build=ibuilt,
quantile=quantile,
nthreads=num.threads
)

# Organizing the outputs.
base.scores <- vector("list", length(results))
for (r in seq_along(base.scores)) {
mat <- results[[r]]$scores
mat[] <- NA_real_
idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
mat[idx] <- scores[,r]
mat[idx] <- irun$scores[,r]
base.scores[[r]] <- mat
}

Expand Down
Loading
Loading