SingleR-inc · LTLA · Sep 6, 2024 · Sep 6, 2024 · Sep 6, 2024 · Sep 6, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: SingleR
 Title: Reference-Based Single-Cell RNA-Seq Annotation
-Version: 2.7.1
-Date: 2024-05-22
+Version: 2.7.2
+Date: 2024-09-06
 Authors@R: c(person("Dvir", "Aran", email="[email protected]", role=c("aut", "cph")),
     person("Aaron", "Lun", email="[email protected]", role=c("ctb", "cre")),
     person("Daniel", "Bunis", role="ctb"),
@@ -20,6 +20,7 @@ Imports:
     DelayedMatrixStats,
     BiocParallel,
     BiocSingular,
+    BiocNeighbors,
     stats,
     utils,
     Rcpp,
@@ -28,6 +29,7 @@ Imports:
 LinkingTo:
     Rcpp,
     beachmat,
+    assorthead,
     BiocNeighbors
 Suggests:
     testthat,
@@ -57,6 +59,6 @@ biocViews:
 SystemRequirements: C++17
 VignetteBuilder: knitr
 Encoding: UTF-8
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 URL: https://github.com/LTLA/SingleR
 BugReports: https://support.bioconductor.org/
diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,9 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(S4Vectors,DataFrame)
 importClassesFrom(S4Vectors,List)
 importClassesFrom(SummarizedExperiment,SummarizedExperiment)
+importFrom(BiocNeighbors,AnnoyParam)
+importFrom(BiocNeighbors,KmknnParam)
+importFrom(BiocNeighbors,defineBuilder)
 importFrom(BiocParallel,SerialParam)
 importFrom(BiocParallel,bpisup)
 importFrom(BiocParallel,bpmapply)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,6 +1,16 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+classify_integrated <- function(test, results, integrated_build, quantile, nthreads) {
+    .Call('_SingleR_classify_integrated', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
+}
+
+#' @importFrom Rcpp sourceCpp
+#' @useDynLib SingleR
+classify_single <- function(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
+    .Call('_SingleR_classify_single', PACKAGE = 'SingleR', test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
+}
+
 find_classic_markers <- function(nlabels, ngenes, labels, ref, de_n, nthreads) {
     .Call('_SingleR_find_classic_markers', PACKAGE = 'SingleR', nlabels, ngenes, labels, ref, de_n, nthreads)
 }
@@ -9,31 +19,21 @@ grouped_medians <- function(ref, groups, ngroups, nthreads) {
     .Call('_SingleR_grouped_medians', PACKAGE = 'SingleR', ref, groups, ngroups, nthreads)
 }
 
-integrate_build <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
-    .Call('_SingleR_integrate_build', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
-}
-
-integrate_run <- function(test, results, integrated_build, quantile, nthreads) {
-    .Call('_SingleR_integrate_run', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads)
+train_integrated <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) {
+    .Call('_SingleR_train_integrated', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads)
 }
 
 #' @importFrom Rcpp sourceCpp
 #' @useDynLib SingleR
-prebuild <- function(ref, labels, markers, approximate, nthreads) {
-    .Call('_SingleR_prebuild', PACKAGE = 'SingleR', ref, labels, markers, approximate, nthreads)
+train_single <- function(test_features, ref, ref_features, labels, markers, builder, nthreads) {
+    .Call('_SingleR_train_single', PACKAGE = 'SingleR', test_features, ref, ref_features, labels, markers, builder, nthreads)
 }
 
-get_subset <- function(built) {
-    .Call('_SingleR_get_subset', PACKAGE = 'SingleR', built)
+get_ref_subset <- function(built) {
+    .Call('_SingleR_get_ref_subset', PACKAGE = 'SingleR', built)
 }
 
 is_valid_built <- function(built) {
     .Call('_SingleR_is_valid_built', PACKAGE = 'SingleR', built)
 }
 
-#' @importFrom Rcpp sourceCpp
-#' @useDynLib SingleR
-run <- function(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) {
-    .Call('_SingleR_run', PACKAGE = 'SingleR', test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)
-}
-
diff --git a/R/SingleR.R b/R/SingleR.R
@@ -102,33 +102,23 @@ SingleR <- function(
         bpstart(BPPARAM)
         on.exit(bpstop(BPPARAM))
     }
-    test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)
-
-    # Converting to a common list format for ease of data munging.
-    if (single.ref <- !.is_list(ref)) {
-        ref <- list(ref)
-    }
 
-    ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type.ref, 
-        check.missing=check.missing, msg="ref", BPPARAM=BPPARAM)
-    refnames <- Reduce(intersect, lapply(ref, rownames))
+    # We have to do all this row-subsetting at the start before trainSingleR,
+    # otherwise 'test.genes' won't match up to the filtered 'test'.
+    test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM)
 
-    keep <- intersect(rownames(test), refnames)
-    if (length(keep) == 0) {
-        stop("no common genes between 'test' and 'ref'")
+    tmp.ref <- ref
+    if (!is.list(tmp.ref) || is.data.frame(tmp.ref)) {
+        tmp.ref <- list(ref)
     }
-    if (!identical(keep, rownames(test))) {
-        test <- test[keep,]
-    }
-    for (i in seq_along(ref)) {
-        if (!identical(keep, rownames(ref[[i]]))) {
-            ref[[i]] <- ref[[i]][keep,,drop=FALSE]
+    for (rr in tmp.ref) {
+        keep <- rownames(test) %in% rownames(rr)
+        if (!all(keep)) {
+            test <- DelayedArray(test)[keep,,drop=FALSE] # only keeping the intersection, for safety's sake - see ?combineRecomputedResults.
         }
     }
-
-    # Converting back.
-    if (single.ref) {
-        ref <- ref[[1]]
+    if (nrow(test) == 0) {
+        stop("no common genes between 'test' and 'ref")
     }
 
     trained <- trainSingleR(
@@ -143,7 +133,8 @@ SingleR <- function(
         aggr.args = aggr.args, 
         recompute=recompute,
         restrict = restrict, 
-        check.missing=FALSE, 
+        test.genes=rownames(test),
+        check.missing=check.missing, 
         BNPARAM=BNPARAM, 
         num.threads = num.threads, 
         BPPARAM=BPPARAM

diff --git a/R/classifySingleR.R b/R/classifySingleR.R
@@ -3,17 +3,19 @@
 #' Assign labels to each cell in a test dataset, using a pre-trained classifier combined with an iterative fine-tuning approach.
 #' 
 #' @param test A numeric matrix of single-cell expression values where rows are genes and columns are cells.
+#' Each row should be named with the gene name.
 #'
 #' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.
 #' @param trained A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function.
+#' If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}.
+#'
 #' Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references.
 #' @param quantile A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label.
 #' @param fine.tune A logical scalar indicating whether fine-tuning should be performed. 
 #' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning.
 #' @param sd.thresh Deprecated and ignored.
 #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.
-#' @param check.missing Logical scalar indicating whether rows should be checked for missing values.
-#' If true and any missing values are found, the rows containing these values are silently removed.
+#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
 #' @param prune A logical scalar indicating whether label pruning should be performed.
 #' @param num.threads Integer scalar specifying the number of threads to use for classification.
 #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}.
@@ -98,25 +100,30 @@ classifySingleR <- function(
     sd.thresh=NULL, 
     prune=TRUE, 
     assay.type="logcounts", 
-    check.missing=TRUE,
+    check.missing=FALSE,
     num.threads = bpnworkers(BPPARAM),
     BPPARAM=SerialParam()) 
 {
-    test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM)
+    test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)
 
     solo <- .is_solo(trained)
     if (solo) { 
         trained <- list(trained)
     }
 
-    results <- lapply(trained, FUN=.classify_internals, 
-        test=test, 
-        quantile=quantile, 
-        fine.tune=fine.tune, 
-        tune.thresh=tune.thresh, 
-        prune=prune, 
-        num.threads=num.threads
-    )
+    results <- vector("list", length(trained))
+    names(results) <- names(trained)
+    for (l in seq_along(results)) {
+        results[[l]] <- .classify_internals(
+            test=test, 
+            trained=trained[[l]],
+            quantile=quantile, 
+            fine.tune=fine.tune, 
+            tune.thresh=tune.thresh, 
+            prune=prune, 
+            num.threads=num.threads
+        )
+    }
 
     if (solo) {
         results[[1]]
@@ -131,21 +138,30 @@ classifySingleR <- function(
     } 
 }
 
-#' @importFrom S4Vectors DataFrame metadata metadata<- I
-.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
-    m <- match(trained$markers$unique, rownames(test))
-    if (anyNA(m)) {
-        stop("'rownames(test)' does not contain all genes used in 'trained'")
+.check_test_genes <- function(test, trained) {
+    if (!is.null(trained$options$test.genes)) {
+        if (!identical(trained$options$test.genes, rownames(test))) {
+            stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'")
+        }
+    } else if (!identical(rownames(trained$ref), rownames(test))) {
+        stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'")
     }
+}
 
+#' @importFrom S4Vectors DataFrame metadata metadata<- I
+.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) {
+    .check_test_genes(test, trained)
     trained <- rebuildIndex(trained, num.threads = num.threads)
 
     parsed <- initializeCpp(test)
-    out <- run(parsed, m - 1L, trained$built, 
+    out <- classify_single(
+        test = parsed, 
+        prebuilt = trained$built, 
         quantile = quantile, 
         use_fine_tune = fine.tune, 
         fine_tune_threshold = tune.thresh, 
-        nthreads = num.threads)
+        nthreads = num.threads
+    )
 
     colnames(out$scores) <- trained$labels$unique
     output <- DataFrame(

diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R
@@ -6,10 +6,11 @@
 #'
 #' @param results A list of \linkS4class{DataFrame} prediction results as returned by \code{\link{classifySingleR}} when run on each reference separately.
 #' @inheritParams SingleR
+#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.
 #' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references,
 #' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE},
 #' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs.
-#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.
+#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.
 #' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}.
 #' @param allow.lost Deprecated.
 #'
@@ -20,7 +21,7 @@
 #' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
 #' scores are not recomputed for the other labels.
 #' \item \code{labels}, a character vector containing the per-cell combined label across references.
-#' \item \code{references}, an integer vector specifying the reference from which the combined label was derived.
+#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
 #' \item \code{orig.results}, a DataFrame containing \code{results}.
 #' }
 #' It may also contain \code{pruned.labels} if these were also present in \code{results}.
@@ -100,66 +101,82 @@
 #'
 #' @export
 #' @importFrom S4Vectors DataFrame metadata<-
+#' @importFrom beachmat initializeCpp
 combineRecomputedResults <- function(
     results, 
     test, 
     trained, 
     quantile=0.8, 
     assay.type.test="logcounts", 
-    check.missing=TRUE, 
-    allow.lost=FALSE, 
+    check.missing=FALSE,
     warn.lost=TRUE,
+    allow.lost=FALSE, 
     num.threads = bpnworkers(BPPARAM),
     BPPARAM=SerialParam())
 {
-    all.names <- c(list(colnames(test)), lapply(results, rownames))
-    if (length(unique(all.names)) != 1) {
-        stop("cell/cluster names in 'results' are not identical")
-    }
-    all.nrow <- c(ncol(test), vapply(results, nrow, 0L))
-    if (length(unique(all.nrow)) != 1) {
-        stop("numbers of cells/clusters in 'results' are not identical")
+    test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=FALSE, msg="test", BPPARAM=BPPARAM)
+
+    # Applying the sanity checks.
+    stopifnot(length(results) == length(trained))
+    for (i in seq_along(results)) {
+        curres <- results[[i]]
+        if (ncol(test) != nrow(curres)) {
+            stop("numbers of cells/clusters in 'results' are not identical")
+        }
+        if (!identical(rownames(curres), colnames(test))) {
+            stop("cell/cluster names in 'results' are not identical")
+        }
+
+        curtrain <- trained[[i]]
+        if (!all(curres$labels %in% curtrain$labels$unique)) {
+            stop("not all labels in 'results' are present in 'trained'")
+        }
+        .check_test_genes(test, curtrain)
     }
 
-    # Checking the marker consistency.
+    # Checking the genes.
     all.refnames <- lapply(trained, function(x) rownames(x$ref))
-    intersected <- Reduce(intersect, all.refnames)
-    for (i in seq_along(trained)) {
-        if (!all(trained[[i]]$markers$unique %in% rownames(test))) {
-            stop("all markers stored in 'results' should be present in 'test'")
-        } else if (warn.lost && !all(trained[[i]]$markers$unique %in% intersected)) {
-            warning("entries of 'trained' differ in the universe of available markers")
+    if (warn.lost) {
+        intersected <- Reduce(intersect, all.refnames)
+        for (i in seq_along(trained)) {
+            if (!all(trained[[i]]$markers$unique %in% intersected)) {
+                warning("not all markers in 'trained' are available in each reference")
+            }
         }
     }
-    
+
     # Applying the integration.
     universe <- Reduce(union, c(list(rownames(test)), all.refnames))
-    ibuilt <- integrate_build(
-        match(rownames(test), universe) - 1L,
-        lapply(trained, function(x) initializeCpp(x$ref)),
-        lapply(trained, function(x) match(rownames(x$ref), universe) - 1L), 
-        lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
-        lapply(trained, function(x) x$built),
+    ibuilt <- train_integrated(
+        test_features=match(rownames(test), universe) - 1L,
+        references=lapply(trained, function(x) initializeCpp(x$ref)),
+        ref_ids=lapply(all.refnames, function(x) match(x, universe) - 1L), 
+        labels=lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L),
+        prebuilt=lapply(trained, function(x) rebuildIndex(x)$built),
         nthreads = num.threads
     )
 
-    test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=check.missing, msg="test", BPPARAM=BPPARAM)
     collated <- vector("list", length(trained))
     for (i in seq_along(collated)) {
         collated[[i]] <- match(results[[i]]$labels, trained[[i]]$labels$unique) - 1L
     }
 
     parsed <- initializeCpp(test)
-    irun <- integrate_run(parsed, collated, ibuilt, quantile = quantile, nthreads = num.threads) 
-    scores <- irun$scores
+    irun <- classify_integrated(
+        test=parsed,
+        results=collated,
+        integrated_build=ibuilt,
+        quantile=quantile,
+        nthreads=num.threads
+    ) 
 
     # Organizing the outputs.
     base.scores <- vector("list", length(results))
     for (r in seq_along(base.scores)) {
         mat <- results[[r]]$scores   
         mat[] <- NA_real_
         idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
-        mat[idx] <- scores[,r]
+        mat[idx] <- irun$scores[,r]
         base.scores[[r]] <- mat
     }