diff --git a/DESCRIPTION b/DESCRIPTION index 7e96163..ef584aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: SingleR Title: Reference-Based Single-Cell RNA-Seq Annotation -Version: 2.7.1 -Date: 2024-05-22 +Version: 2.7.2 +Date: 2024-09-06 Authors@R: c(person("Dvir", "Aran", email="dvir.aran@ucsf.edu", role=c("aut", "cph")), person("Aaron", "Lun", email="infinite.monkeys.with.keyboards@gmail.com", role=c("ctb", "cre")), person("Daniel", "Bunis", role="ctb"), @@ -20,6 +20,7 @@ Imports: DelayedMatrixStats, BiocParallel, BiocSingular, + BiocNeighbors, stats, utils, Rcpp, @@ -28,6 +29,7 @@ Imports: LinkingTo: Rcpp, beachmat, + assorthead, BiocNeighbors Suggests: testthat, @@ -57,6 +59,6 @@ biocViews: SystemRequirements: C++17 VignetteBuilder: knitr Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 URL: https://github.com/LTLA/SingleR BugReports: https://support.bioconductor.org/ diff --git a/NAMESPACE b/NAMESPACE index 91f111f..5cbaed2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,9 @@ importClassesFrom(Matrix,dgCMatrix) importClassesFrom(S4Vectors,DataFrame) importClassesFrom(S4Vectors,List) importClassesFrom(SummarizedExperiment,SummarizedExperiment) +importFrom(BiocNeighbors,AnnoyParam) +importFrom(BiocNeighbors,KmknnParam) +importFrom(BiocNeighbors,defineBuilder) importFrom(BiocParallel,SerialParam) importFrom(BiocParallel,bpisup) importFrom(BiocParallel,bpmapply) diff --git a/R/RcppExports.R b/R/RcppExports.R index 78fc7a3..d4c7890 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,16 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +classify_integrated <- function(test, results, integrated_build, quantile, nthreads) { + .Call('_SingleR_classify_integrated', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads) +} + +#' @importFrom Rcpp sourceCpp +#' @useDynLib SingleR +classify_single <- function(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) { + .Call('_SingleR_classify_single', PACKAGE = 'SingleR', test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) +} + find_classic_markers <- function(nlabels, ngenes, labels, ref, de_n, nthreads) { .Call('_SingleR_find_classic_markers', PACKAGE = 'SingleR', nlabels, ngenes, labels, ref, de_n, nthreads) } @@ -9,31 +19,21 @@ grouped_medians <- function(ref, groups, ngroups, nthreads) { .Call('_SingleR_grouped_medians', PACKAGE = 'SingleR', ref, groups, ngroups, nthreads) } -integrate_build <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) { - .Call('_SingleR_integrate_build', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads) -} - -integrate_run <- function(test, results, integrated_build, quantile, nthreads) { - .Call('_SingleR_integrate_run', PACKAGE = 'SingleR', test, results, integrated_build, quantile, nthreads) +train_integrated <- function(test_features, references, ref_ids, labels, prebuilt, nthreads) { + .Call('_SingleR_train_integrated', PACKAGE = 'SingleR', test_features, references, ref_ids, labels, prebuilt, nthreads) } #' @importFrom Rcpp sourceCpp #' @useDynLib SingleR -prebuild <- function(ref, labels, markers, approximate, nthreads) { - .Call('_SingleR_prebuild', PACKAGE = 'SingleR', ref, labels, markers, approximate, nthreads) +train_single <- function(test_features, ref, ref_features, labels, markers, builder, nthreads) { + .Call('_SingleR_train_single', PACKAGE = 'SingleR', test_features, ref, ref_features, labels, markers, builder, nthreads) } -get_subset <- function(built) { - .Call('_SingleR_get_subset', PACKAGE = 'SingleR', built) +get_ref_subset <- function(built) { + .Call('_SingleR_get_ref_subset', PACKAGE = 'SingleR', built) } is_valid_built <- function(built) { .Call('_SingleR_is_valid_built', PACKAGE = 'SingleR', built) } -#' @importFrom Rcpp sourceCpp -#' @useDynLib SingleR -run <- function(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) { - .Call('_SingleR_run', PACKAGE = 'SingleR', test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads) -} - diff --git a/R/SingleR.R b/R/SingleR.R index 6f6f4e7..a8bf931 100644 --- a/R/SingleR.R +++ b/R/SingleR.R @@ -102,33 +102,23 @@ SingleR <- function( bpstart(BPPARAM) on.exit(bpstop(BPPARAM)) } - test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM) - - # Converting to a common list format for ease of data munging. - if (single.ref <- !.is_list(ref)) { - ref <- list(ref) - } - ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type.ref, - check.missing=check.missing, msg="ref", BPPARAM=BPPARAM) - refnames <- Reduce(intersect, lapply(ref, rownames)) + # We have to do all this row-subsetting at the start before trainSingleR, + # otherwise 'test.genes' won't match up to the filtered 'test'. + test <- .to_clean_matrix(test, assay.type.test, check.missing, msg="test", BPPARAM=BPPARAM) - keep <- intersect(rownames(test), refnames) - if (length(keep) == 0) { - stop("no common genes between 'test' and 'ref'") + tmp.ref <- ref + if (!is.list(tmp.ref) || is.data.frame(tmp.ref)) { + tmp.ref <- list(ref) } - if (!identical(keep, rownames(test))) { - test <- test[keep,] - } - for (i in seq_along(ref)) { - if (!identical(keep, rownames(ref[[i]]))) { - ref[[i]] <- ref[[i]][keep,,drop=FALSE] + for (rr in tmp.ref) { + keep <- rownames(test) %in% rownames(rr) + if (!all(keep)) { + test <- DelayedArray(test)[keep,,drop=FALSE] # only keeping the intersection, for safety's sake - see ?combineRecomputedResults. } } - - # Converting back. - if (single.ref) { - ref <- ref[[1]] + if (nrow(test) == 0) { + stop("no common genes between 'test' and 'ref") } trained <- trainSingleR( @@ -143,7 +133,8 @@ SingleR <- function( aggr.args = aggr.args, recompute=recompute, restrict = restrict, - check.missing=FALSE, + test.genes=rownames(test), + check.missing=check.missing, BNPARAM=BNPARAM, num.threads = num.threads, BPPARAM=BPPARAM diff --git a/R/classifySingleR.R b/R/classifySingleR.R index 3a3b747..27e055d 100644 --- a/R/classifySingleR.R +++ b/R/classifySingleR.R @@ -3,17 +3,19 @@ #' Assign labels to each cell in a test dataset, using a pre-trained classifier combined with an iterative fine-tuning approach. #' #' @param test A numeric matrix of single-cell expression values where rows are genes and columns are cells. +#' Each row should be named with the gene name. #' #' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. #' @param trained A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function. +#' If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}. +#' #' Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references. #' @param quantile A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label. #' @param fine.tune A logical scalar indicating whether fine-tuning should be performed. #' @param tune.thresh A numeric scalar specifying the maximum difference from the maximum correlation to use in fine-tuning. #' @param sd.thresh Deprecated and ignored. #' @param assay.type Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}. -#' @param check.missing Logical scalar indicating whether rows should be checked for missing values. -#' If true and any missing values are found, the rows containing these values are silently removed. +#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}. #' @param prune A logical scalar indicating whether label pruning should be performed. #' @param num.threads Integer scalar specifying the number of threads to use for classification. #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying the parallelization scheme to use for \code{NA} scanning, when \code{check.missing=TRUE}. @@ -98,25 +100,30 @@ classifySingleR <- function( sd.thresh=NULL, prune=TRUE, assay.type="logcounts", - check.missing=TRUE, + check.missing=FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM=SerialParam()) { - test <- .to_clean_matrix(test, assay.type, check.missing, msg="test", BPPARAM=BPPARAM) + test <- .to_clean_matrix(test, assay.type, check.missing=FALSE, msg="test", BPPARAM=BPPARAM) solo <- .is_solo(trained) if (solo) { trained <- list(trained) } - results <- lapply(trained, FUN=.classify_internals, - test=test, - quantile=quantile, - fine.tune=fine.tune, - tune.thresh=tune.thresh, - prune=prune, - num.threads=num.threads - ) + results <- vector("list", length(trained)) + names(results) <- names(trained) + for (l in seq_along(results)) { + results[[l]] <- .classify_internals( + test=test, + trained=trained[[l]], + quantile=quantile, + fine.tune=fine.tune, + tune.thresh=tune.thresh, + prune=prune, + num.threads=num.threads + ) + } if (solo) { results[[1]] @@ -131,21 +138,30 @@ classifySingleR <- function( } } -#' @importFrom S4Vectors DataFrame metadata metadata<- I -.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) { - m <- match(trained$markers$unique, rownames(test)) - if (anyNA(m)) { - stop("'rownames(test)' does not contain all genes used in 'trained'") +.check_test_genes <- function(test, trained) { + if (!is.null(trained$options$test.genes)) { + if (!identical(trained$options$test.genes, rownames(test))) { + stop("expected 'rownames(test)' to be the same as 'test.genes' in 'trained'") + } + } else if (!identical(rownames(trained$ref), rownames(test))) { + stop("expected 'rownames(test)' to be the same as 'rownames(ref)' in 'trained'") } +} +#' @importFrom S4Vectors DataFrame metadata metadata<- I +.classify_internals <- function(test, trained, quantile, fine.tune, tune.thresh=0.05, prune=TRUE, num.threads=1) { + .check_test_genes(test, trained) trained <- rebuildIndex(trained, num.threads = num.threads) parsed <- initializeCpp(test) - out <- run(parsed, m - 1L, trained$built, + out <- classify_single( + test = parsed, + prebuilt = trained$built, quantile = quantile, use_fine_tune = fine.tune, fine_tune_threshold = tune.thresh, - nthreads = num.threads) + nthreads = num.threads + ) colnames(out$scores) <- trained$labels$unique output <- DataFrame( diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R index 773baa4..f9b1fe9 100644 --- a/R/combineRecomputedResults.R +++ b/R/combineRecomputedResults.R @@ -6,10 +6,11 @@ #' #' @param results A list of \linkS4class{DataFrame} prediction results as returned by \code{\link{classifySingleR}} when run on each reference separately. #' @inheritParams SingleR +#' @param check.missing Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}. #' @param trained A list of \linkS4class{List}s containing the trained outputs of multiple references, #' equivalent to either (i) the output of \code{\link{trainSingleR}} on multiple references with \code{recompute=TRUE}, #' or (ii) running \code{trainSingleR} on each reference separately and manually making a list of the trained outputs. -#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references. +#' @param warn.lost Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references. #' @param quantile Numeric scalar specifying the quantile of the correlation distribution to use for computing the score, see \code{\link{classifySingleR}}. #' @param allow.lost Deprecated. #' @@ -20,7 +21,7 @@ #' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference; #' scores are not recomputed for the other labels. #' \item \code{labels}, a character vector containing the per-cell combined label across references. -#' \item \code{references}, an integer vector specifying the reference from which the combined label was derived. +#' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived. #' \item \code{orig.results}, a DataFrame containing \code{results}. #' } #' It may also contain \code{pruned.labels} if these were also present in \code{results}. @@ -100,58 +101,74 @@ #' #' @export #' @importFrom S4Vectors DataFrame metadata<- +#' @importFrom beachmat initializeCpp combineRecomputedResults <- function( results, test, trained, quantile=0.8, assay.type.test="logcounts", - check.missing=TRUE, - allow.lost=FALSE, + check.missing=FALSE, warn.lost=TRUE, + allow.lost=FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM=SerialParam()) { - all.names <- c(list(colnames(test)), lapply(results, rownames)) - if (length(unique(all.names)) != 1) { - stop("cell/cluster names in 'results' are not identical") - } - all.nrow <- c(ncol(test), vapply(results, nrow, 0L)) - if (length(unique(all.nrow)) != 1) { - stop("numbers of cells/clusters in 'results' are not identical") + test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=FALSE, msg="test", BPPARAM=BPPARAM) + + # Applying the sanity checks. + stopifnot(length(results) == length(trained)) + for (i in seq_along(results)) { + curres <- results[[i]] + if (ncol(test) != nrow(curres)) { + stop("numbers of cells/clusters in 'results' are not identical") + } + if (!identical(rownames(curres), colnames(test))) { + stop("cell/cluster names in 'results' are not identical") + } + + curtrain <- trained[[i]] + if (!all(curres$labels %in% curtrain$labels$unique)) { + stop("not all labels in 'results' are present in 'trained'") + } + .check_test_genes(test, curtrain) } - # Checking the marker consistency. + # Checking the genes. all.refnames <- lapply(trained, function(x) rownames(x$ref)) - intersected <- Reduce(intersect, all.refnames) - for (i in seq_along(trained)) { - if (!all(trained[[i]]$markers$unique %in% rownames(test))) { - stop("all markers stored in 'results' should be present in 'test'") - } else if (warn.lost && !all(trained[[i]]$markers$unique %in% intersected)) { - warning("entries of 'trained' differ in the universe of available markers") + if (warn.lost) { + intersected <- Reduce(intersect, all.refnames) + for (i in seq_along(trained)) { + if (!all(trained[[i]]$markers$unique %in% intersected)) { + warning("not all markers in 'trained' are available in each reference") + } } } - + # Applying the integration. universe <- Reduce(union, c(list(rownames(test)), all.refnames)) - ibuilt <- integrate_build( - match(rownames(test), universe) - 1L, - lapply(trained, function(x) initializeCpp(x$ref)), - lapply(trained, function(x) match(rownames(x$ref), universe) - 1L), - lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L), - lapply(trained, function(x) x$built), + ibuilt <- train_integrated( + test_features=match(rownames(test), universe) - 1L, + references=lapply(trained, function(x) initializeCpp(x$ref)), + ref_ids=lapply(all.refnames, function(x) match(x, universe) - 1L), + labels=lapply(trained, function(x) match(x$labels$full, x$labels$unique) - 1L), + prebuilt=lapply(trained, function(x) rebuildIndex(x)$built), nthreads = num.threads ) - test <- .to_clean_matrix(test, assay.type=assay.type.test, check.missing=check.missing, msg="test", BPPARAM=BPPARAM) collated <- vector("list", length(trained)) for (i in seq_along(collated)) { collated[[i]] <- match(results[[i]]$labels, trained[[i]]$labels$unique) - 1L } parsed <- initializeCpp(test) - irun <- integrate_run(parsed, collated, ibuilt, quantile = quantile, nthreads = num.threads) - scores <- irun$scores + irun <- classify_integrated( + test=parsed, + results=collated, + integrated_build=ibuilt, + quantile=quantile, + nthreads=num.threads + ) # Organizing the outputs. base.scores <- vector("list", length(results)) @@ -159,7 +176,7 @@ combineRecomputedResults <- function( mat <- results[[r]]$scores mat[] <- NA_real_ idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L) - mat[idx] <- scores[,r] + mat[idx] <- irun$scores[,r] base.scores[[r]] <- mat } diff --git a/R/getClassicMarkers.R b/R/getClassicMarkers.R index 5ab6c93..4a95dee 100644 --- a/R/getClassicMarkers.R +++ b/R/getClassicMarkers.R @@ -64,6 +64,10 @@ getClassicMarkers <- function(ref, labels, assay.type="logcounts", check.missing labels <- list(labels) } + for (i in seq_along(ref)) { + ref[[i]] <- .to_clean_matrix(ref[[i]], assay.type, check.missing, msg="ref", BPPARAM=BPPARAM) + } + # Setting up references. common <- Reduce(intersect, lapply(ref, rownames)) if (length(common)==0L && any(vapply(ref, nrow, 0L) > 0L)) { @@ -73,9 +77,7 @@ getClassicMarkers <- function(ref, labels, assay.type="logcounts", check.missing for (i in seq_along(ref)) { current <- ref[[i]][common,,drop=FALSE] - current <- .to_clean_matrix(current, assay.type, check.missing, msg="ref", BPPARAM=BPPARAM) curptr <- initializeCpp(current) - flabels <- factor(labels[[i]]) gm <- grouped_medians(curptr, as.integer(flabels) - 1L, nlevels(flabels), nthreads = num.threads) colnames(gm) <- levels(flabels) diff --git a/R/rebuildIndices.R b/R/rebuildIndices.R index 87566ab..72e3261 100644 --- a/R/rebuildIndices.R +++ b/R/rebuildIndices.R @@ -46,7 +46,8 @@ rebuildIndex <- function(trained, num.threads=1) { markers=trained$markers$full, labels=trained$labels$full, ulabels=trained$labels$unique, - approximate=trained$options$approximate, + BNPARAM=trained$options$BNPARAM, + test.genes=trained$options$test.genes, num.threads=num.threads) } trained diff --git a/R/trainSingleR.R b/R/trainSingleR.R index fe56ec5..e74ea41 100644 --- a/R/trainSingleR.R +++ b/R/trainSingleR.R @@ -8,12 +8,11 @@ #' #' Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. #' -#' Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references, -#' in which case the row names are expected to be the same across all objects. +#' Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references. #' @param labels A character vector or factor of known labels for all samples in \code{ref}. #' #' Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -#' Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}. +#' Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}. #' @param genes A string containing \code{"de"}, indicating that markers should be calculated from \code{ref}. #' For back compatibility, other string values are allowed but will be ignored with a deprecation warning. #' @@ -45,13 +44,15 @@ #' if \code{ref} is a \linkS4class{SummarizedExperiment} object (or is a list that contains one or more such objects). #' @param check.missing Logical scalar indicating whether rows should be checked for missing values. #' If true and any missing values are found, the rows containing these values are silently removed. -#' @param BNPARAM Deprecated and ignored. -#' @param approximate Logical scalar indicating whether a faster approximate method should be used to compute the quantile. +#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying how the neighbor search index should be constructed. +#' @param approximate Deprecated, use \code{BNPARAM} instead. #' @param num.threads Integer scalar specifying the number of threads to use for index building. #' @param BPPARAM A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed. #' Relevant for marker detection if \code{genes = NULL}, aggregation if \code{aggr.ref = TRUE}, and \code{NA} checking if \code{check.missing = TRUE}. #' @param restrict A character vector of gene names to use for marker selection. #' By default, all genes in \code{ref} are used. +#' @param test.genes Character vector of the names of the genes in the test dataset, i.e., the row names of \code{test} in \code{\link{classifySingleR}}. +#' If \code{NULL}, it is assumed that the test dataset and \code{ref} have the same genes in the same row order. #' #' @return #' For a single reference, a \linkS4class{List} is returned containing: @@ -78,9 +79,15 @@ #' For each label, the top \code{de.n} genes with the largest differences compared to another label are chosen as markers to distinguish the two labels. #' The expression values are expected to be log-transformed and normalized. #' -#' If \code{restrict} is specified, \code{ref} is subsetted to only include the rows with names that are in \code{restrict}. -#' Marker selection and all subsequent classification will be performed using this restrictive subset of genes. -#' This can be convenient for ensuring that only appropriate genes are used (e.g., not pseudogenes or predicted genes). +#' Classification with \code{classifySingleR} assumes that the test dataset contains all marker genes that were detected from the reference. +#' If the test and reference datasets do not have the same genes in the same order, we can set \code{test.genes} to the row names of the test dataset. +#' This will instruct \code{trainSingleR} to only consider markers that are present in the test dataset. +#' Any subsequent call to \code{classifySingleR} will also check that \code{test.genes} is consistent with \code{rownames(test)}. +#' +#' On a similar note, if \code{restrict} is specified, marker selection will only be performed using the specified subset of genes. +#' This can be convenient for ignoring inappropriate genes like pseudogenes or predicted genes. +#' It has the same effect as filtering out undesirable rows from \code{ref} prior to calling \code{trainSingleR}. +#' Unlike \code{test.genes}, setting \code{restrict} does not introduce further checks on \code{rownames(test)} in \code{classifySingleR}. #' #' @section Custom feature specification: #' Rather than relying on the in-built feature selection, users can pass in their own features of interest to \code{genes}. @@ -164,11 +171,15 @@ #' #' @export #' @importFrom S4Vectors List isSingleString metadata metadata<- +#' @importFrom BiocNeighbors defineBuilder AnnoyParam KmknnParam #' @importFrom BiocParallel SerialParam bpisup bpstart bpstop #' @importFrom beachmat initializeCpp +#' @importFrom S4Vectors List +#' @importFrom SummarizedExperiment assay trainSingleR <- function( ref, labels, + test.genes=NULL, genes="de", sd.thresh=NULL, de.method=c("classic", "wilcox", "t"), @@ -195,70 +206,98 @@ trainSingleR <- function( } } - if (!bpisup(BPPARAM) && !is(BPPARAM, "MulticoreParam")) { - bpstart(BPPARAM) - on.exit(bpstop(BPPARAM)) - } - - ref <- lapply(ref, FUN=.to_clean_matrix, assay.type=assay.type, - check.missing=check.missing, msg="ref", BPPARAM=BPPARAM) - - # Cleaning the genes. - gns <- lapply(ref, rownames) - if (length(unique(gns))!=1L) { - stop("row names are not identical across references") - } - - if (!is.null(restrict)) { - keep <- gns[[1]] %in% restrict - ref <- lapply(ref, FUN="[", i=keep, , drop=FALSE) - } - if (isSingleString(genes)) { genes <- rep(genes, length(ref)) } else if (length(genes)!=length(ref)) { stop("list-like 'genes' should be the same length as 'ref'") } - # Cleaning the labels. - labels <- lapply(labels, as.character) - if (length(labels)!=length(ref)) { - stop("lists in 'labels' and 'ref' should be of the same length") + if (is.null(BNPARAM)) { + if (approximate) { + BNPARAM <- AnnoyParam() + } else { + BNPARAM <- KmknnParam() + } } - for (l in seq_along(labels)) { - keep <- !is.na(labels[[l]]) + if (!bpisup(BPPARAM) && !is(BPPARAM, "MulticoreParam")) { + bpstart(BPPARAM) + on.exit(bpstop(BPPARAM)) + } + + output <- vector("list", length(ref)) + names(output) <- names(ref) + for (l in seq_along(ref)) { + curref <- .to_clean_matrix(ref[[l]], assay.type, check.missing, msg="ref", BPPARAM=BPPARAM) + + curlabels <- as.character(labels[[l]]) + stopifnot(length(curlabels) == ncol(curref)) + keep <- !is.na(curlabels) if (!all(keep)) { - labels[[l]] <- labels[[l]][keep] - ref[[l]] <- ref[[l]][,keep,drop=FALSE] + curref <- curref[,keep,drop=FALSE] + curlabels <- curlabels[keep] } - } - gene.info <- mapply(FUN=.identify_genes, ref=ref, labels=labels, genes=genes, - MoreArgs=list(de.method=de.method, de.n=de.n, de.args=de.args, BPPARAM=BPPARAM), - SIMPLIFY=FALSE) + markers <- .identify_genes( + ref=curref, + labels=curlabels, + genes=genes[[l]], + de.method=de.method, + de.n=de.n, + de.args=de.args, + restrict=restrict, + test.genes=test.genes, + BPPARAM=BPPARAM + ) + + if (aggr.ref) { + aggr <- do.call(aggregateReference, c(list(ref=quote(curref), label=curlabels, check.missing=FALSE, BPPARAM=BPPARAM), aggr.args)) + curref <- assay(aggr) + curlabels <- aggr$label + } - if (!solo && !recompute) { - .Deprecated(old="recompute = FALSE") + ulabels <- .get_levels(curlabels) + built <- .build_index( + ref=curref, + labels=curlabels, + ulabels=ulabels, + test.genes=test.genes, + markers=markers, + BNPARAM=BNPARAM, + num.threads=num.threads + ) + + output[[l]] <- List( + built = built, + ref = curref, + labels = list(full = curlabels, unique = ulabels), + markers = list(full = markers, unique = rownames(curref)[get_ref_subset(built) + 1]), + options = list(BNPARAM = BNPARAM, test.genes = test.genes) + ) } - output <- mapply(FUN=.build_trained_index, ref=ref, labels=labels, markers=gene.info, - MoreArgs = list(aggr.ref=aggr.ref, aggr.args=aggr.args, BPPARAM=BPPARAM, approximate = approximate, num.threads = num.threads), - SIMPLIFY=FALSE) if (solo) { output[[1]] } else { - final <- List(output) - metadata(final)$recompute <- recompute - final + List(output) } } -.identify_genes <- function(ref, labels, genes="de", de.method="classic", de.n=NULL, de.args=list(), BPPARAM=BPPARAM) { +.identify_genes <- function(ref, labels, genes, de.method, de.n, test.genes, restrict, de.args, BPPARAM) { if (length(labels)!=ncol(ref)) { stop("number of labels must be equal to number of cells") } + # Note that the genes are reported as names rather than indexing, so these + # row-subsetting operations don't require re-indexing of the output. We use + # DelayedArrays to avoid making copies of the data. + if (!is.null(restrict)) { + ref <- DelayedArray(ref)[rownames(ref) %in% restrict,,drop=FALSE] + } + if (!is.null(test.genes)) { + ref <- DelayedArray(ref)[rownames(ref) %in% test.genes,,drop=FALSE] + } + if (.is_list(genes)) { is.char <- vapply(genes, is.character, TRUE) if (all(is.char)) { @@ -284,36 +323,8 @@ trainSingleR <- function( genes } -#' @importFrom S4Vectors List -#' @importFrom SummarizedExperiment assay -.build_trained_index <- function(ref, labels, markers, aggr.ref, aggr.args, search.info, approximate = FALSE, BPPARAM = SerialParam(), num.threads = 1) { - if (aggr.ref) { - aggr <- do.call(aggregateReference, c(list(ref=quote(ref), label=labels, check.missing=FALSE, BPPARAM=BPPARAM), aggr.args)) - ref <- assay(aggr) - labels <- aggr$label - } - - if (anyNA(labels)) { - keep <- !is.na(labels) - ref <- ref[,keep,drop=FALSE] - labels <- labels[keep] - } - - ulabels <- .get_levels(labels) - - built <- .build_index(ref, markers=markers, labels=labels, ulabels=ulabels, approximate=approximate, num.threads=num.threads) - - List( - built = built, - ref = ref, - labels = list(full = labels, unique = ulabels), - markers = list(full = markers, unique = rownames(ref)[get_subset(built) + 1]), - options = list(approximate = approximate) - ) -} - #' @importFrom beachmat initializeCpp -.build_index <- function(ref, markers, labels, ulabels, approximate, num.threads) { +.build_index <- function(ref, labels, ulabels, markers, test.genes, BNPARAM, num.threads) { for (m in seq_along(markers)) { current <- markers[[m]] for (n in seq_along(current)) { @@ -326,8 +337,25 @@ trainSingleR <- function( markers[[m]] <- current } + if (is.null(test.genes)) { + test.genes <- ref.genes <- seq_len(nrow(ref)) + } else { + universe <- union(test.genes, rownames(ref)) + test.genes <- match(test.genes, universe) + ref.genes <- match(rownames(ref), universe) + } + + builder <- defineBuilder(BNPARAM) parsed <- initializeCpp(ref) - prebuild(parsed, match(labels, ulabels) - 1L, markers, approximate = approximate, nthreads = num.threads) + train_single( + test_features=test.genes - 1L, + ref=parsed, + ref_features=ref.genes - 1L, + labels=match(labels, ulabels) - 1L, + markers=markers, + builder=builder, + nthreads=num.threads + ) } .get_levels <- function(labels) sort(unique(labels)) diff --git a/R/utils.R b/R/utils.R index 39ea03d..80bfdec 100644 --- a/R/utils.R +++ b/R/utils.R @@ -23,10 +23,12 @@ old <- getAutoBPPARAM() setAutoBPPARAM(BPPARAM) on.exit(setAutoBPPARAM(old)) - discard <- rowAnyNAs(DelayedArray(x)) + y <- DelayedArray(x) + discard <- rowAnyNAs(y) if (any(discard)) { - x <- x[!discard,,drop=FALSE] + # Returning a DelayedArray to avoid making an actual subset. + x <- y[!discard,,drop=FALSE] } } diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index e6bc6dc..264922e 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -2,6 +2,18 @@ \title{SingleR News} \encoding{UTF-8} +\section{Version 2.8.0}{\itemize{ +\item Added the \code{test.genes=} argument to \code{trainSingleR()}, to restrict marker detection to only those genes in the test dataset. +This is also checked against \code{rownames(test)} in \code{classifySingleR()} to ensure that the test's feature space is consistent with the space used during training. + +\item Restored the \code{BNPARAM=} argument in \code{trainSingleR()}, to enable more fine-grained specification of neighbor search algorithms. +The \code{approximate=} argument is deprecated. + +\item Soft-deprecated \code{check.missing=} in \code{classifySingleR()} and \code{combineRecomputedResults()}. +This is because any filtering will cause a mismatch between the row names of \code{tests} and the \code{test.genes} in \code{trained}. +Rather, filtering should be done prior to \code{trainSingleR()}, as is done in the main \code{SingleR()} function. +}} + \section{Version 2.0.0}{\itemize{ \item The format of the output of \code{trainSingleR()} has changed and is no longer back-compatible. \item \code{recompute=FALSE} in \code{trainSingleR()} does nothing; all integrated analyses are now done with \code{recompute=TRUE}. diff --git a/inst/include/singlepp/BasicBuilder.hpp b/inst/include/singlepp/BasicBuilder.hpp deleted file mode 100644 index 111f9bf..0000000 --- a/inst/include/singlepp/BasicBuilder.hpp +++ /dev/null @@ -1,317 +0,0 @@ -#ifndef SINGLEPP_BASIC_BUILDER_HPP -#define SINGLEPP_BASIC_BUILDER_HPP - -#include "macros.hpp" - -#include "knncolle/knncolle.hpp" -#include "tatami/tatami.hpp" - -#include "build_indices.hpp" -#include "process_features.hpp" - -#include - -/** - * @file BasicBuilder.hpp - * - * @brief Defines the `BasicBuilder` class. - */ - -namespace singlepp { - -/** - * @brief Construct a single pre-built reference for further classification. - * - * This class prepares a single labelled reference dataset for classification of a test dataset. - * We select the top marker genes to use for each pairwise comparison between labels, - * and we use them to pre-compute the ranks in advance of computing the Spearman correlations. - * The pre-built reference can then be used in `BasicScorer::run()`. - */ -class BasicBuilder { -public: - /** - * @brief Default parameters for reference building. - */ - struct Defaults { - /** - * See `set_top()` for details. - */ - static constexpr int top = -1; - - /** - * See `set_approximate()` for details. - */ - static constexpr bool approximate = false; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - -private: - int top = Defaults::top; - bool approximate = Defaults::approximate; - int nthreads = Defaults::num_threads; - -public: - /** - * @param t Number of top markers to use from each pairwise comparison between labels, see `Classifier::set_top()`. - * - * @return A reference to this `BasicBuilder` object. - */ - BasicBuilder& set_top(int t = Defaults::top) { - top = t; - return *this; - } - - /** - * @param a Whether to use an approximate method to quickly find the quantile, see `Classifier::set_approximate()`. - * - * @return A reference to this `BasicBuilder` object. - */ - BasicBuilder& set_approximate(bool a = Defaults::approximate) { - approximate = a; - return *this; - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `BasicBuilder` object. - */ - BasicBuilder& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - std::vector build_internal(const tatami::Matrix* ref, const int* labels, const std::vector& subset) const { - std::vector subref; - if (approximate) { - subref = build_indices( - ref, - labels, - subset, - [](size_t nr, size_t nc, const double* ptr) { - return std::shared_ptr >(new knncolle::AnnoyEuclidean(nr, nc, ptr)); - }, - nthreads - ); - } else { - subref = build_indices( - ref, - labels, - subset, - [](size_t nr, size_t nc, const double* ptr) { - return std::shared_ptr >(new knncolle::KmknnEuclidean(nr, nc, ptr)); - }, - nthreads - ); - } - return subref; - } - -public: - /** - * @brief Prebuilt reference that can be directly used for annotation. - */ - struct Prebuilt { - /** - * @cond - */ - Prebuilt(Markers m, std::vector s, std::vector r) : - markers(std::move(m)), subset(std::move(s)), references(std::move(r)) {} - /** - * @endcond - */ - - /** - * A vector of vectors of ranked marker genes to be used in the classification. - * Values are indices into the `subset` vector. - * The set of marker genes is typically a subset of those in the input `markers` in `build()`. - */ - Markers markers; - - /** - * The subset of features in the test/reference datasets that were used in the classification. - * Values are row indices into the relevant matrices. - */ - std::vector subset; - - /** - * @return Number of labels in this reference. - */ - size_t num_labels() const { - return references.size(); - } - - /** - * @return Number of profiles in this reference. - */ - size_t num_profiles() const { - size_t n = 0; - for (const auto& ref : references) { - n += ref.ranked.size(); - } - return n; - } - - /** - * @cond - */ - std::vector references; - /** - * @endcond - */ - }; - - /** - * @param ref Matrix for the reference expression profiles. - * Rows are genes while columns are samples. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `Prebuilt` instance that can be used in `run()` for annotation of a test dataset. - */ - Prebuilt run(const tatami::Matrix* ref, const int* labels, Markers markers) const { - auto subset = subset_markers(markers, top); - auto subref = build_internal(ref, labels, subset); - return Prebuilt(std::move(markers), std::move(subset), std::move(subref)); - } - -public: - /** - * @brief Prebuilt reference requiring an intersection of features. - */ - struct PrebuiltIntersection { - /** - * @cond - */ - PrebuiltIntersection(Markers m, std::vector mats, std::vector refs, std::vector r) : - markers(std::move(m)), mat_subset(std::move(mats)), ref_subset(std::move(refs)), references(std::move(r)) {} - /** - * @endcond - */ - - /** - * A vector of vectors of ranked marker genes to be used in the classification. - * Values are indices into the `mat_subset` and `ref_subset` vectors for the respective matrices. - * The set of marker genes is typically a subset of those in the input `markers` in `build()`. - */ - Markers markers; - - /** - * Row indices of test dataset, specifying the features in the intersection. - * This has the same length as `ref_subset`, where corresponding entries refer to the same features in the respective datasets. - */ - std::vector mat_subset; - - /** - * Row indices of reference dataset, specifying the features in the intersection. - * This has the same length as `mat_subset`, where corresponding entries refer to the same features in the respective datasets. - */ - std::vector ref_subset; - - /** - * @return Number of labels in this reference. - */ - size_t num_labels() const { - return references.size(); - } - - /** - * @return Number of profiles in this reference. - */ - size_t num_profiles() const { - size_t n = 0; - for (const auto& ref : references) { - n += ref.ranked.size(); - } - return n; - } - - /** - * @cond - */ - std::vector references; - /** - * @endcond - */ - }; - - /** - * @param intersection Vector defining the intersection of features betweent the test and reference datasets. - * Each entry is a pair where the first element is the row index in the test matrix, - * and the second element is the row index for the corresponding feature in the reference matrix. - * Each row index for either matrix should occur no more than once in `intersection`. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `PrebuiltIntersection` instance that can be used in `run()` for annotation of a test dataset with the same order of genes as specified in `mat_id`. - * - * This method deals with the case where the genes are not in the same order and number across the test and reference datasets. - * It finds the intersection of genes and then prepares the references accordingly. - */ - PrebuiltIntersection run( - std::vector > intersection, - const tatami::Matrix* ref, - const int* labels, - Markers markers) - const { - // Sorting it if it wasn't already. - for (size_t i = 1, end = intersection.size(); i < end; ++i) { - if (intersection[i] < intersection[i-1]) { - std::sort(intersection.begin(), intersection.end()); - break; - } - } - - subset_markers(intersection, markers, top); - auto pairs = unzip(intersection); - auto subref = build_internal(ref, labels, pairs.second); - return PrebuiltIntersection(std::move(markers), std::move(pairs.first), std::move(pairs.second), std::move(subref)); - } - - /** - * @tparam Id Gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `PrebuiltIntersection` instance that can be used in `run()` for annotation of a test dataset with the same order of genes as specified in `mat_id`. - * - * This method deals with the case where the genes are not in the same order and number across the test and reference datasets. - * It finds the intersection of genes and then prepares the references accordingly. - */ - template - PrebuiltIntersection run( - size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - Markers markers) - const { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - return run(std::move(intersection), ref, labels, std::move(markers)); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/BasicScorer.hpp b/inst/include/singlepp/BasicScorer.hpp deleted file mode 100644 index af4db05..0000000 --- a/inst/include/singlepp/BasicScorer.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#ifndef SINGLEPP_BASIC_SCORER_HPP -#define SINGLEPP_BASIC_SCORER_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "annotate_cells.hpp" -#include "process_features.hpp" -#include "BasicBuilder.hpp" - -#include -#include - -/** - * @file BasicScorer.hpp - * - * @brief Defines the `BasicScorer` class. - */ - -namespace singlepp { - -/** - * @brief Classify cells from a single pre-built reference dataset. - * - * This class uses the pre-built reference from `BasicBuilder` to classify each cell in a test dataset. - * The algorithm and parameters are the same as described for the `Classifier` class; - * in fact, `Classifier` just calls `BasicBuilder::run()` and then `BasicScorer::run()`. - * - * It is occasionally useful to call these two functions separately if the same reference dataset is to be used multiple times, - * e.g., on different test datasets or with different parameters. - * In such cases, we can save time by avoiding redundant builds; - * we just have to call `BasicScorer::run()` in all subsequent uses of the pre-built reference. - */ -class BasicScorer { -public: - /** - * @brief Default parameters for classification. - */ - struct Defaults { - /** - * See `set_quantile()` for details. - */ - static constexpr double quantile = 0.8; - - /** - * See `set_fine_tune_threshold()` for details. - */ - static constexpr double fine_tune_threshold = 0.05; - - /** - * See `set_fine_tune()` for details. - */ - static constexpr bool fine_tune = true; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - -private: - double quantile = Defaults::quantile; - double fine_tune_threshold = Defaults::fine_tune_threshold; - bool fine_tune = Defaults::fine_tune; - int nthreads = Defaults::num_threads; - -public: - /** - * @param q Quantile to use to compute a per-label score from the correlations, see `Classifier::set_quantile()`. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_quantile(double q = Defaults::quantile) { - quantile = q; - return *this; - } - - /** - * @param t Threshold to use to select the top-scoring subset of labels during fine-tuning, see `Classifier::set_fine_tune_threshold()`. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_fine_tune_threshold(double t = Defaults::fine_tune_threshold) { - fine_tune_threshold = t; - return *this; - } - - /** - * @param f Whether to perform fine-tuning, see `Classifier::set_fine_tune()`. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_fine_tune(bool f = Defaults::fine_tune) { - fine_tune = f; - return *this; - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `BasicScorer` object. - */ - BasicScorer& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * This may have a different ordering of genes compared to the reference matrix used to create `built`, - * provided that all genes corresponding to `built.subset` are present. - * @param built An object produced by `BasicBuilder::build()`. - * @param[in] mat_subset Pointer to an array of length equal to that of `built.subset`, - * containing the index of the row of `mat` corresponding to each gene in `built.subset`. - * That is, row `mat_subset[i]` in `mat` should be the same gene as row `built.subset[i]` in the reference matrix. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers to arrays of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for each label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built, const int* mat_subset, int* best, std::vector& scores, double* delta) const { - annotate_cells_simple( - mat, - built.subset.size(), - mat_subset, - built.references, - built.markers, - quantile, - fine_tune, - fine_tune_threshold, - best, - scores, - delta, - nthreads - ); - return; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * This should have the same order and identity of genes as the reference matrix used to create `built`. - * @param built An object produced by `BasicBuilder::build()`. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers to arrays of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for each label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built, int* best, std::vector& scores, double* delta) const { - run(mat, built, built.subset.data(), best, scores, delta); - return; - } - -public: - /** - * @brief Automated classification results. - */ - struct Results { - /** - * @cond - */ - Results(size_t ncells, size_t nlabels) : best(ncells), scores(nlabels, std::vector(ncells)), delta(ncells) {} - - std::vector scores_to_pointers() { - std::vector output(scores.size()); - for (size_t s = 0; s < scores.size(); ++s) { - output[s] = scores[s].data(); - } - return output; - }; - /** - * @endcond - */ - - /** - * Vector of length equal to the number of cells in the test dataset, - * containing the index of the assigned label for each cell. - */ - std::vector best; - - /** - * Vector of length equal to the number of labels, - * containing vectors of length equal to the number of cells in the test dataset. - * Each vector corresponds to a label and contains the (non-fine-tuned) score for each cell. - */ - std::vector > scores; - - /** - * Vector of length equal to the number of cells in the test dataset. - * This contains the difference between the highest and second-highest scores for each cell, possibly after fine-tuning. - */ - std::vector delta; - }; - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * Each row should correspond to an element of `Prebuilt::subset`. - * @param built An object produced by `BasicBuilder::build()`. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built) const { - size_t nlabels = built.references.size(); - Results output(mat->ncol(), nlabels); - auto scores = output.scores_to_pointers(); - run(mat, built, output.best.data(), scores, output.delta.data()); - return output; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * This may have a different ordering of genes compared to the reference matrix used in `build()`, - * provided that all genes corresponding to `Prebuilt::subset` are present. - * @param built An object produced by `BasicBuilder::build()`. - * @param[in] mat_subset Pointer to an array of length equal to that of `Prebuilt::subset`, - * containing the index of the row of `mat` corresponding to each gene in `Prebuilt::subset`. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const BasicBuilder::Prebuilt& built, const int* mat_subset) const { - size_t nlabels = built.references.size(); - Results output(mat->ncol(), nlabels); - auto scores = output.scores_to_pointers(); - run(mat, built, mat_subset, output.best.data(), scores, output.delta.data()); - return output; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param built An object produced by `build()` with intersections. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers to arrays of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for each label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, tkkhis is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run( - const tatami::Matrix* mat, - const BasicBuilder::PrebuiltIntersection& built, - int* best, - std::vector& scores, - double* delta) - const { - annotate_cells_simple(mat, - built.mat_subset.size(), - built.mat_subset.data(), - built.references, - built.markers, - quantile, - fine_tune, - fine_tune_threshold, - best, - scores, - delta, - nthreads - ); - return; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param built An object produced by `build()` with intersections. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const BasicBuilder::PrebuiltIntersection& built) const { - size_t nlabels = built.references.size(); - Results output(mat->ncol(), nlabels); - auto scores = output.scores_to_pointers(); - - run(mat, built, output.best.data(), scores, output.delta.data()); - return output; - } -}; - -} - - -#endif diff --git a/inst/include/singlepp/ChooseClassicMarkers.hpp b/inst/include/singlepp/ChooseClassicMarkers.hpp deleted file mode 100644 index c84feea..0000000 --- a/inst/include/singlepp/ChooseClassicMarkers.hpp +++ /dev/null @@ -1,292 +0,0 @@ -#ifndef SINGLEPP_CHOOSE_CLASSIC_MARKERS_HPP -#define SINGLEPP_CHOOSE_CLASSIC_MARKERS_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "Markers.hpp" - -#include -#include -#include - -/** - * @file ChooseClassicMarkers.hpp - * - * @brief Classic method for choosing markers. - */ - -namespace singlepp { - -/** - * @brief Classic method for choosing markers. - * - * This class implements the classic **SingleR** method for choosing markers from (typically bulk) reference dataasets. - * We assume that we have a matrix of representative log-expression profiles for each label, typically computed by taking some average across all reference profiles for that label. - * For the comparison between labels A and B, we define the marker set as the top genes with the largest positive differences in A's profile over B (i.e., the log-fold change). - * The number of top genes can either be explicitly specified or it can be automatically determined from the number of labels. - * - * If multiple references are present, we compute the sum of the log-fold changes across all references with both labels A and B. - * The ordering of the top genes is then performed using the sum across references. - * It is assumed that all references have the same number and ordering of features in their rows. - */ -class ChooseClassicMarkers { -public: - /** - * @brief Default parameter settings. - */ - struct Defaults { - /** - * See `set_number()` for more details. - */ - static constexpr int number = -1; - - /** - * See `set_num_threads()` for more details. - */ - static constexpr int num_threads = 1; - }; - -private: - int number = Defaults::number; - int nthreads = Defaults::num_threads; - -public: - /** - * @param n Number of top genes to use as the marker set in each pairwise comparison. - * If -1, this is automatically determined from the number of labels, see `number_of_markers()`. - * - * @return A reference to this `ChooseClassicMarkers` object. - */ - ChooseClassicMarkers& set_number(int n = Defaults::number) { - number = n; - return *this; - } - - /** - * @param nlabels Number of labels in the reference(s). - * - * @return An appropriate number of markers for each pairwise comparison. - * - * The exact expression is defined as \f$500 (\frac{2}{3})^{\log_2{N}}\f$ for \f$N\f$ labels, - * which steadily decreases the markers per comparison as the number of labels increases. - * This aims to avoid an excessive number of features when dealing with references with many labels. - */ - static int number_of_markers(int nlabels) { - return std::round(500.0 * std::pow(2.0/3.0, std::log(static_cast(nlabels)) / std::log(2.0))); - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `ChooseClassicMarkers` object. - */ - ChooseClassicMarkers& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -public: - /** - * @tparam Matrix A **tatami** matrix. - * @tparam Label Integer type for the label identity. - * - * @param representatives Vector of representative matrices. - * Each matrix should contain one column per label; each column should have a representative log-expression profile for that label. - * All matrices should have the same number of rows, corresponding to the same features. - * @param labels Vector of pointers of length equal to `representatives`. - * Each array should be of length equal to the number of columns of the corresponding entry of `representatives`. - * Each value of the array should specify the label for the corresponding column in its matrix. - * Values should lie in $[0, N)$ for $N$ unique labels across all entries of `labels`. - * - * @return A `Markers` object containing the top markers for each pairwise comparison between labels. - */ - template - Markers run(const std::vector& representatives, const std::vector& labels) const { - /** - * @cond - */ - size_t nrefs = representatives.size(); - if (nrefs != labels.size()) { - throw std::runtime_error("'representatives' and 'labels' should have the same length"); - } - if (nrefs == 0) { - throw std::runtime_error("'representatives' should contain at least one entry"); - } - size_t ngenes = representatives.front()->nrow(); - - // Determining the total number of labels. - int nlabels = 0; - for (size_t r = 0; r < nrefs; ++r) { - size_t ncols = representatives[r]->ncol(); - auto curlab = labels[r]; - for (size_t c = 0; c < ncols; ++c) { - if (nlabels <= curlab[c]) { - nlabels = curlab[c] + 1; - } - } - } - - // Generating mappings. - std::vector > labels_to_index(nrefs, std::vector(nlabels, -1)); - for (size_t r = 0; r < nrefs; ++r) { - size_t ncols = representatives[r]->ncol(); - auto curlab = labels[r]; - auto& current = labels_to_index[r]; - for (size_t c = 0; c < ncols; ++c) { - auto& dest = current[curlab[c]]; - if (dest != -1) { - throw std::runtime_error("each label should correspond to no more than one column in each reference"); - } - current[curlab[c]] = c; - } - } - - // Generating pairs for compute; this sacrifices some memory for convenience. - std::vector > pairs; - { - std::set > pairs0; - for (size_t r = 0; r < nrefs; ++r) { - size_t ncols = representatives[r]->ncol(); - auto curlab = labels[r]; - for (size_t c1 = 0; c1 < ncols; ++c1) { - for (size_t c2 = 0; c2 < c1; ++c2) { - pairs0.emplace(curlab[c1], curlab[c2]); - } - } - } - pairs.insert(pairs.end(), pairs0.begin(), pairs0.end()); - std::sort(pairs.begin(), pairs.end()); - } - size_t npairs = pairs.size(); - - Markers output(nlabels, std::vector >(nlabels)); - - int actual_number = number; - if (number < 0) { - actual_number = number_of_markers(nlabels); - } - if (actual_number > static_cast(ngenes)) { - actual_number = ngenes; - } - -#ifndef SINGLEPP_CUSTOM_PARALLEL - #pragma omp parallel num_threads(nthreads) - { -#else - SINGLEPP_CUSTOM_PARALLEL([&](int, size_t start, size_t len) -> void { -#endif - - std::vector > sorter(ngenes), sorted_copy(ngenes); - typedef typename Matrix::value_type Value_; - typedef typename Matrix::index_type Index_; - std::vector rbuffer(ngenes), lbuffer(ngenes); - std::vector > > rworks(nrefs), lworks(nrefs); - -#ifndef SINGLEPP_CUSTOM_PARALLEL - #pragma omp for - for (size_t p = 0; p < npairs; ++p) { -#else - for (size_t p = start, end = start + len; p < end; ++p) { -#endif - - auto curleft = pairs[p].first; - auto curright = pairs[p].second; - - auto sIt = sorter.begin(); - for (int g = 0; g < ngenes; ++g, ++sIt) { - sIt->first = 0; - sIt->second = g; - } - - for (size_t i = 0; i < nrefs; ++i) { - const auto& curavail = labels_to_index[i]; - auto lcol = curavail[curleft]; - auto rcol = curavail[curright]; - if (lcol == -1 || rcol == -1) { - continue; - } - - // Initialize extractors as needed. - auto& lwrk = lworks[i]; - if (!lwrk) { - lwrk = representatives[i]->dense_column(); - } - auto lptr = lwrk->fetch(lcol, lbuffer.data()); - - auto& rwrk = rworks[i]; - if (!rwrk) { - rwrk = representatives[i]->dense_column(); - } - auto rptr = rwrk->fetch(rcol, rbuffer.data()); - - auto sIt = sorter.begin(); - for (int g = 0; g < ngenes; ++g, ++lptr, ++rptr, ++sIt) { - sIt->first += *lptr - *rptr; - } - } - - for (int flip = 0; flip < 2; ++flip) { - // Flipping the log-fold changes so we do the reverse. - // We do a copy so that the treatment of tries would be the same as if - // we had sorted on the reversed log-fold changes in the first place; - // otherwise the first sort might change the order of ties. - if (flip) { - sorter = sorted_copy; - for (auto& s : sorter) { - s.first *= -1; - } - } else { - sorted_copy = sorter; - } - - // partial sort is guaranteed to be stable due to the second index resolving ties. - std::partial_sort(sorter.begin(), sorter.begin() + actual_number, sorter.end()); - - std::vector stuff; - stuff.reserve(actual_number); - for (int g = 0; g < actual_number && sorter[g].first < 0; ++g) { // only keeping those with positive log-fold changes (negative after reversing). - stuff.push_back(sorter[g].second); - } - - if (flip) { - output[curleft][curright] = std::move(stuff); - } else { - output[curright][curleft] = std::move(stuff); - } - } - } - -#ifndef SINGLEPP_CUSTOM_PARALLEL - } -#else - }, npairs, nthreads); -#endif - /** - * @endcond - */ - - return output; - } - - /** - * @tparam Matrix A **tatami** matrix. - * - * @param representative A representative matrix, containing one column per label. - * Each column should have a representative log-expression profile for that label. - * @param labels Pointer to an array of length equal to the number of columns in `representative`. - * Each value of the array should specify the label for the corresponding column. - * Values should lie in $[0, N)$ for $N$ unique labels. - * - * @return A `Markers` object containing the top markers for each pairwise comparison between labels. - */ - template - Markers run(const Matrix* representative, const int* labels) const { - return run(std::vector{ representative }, std::vector{ labels }); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/Classifier.hpp b/inst/include/singlepp/Classifier.hpp deleted file mode 100644 index 8d784fc..0000000 --- a/inst/include/singlepp/Classifier.hpp +++ /dev/null @@ -1,312 +0,0 @@ -#ifndef SINGLEPP_CLASSIFIER_HPP -#define SINGLEPP_CLASSIFIER_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "BasicBuilder.hpp" -#include "BasicScorer.hpp" - -#include -#include - -/** - * @file Classifier.hpp - * - * @brief Defines the `Classifier` class. - */ - -namespace singlepp { - -/** - * @brief Automatically assign cell type labels based on an expression matrix. - * - * This implements the [**SingleR**](https://bioconductor.org/packages/SingleR) algorithm for automated annotation of single-cell RNA-seq data. - * For each cell, we compute the Spearman rank correlation between that cell and the reference expression profiles. - * This is done using only the subset of genes that are label-specific markers, - * most typically the top genes from pairwise comparisons between each label's expression profiles. - * For each label, we take the correlations involving that label's reference profiles and convert it into a score. - * The label with the highest score is used as an initial label for that cell. - * - * For each cell, we apply fine-tuning iterations to improve the label accuracy by refining the feature space. - * At each iteration, we find the subset of labels with scores that are close to the maximum score according to some threshold. - * We recompute the scores based on the markers for this label subset, and we repeat the process until only one label is left in the subset or the subset is unchanged. - * At the end of the iterations, the label with the highest score (or the only label, if just one is left) is used as the label for the cell. - * This process aims to remove noise by eliminating irrelevant genes when attempting to distinguish closely related labels. - * - * Each label's score is defined as a user-specified quantile of the distribution of correlations across all reference profiles assigned to that label. - * (We typically consider a large quantile, e.g., the 80% percentile of the correlations.) - * The use of a quantile avoids problems with differences in the number of reference profiles per label; - * in contrast, just using the "top X correlations" would implicitly favor labels with more reference profiles. - * - * The choice of Spearman's correlation provides some robustness against batch effects when comparing reference and test datasets. - * Only the relative expression _within_ each cell needs to be comparable, not their relative expression across cells. - * As a result, it does not matter whether raw counts are supplied or log-transformed expression values, as the latter is a monotonic transformation of the latter (within each cell). - * The algorithm is also robust to differences in technologies between reference and test profiles, though it is preferable to have like-for-like comparisons. - * - * @see - * Aran D et al. (2019). - * Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. - * _Nat. Immunol._ 20, 163-172 - */ -class Classifier { -public: - /** - * @brief Default parameters for annotation. - */ - struct Defaults { - /** - * See `set_quantile()` for details. - */ - static constexpr double quantile = BasicScorer::Defaults::quantile; - - /** - * See `set_fine_tune_threshold()` for details. - */ - static constexpr double fine_tune_threshold = BasicScorer::Defaults::fine_tune_threshold; - - /** - * See `set_fine_tune()` for details. - */ - static constexpr bool fine_tune = BasicScorer::Defaults::fine_tune; - - /** - * See `set_top()` for details. - */ - static constexpr int top = BasicBuilder::Defaults::top; - - /** - * See `set_approximate()` for details. - */ - static constexpr bool approximate = BasicBuilder::Defaults::approximate; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - -private: - int top = Defaults::top; - bool approximate = Defaults::approximate; - double quantile = Defaults::quantile; - double fine_tune_threshold = Defaults::fine_tune_threshold; - bool fine_tune = Defaults::fine_tune; - int nthreads = Defaults::num_threads; - -public: - /** - * @param q Quantile to use to compute a per-label score from the correlations. - * - * @return A reference to this `Classifier` object. - * - * Values of `q` closer to 0.5 focus on the behavior of the majority of a label's reference profiles. - * Smaller values will be more sensitive to the presence of a subset of profiles that are more similar to the test cell, - * which can be useful when the reference profiles themselves are heterogeneous. - */ - Classifier& set_quantile(double q = Defaults::quantile) { - quantile = q; - return *this; - } - - /** - * @param t Threshold to use to select the top-scoring subset of labels during fine-tuning. - * Larger values increase the chance of recovering the correct label at the cost of computational time. - * - * @return A reference to this `Classifier` object. - * - * Needless to say, one should not set `t` to a value that is too large. - * Otherwise, the first fine-tuning iteration would just contain all labels and there would be no reduction of the marker space. - */ - Classifier& set_fine_tune_threshold(double t = Defaults::fine_tune_threshold) { - fine_tune_threshold = t; - return *this; - } - - /** - * @param f Whether to perform fine-tuning. - * This can be disabled to improve speed at the cost of accuracy. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_fine_tune(bool f = Defaults::fine_tune) { - fine_tune = f; - return *this; - } - - /** - * @param t Number of top markers to use from each pairwise comparison between labels. - * Larger values improve the stability of the correlations at the cost of increasing noise and computational work. - * - * Setting it to a negative value will instruct `run()` to use all supplied markers. - * This is useful in situations where the supplied markers have already been curated. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_top(int t = Defaults::top) { - top = t; - return *this; - } - - /** - * @param a Whether to use an approximate method to quickly find the quantile. - * This sacrifices some accuracy for speed when labels have many reference profiles. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_approximate(bool a = Defaults::approximate) { - approximate = a; - return *this; - } - - /** - * @param n Number of threads to use. - * - * @return A reference to this `Classifier` object. - */ - Classifier& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - BasicBuilder::Prebuilt build_reference(const tatami::Matrix* ref, const int* labels, Markers markers) const { - BasicBuilder builder; - builder - .set_top(top) - .set_approximate(approximate) - .set_num_threads(nthreads); - return builder.run(ref, labels, std::move(markers)); - } - - template - BasicBuilder::PrebuiltIntersection build_reference(size_t mat_nrow, const Id* mat_id, const tatami::Matrix* ref, const Id* ref_id, const int* labels, Markers markers) const { - BasicBuilder builder; - builder - .set_top(top) - .set_approximate(approximate) - .set_num_threads(nthreads); - return builder.run(mat_nrow, mat_id, ref, ref_id, labels, std::move(markers)); - } - - BasicScorer set_up_scorer() const { - BasicScorer scorer; - scorer - .set_quantile(quantile) - .set_fine_tune(fine_tune) - .set_fine_tune_threshold(fine_tune_threshold) - .set_num_threads(nthreads); - return scorer; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param ref An expression matrix for the reference expression profiles. - * This should have non-zero columns and the same number of rows (i.e., genes) at `mat`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers of length equal to the number of labels. - * Each pointer should point to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for that label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run(const tatami::Matrix* mat, const tatami::Matrix* ref, const int* labels, Markers markers, int* best, std::vector& scores, double* delta) const { - auto prebuilt = build_reference(ref, labels, std::move(markers)); - set_up_scorer().run(mat, prebuilt, best, scores, delta); - return; - } - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param ref An expression matrix for the reference expression profiles. - * This should have non-zero columns and the same number of rows (i.e., genes) at `mat`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `BasicScorer::Results` object containing the assigned labels and scores. - */ - BasicScorer::Results run(const tatami::Matrix* mat, const tatami::Matrix* ref, const int* labels, Markers markers) const { - auto prebuilt = build_reference(ref, labels, std::move(markers)); - return set_up_scorer().run(mat, prebuilt); - } - -public: - /** - * @tparam Id Gene identifier for each row. - * - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param[in] mat_id Pointer to an array of identifiers of length equal to the number of rows of `mat`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the assigned label for each cell. - * @param[out] scores Vector of pointers of length equal to the number of labels. - * Each pointer should point to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for that label for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores, possibly after fine-tuning. - * This may also be `NULL` in which case the deltas are not reported. - * - * This version of `run()` applies an intersection to find the common genes between `mat` and `ref`, based on their shared values in `mat_id` and `ref_id`. - * The annotation is then performed using only the subset of common genes. - * The aim is to easily accommodate differences in feature annotation between the test and reference profiles. - */ - template - void run( - const tatami::Matrix* mat, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - Markers markers, - int* best, - std::vector& scores, - double* delta) - const { - auto built = build_reference(mat->nrow(), mat_id, ref, ref_id, labels, std::move(markers)); - set_up_scorer().run(mat, built, best, scores, delta); - return; - } - - /** - * @tparam Id Gene identifier for each row. - * - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * @param[in] mat_id Pointer to an array of identifiers of length equal to the number of rows of `mat`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `Markers` for more details. - * - * @return A `BasicScorer::Results` object containing the assigned labels and scores. - */ - template - BasicScorer::Results run(const tatami::Matrix* mat, const Id* mat_id, const tatami::Matrix* ref, const Id* ref_id, const int* labels, Markers markers) const { - auto built = build_reference(mat->nrow(), mat_id, ref, ref_id, labels, std::move(markers)); - return set_up_scorer().run(mat, built); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/IntegratedBuilder.hpp b/inst/include/singlepp/IntegratedBuilder.hpp deleted file mode 100644 index 1ba3509..0000000 --- a/inst/include/singlepp/IntegratedBuilder.hpp +++ /dev/null @@ -1,578 +0,0 @@ -#ifndef SINGLEPP_INTEGRATED_BUILDER_HPP -#define SINGLEPP_INTEGRATED_BUILDER_HPP - -#include "macros.hpp" - -#include "scaled_ranks.hpp" -#include "BasicBuilder.hpp" - -#include -#include -#include -#include - -/** - * @file IntegratedBuilder.hpp - * - * @brief Prepare for integrated classification across references. - */ - -namespace singlepp { - -/** - * @brief Reference datasets prepared for integrated classification. - */ -struct IntegratedReferences { - /** - * @return Number of reference datasets. - * Each object corresponds to the reference used in an `IntegratedBuilder::add()` call, in the same order. - */ - size_t num_references() const { - return markers.size(); - } - - /** - * @param r Reference dataset of interest. - * @return Number of labels in this reference. - */ - size_t num_labels(size_t r) const { - return markers[r].size(); - } - - /** - * @param r Reference dataset of interest. - * @return Number of profiles in this reference. - */ - size_t num_profiles(size_t r) const { - size_t n = 0; - for (const auto& ref : ranked[r]) { - n += ref.size(); - } - return n; - } - - /** - * @cond - */ - std::vector universe; // To be used by IntegratedScorer for indexed extraction. - - std::vector check_availability; - std::vector > available; // indices to 'universe' - std::vector > > markers; // indices to 'universe' - std::vector > > > ranked; // .second contains indices to 'universe' - - void resize(size_t n) { - check_availability.resize(n); - available.resize(n); - markers.resize(n); - ranked.resize(n); - } - /** - * @endcond - */ -}; - -/** - * @brief Factory to prepare multiple references for integrated classification. - * - * For each reference dataset, we expect a `BasicBuilder::Prebuilt` or `BasicBuilder::PrebuiltIntersection` object, - * as well as the original data structures (matrix, labels, etc.) used to construct that object. - * These values are passed into `add()` to register that dataset, which can be repeated multiple times for different references. - * Finally, calling `finish()` will return a vector of integrated references that can be used in `IntegratedScorer::run()`. - * - * The preparation process mostly involves checking that the gene indices are consistent across references. - * This is especially true when each reference contains a different set of features that must be intersected with the features in the test dataset. - * See the documentation for `IntegratedScorer` for more details on the classification based on the integrated references. - */ -class IntegratedBuilder { -private: - std::vector*> stored_matrices; - std::vector stored_labels; - IntegratedReferences references; - std::vector > gene_mapping; - int nthreads = Defaults::num_threads; - -public: - /** - * @brief Default parameters. - */ - struct Defaults { - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - - /** - * @param n Number of threads to use. - * - * @return A reference to this `IntegratedBuilder` object. - */ - IntegratedBuilder& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - void add_internal_base(const tatami::Matrix* ref, const int* labels) { - stored_matrices.push_back(ref); - stored_labels.push_back(labels); - references.resize(stored_matrices.size()); - gene_mapping.resize(stored_matrices.size()); - } - - // Adding a reference without requiring any pruning of markers. - template - void add_internal_direct(const tatami::Matrix* ref, const int* labels, const Markers& old_markers, const Subset& mat_subset) { - add_internal_base(ref, labels); - - // Adding the markers for each label, indexed according to their - // position in the test matrix. This assumes that 'mat_subset' is - // appropriately specified to contain the test's row indices. - auto& new_markers = references.markers.back(); - new_markers.reserve(old_markers.size()); - - for (size_t i = 0; i < old_markers.size(); ++i) { - const auto& cur_old_markers = old_markers[i]; - - std::unordered_set unified; - for (const auto& x : cur_old_markers) { - unified.insert(x.begin(), x.end()); - } - - new_markers.emplace_back(unified.begin(), unified.end()); - - if constexpr(!std::is_same::value) { - auto& cur_new_markers = new_markers.back(); - for (auto& y : cur_new_markers) { - y = mat_subset[y]; - } - } - } - return; - } - - // Adding a reference with different features from the test, requiring some - // pruning to remove markers that are not present in the intersection. - template - void add_internal_intersect( - const std::vector >& intersection, - const tatami::Matrix* ref, - const int* labels, - const Markers& old_markers, - const Subset& ref_subset) - { - add_internal_base(ref, labels); - - // Manually constructing the markers. This involves (i) pruning out the - // markers that aren't present in the intersection, and (ii) updating - // their indices so that they point to rows of 'mat', not 'ref'. - std::unordered_map reverse_map; - for (const auto& i : intersection) { - reverse_map[i.second] = i.first; - } - - auto subindex = [&](int i) -> int { - if constexpr(!std::is_same::value) { - return ref_subset[i]; - } else { - return i; - } - }; - - auto& new_markers = references.markers.back(); - new_markers.resize(old_markers.size()); - - for (size_t i = 0; i < old_markers.size(); ++i) { - const auto& cur_old_markers = old_markers[i]; - - std::unordered_set unified; - for (const auto& x : cur_old_markers) { - unified.insert(x.begin(), x.end()); - } - - auto& cur_new_markers = new_markers[i]; - cur_new_markers.reserve(unified.size()); - - for (auto y : unified) { - auto it = reverse_map.find(subindex(y)); - if (it != reverse_map.end()) { - cur_new_markers.push_back(it->second); - } - } - } - - // Constructing the mapping of mat's rows to the reference rows. - references.check_availability.back() = true; - auto& mapping = gene_mapping.back(); - for (const auto& i : intersection) { - mapping[i.first] = i.second; - } - return; - } - -public: - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload assumes that the reference and test datasets have the same features. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param ref Matrix containing the reference expression values. - * Rows are features and columns are reference samples. - * The number and identity of features should be identical to the test dataset to be classified in `IntegratedScorer`. - * @param[in] labels Pointer to an array of label assignments. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels in `ref`, see `Markers` for more details. - */ - void add(const tatami::Matrix* ref, const int* labels, const Markers& markers) { - add_internal_direct(ref, labels, markers, false); - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload automatically identifies the intersection of features between the test and reference datasets. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * `mat_id` and `mat_nrow` should also be constant for all invocations to `add()`. - * - * @tparam Id Type of the gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param markers A vector of vectors of ranked marker genes for each pairwise comparison between labels in `ref`, see `Markers` for more details. - */ - template - void add(size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - const Markers& markers) - { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - add_internal_intersect(intersection, ref, labels, markers, false); - } - -public: - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload assumes that the reference and test datasets have the same features, - * and that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param ref Matrix containing the reference expression values. - * Rows are features and columns are reference samples. - * The number and identity of features should be identical to the test dataset to be classified in `IntegratedScorer`. - * @param[in] labels Pointer to an array of label assignments. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on `ref` and `labels`. - */ - void add(const tatami::Matrix* ref, const int* labels, const BasicBuilder::Prebuilt& built) { - add_internal_direct(ref, labels, built.markers, built.subset); - return; - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload requires an existing intersection between the test and reference datasets, - * and assumes that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param intersection Vector defining the intersection of features between the test and reference datasets. - * Each entry is a pair where the first element is the row index in the test matrix, - * and the second element is the row index for the corresponding feature in the reference matrix. - * Each row index for either matrix should occur no more than once in `intersection`. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on all preceding arguments. - */ - void add(const std::vector >& intersection, - const tatami::Matrix* ref, - const int* labels, - const BasicBuilder::PrebuiltIntersection& built) - { - add_internal_direct(ref, labels, built.markers, built.mat_subset); - references.check_availability.back() = true; - - // Constructing the mapping of mat's rows to the reference rows. - auto& mapping = gene_mapping.back(); - for (const auto& i : intersection) { - mapping[i.first] = i.second; - } - return; - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload automatically identifies the intersection of features between the test and reference datasets, - * and assumes that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * `mat_id` and `mat_nrow` should also be constant for all invocations to `add()`. - * - * @tparam Id Type of the gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on all preceding arguments. - */ - template - void add(size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - const BasicBuilder::PrebuiltIntersection& built) - { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - add(intersection, ref, labels, built); - return; - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload assumes that the reference and test datasets have the same features, - * and that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * - * @param intersection Vector defining the intersection of features betweent the test and reference datasets. - * Each entry is a pair where the first element is the row index in the test matrix, - * and the second element is the row index for the corresponding feature in the reference matrix. - * Each row index for either matrix should occur no more than once in `intersection`. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on `ref` and `labels`. - */ - void add(const std::vector >& intersection, - const tatami::Matrix* ref, - const int* labels, - const BasicBuilder::Prebuilt& built) - { - add_internal_intersect(intersection, ref, labels, built.markers, built.subset); - } - - /** - * Add a reference dataset to this object for later use in `finish()`. - * This overload automatically identifies the intersection of features between the test and reference datasets, - * and assumes that the reference dataset has already been processed through `BasicBuilder::run()`. - * `ref` and `labels` are expected to remain valid until `finish()` is called. - * `mat_id` and `mat_nrow` should also be constant for all invocations to `add()`. - * - * @tparam Id Type of the gene identifier for each row. - * - * @param mat_nrow Number of rows (genes) in the test dataset. - * @param[in] mat_id Pointer to an array of identifiers of length equal to `mat_nrow`. - * This should contain a unique identifier for each row of `mat` (typically a gene name or index). - * If any duplicate IDs are present, only the first occurrence is used. - * @param ref An expression matrix for the reference expression profiles, where rows are genes and columns are cells. - * This should have non-zero columns. - * @param[in] ref_id Pointer to an array of identifiers of length equal to the number of rows of any `ref`. - * This should contain a unique identifier for each row in `ref`, and should be comparable to `mat_id`. - * If any duplicate IDs are present, only the first occurrence is used. - * @param[in] labels An array of length equal to the number of columns of `ref`, containing the label for each sample. - * The smallest label should be 0 and the largest label should be equal to the total number of unique labels minus 1. - * @param built The built reference created by running `BasicBuilder::run()` on `ref` and `labels`. - */ - template - void add(size_t mat_nrow, - const Id* mat_id, - const tatami::Matrix* ref, - const Id* ref_id, - const int* labels, - const BasicBuilder::Prebuilt& built) - { - auto intersection = intersect_features(mat_nrow, mat_id, ref->nrow(), ref_id); - add(intersection, ref, labels, built); - } - -private: - /* Here, we've split out some of the functions for easier reading. - * Otherwise everything would land in a single mega-function. - */ - - static void fill_ranks( - const tatami::Matrix* curmat, - const int* curlab, - const std::vector& in_use, - const std::vector& positions, - std::vector > >& cur_ranked, - int nthreads) - { - // If we don't need to check availability, this implies that - // the reference has 1:1 feature mapping to the test dataset. - // In that case, we can proceed quite simply. - tatami::parallelize([&](int, int start, int len) -> void { - RankedVector tmp_ranked; - tmp_ranked.reserve(in_use.size()); - - // 'in_use' is guaranteed to be sorted and unique, see its derivation in finish(). - // This means we can directly use it for indexed extraction. - auto wrk = tatami::consecutive_extractor(curmat, false, start, len, in_use); - std::vector buffer(in_use.size()); - - for (int c = start, end = start + len; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - - tmp_ranked.clear(); - for (int i = 0, end = in_use.size(); i < end; ++i, ++ptr) { - tmp_ranked.emplace_back(*ptr, i); - } - std::sort(tmp_ranked.begin(), tmp_ranked.end()); - - auto& final_ranked = cur_ranked[curlab[c]][positions[c]]; - simplify_ranks(tmp_ranked, final_ranked); - } - }, curmat->ncol(), nthreads); - } - - static void fill_ranks( - const tatami::Matrix* curmat, - const int* curlab, - const std::vector& in_use, - const std::vector& positions, - const std::unordered_map& cur_mapping, - std::unordered_set& cur_available, - std::vector > >& cur_ranked, - int nthreads) - { - // If we need to check availability, then we need to check - // the mapping of test genes to row indices of the reference. - std::vector > remapping; - remapping.reserve(in_use.size()); - - for (int i = 0, end = in_use.size(); i < end; ++i) { - auto it = cur_mapping.find(in_use[i]); - if (it != cur_mapping.end()) { - remapping.emplace_back(it->second, i); // using 'i' instead of 'in_use[i]', as we want to work with indices into 'in_use', not the values of 'in_use' themselves. - cur_available.insert(i); - } - } - - std::sort(remapping.begin(), remapping.end()); - - // This section is just to enable indexed extraction by tatami. - // There's no need to consider duplicates among the - // 'remapping[i].first', 'cur_mapping->second' is guaranteed to be - // unique as a consequence of how intersect_features works. - std::vector remapped_in_use; - remapped_in_use.reserve(remapping.size()); - for (const auto& p : remapping) { - remapped_in_use.push_back(p.first); - } - - tatami::parallelize([&](int, int start, int len) -> void { - RankedVector tmp_ranked; - tmp_ranked.reserve(remapped_in_use.size()); - std::vector buffer(remapped_in_use.size()); - auto wrk = tatami::consecutive_extractor(curmat, false, start, len, remapped_in_use); - - for (size_t c = start, end = start + len; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - - tmp_ranked.clear(); - for (const auto& p : remapping) { - tmp_ranked.emplace_back(*ptr, p.second); // remember, 'p.second' corresponds to indices into 'in_use'. - ++ptr; - } - std::sort(tmp_ranked.begin(), tmp_ranked.end()); - - auto& final_ranked = cur_ranked[curlab[c]][positions[c]]; - simplify_ranks(tmp_ranked, final_ranked); - } - }, curmat->ncol(), nthreads); - } - -public: - /** - * @return The set of integrated references, for use in `IntegratedScorer`. - * - * This function should only be called once, after all reference datasets have been registered with `add()`. - * Any further invocations of this function will not be valid. - */ - IntegratedReferences finish() { - // Identify the union of all marker genes. - auto& in_use = references.universe; - std::unordered_map remap_to_universe; - { - std::unordered_set in_use_tmp; - for (const auto& refmarkers : references.markers) { - for (const auto& mrk : refmarkers) { - in_use_tmp.insert(mrk.begin(), mrk.end()); - } - } - - in_use.insert(in_use.end(), in_use_tmp.begin(), in_use_tmp.end()); - std::sort(in_use.begin(), in_use.end()); - - for (int i = 0, end = in_use.size(); i < end; ++i) { - remap_to_universe[in_use[i]] = i; - } - } - - for (size_t r = 0, end = references.num_references(); r < end; ++r) { - auto curlab = stored_labels[r]; - auto curmat = stored_matrices[r]; - - // Reindexing the markers to point to the universe. - auto& curmarkers = references.markers[r]; - for (auto& outer : curmarkers) { - for (auto& x : outer) { - x = remap_to_universe.find(x)->second; - } - } - - auto& cur_ranked = references.ranked[r]; - std::vector positions; - { - size_t nlabels = curmarkers.size(); - size_t NC = curmat->ncol(); - positions.reserve(NC); - - std::vector samples_per_label(nlabels); - for (size_t c = 0; c < NC; ++c) { - auto& pos = samples_per_label[curlab[c]]; - positions.push_back(pos); - ++pos; - } - - cur_ranked.resize(nlabels); - for (size_t l = 0; l < nlabels; ++l) { - cur_ranked[l].resize(samples_per_label[l]); - } - } - - // Finally filling the rankings. - if (!references.check_availability[r]) { - fill_ranks(curmat, curlab, in_use, positions, references.ranked[r], nthreads); - } else { - fill_ranks(curmat, curlab, in_use, positions, gene_mapping[r], references.available[r], references.ranked[r], nthreads); - } - } - - return std::move(references); - } -}; - -} - -#endif diff --git a/inst/include/singlepp/IntegratedScorer.hpp b/inst/include/singlepp/IntegratedScorer.hpp deleted file mode 100644 index e57b804..0000000 --- a/inst/include/singlepp/IntegratedScorer.hpp +++ /dev/null @@ -1,344 +0,0 @@ -#ifndef SINGLEPP_RECOMPUTE_SCORES_HPP -#define SINGLEPP_RECOMPUTE_SCORES_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "compute_scores.hpp" -#include "scaled_ranks.hpp" -#include "Classifier.hpp" -#include "IntegratedBuilder.hpp" - -#include -#include -#include - -/** - * @file IntegratedScorer.hpp - * - * @brief Integrate classifications from multiple references. - */ - -namespace singlepp { - -/** - * @brief Integrate classifications from multiple references. - * - * In situations where multiple reference datasets are available, - * we would like to obtain a single prediction for each cell from all of those references. - * This is somewhat tricky as the different references are likely to contain strong batch effects, - * complicating the calculation of marker genes between labels from different references (and thus precluding direct use of the usual `Classifier::run()`). - * The labels themselves also tend to be inconsistent, e.g., different vocabularies and resolutions, making it difficult to define sensible groups in a combined "super-reference". - * - * To avoid these issues, we first perform classification within each reference individually. - * For each test cell, we identify its predicted label from a given reference, and we collect all the marker genes for that label (across all pairwise comparisons in that reference). - * After doing this for each reference, we pool all of the collected markers to obtain a common set of interesting genes. - * We then compute the correlation-based score between the test cell's expression profile and its predicted label from each reference, using that common set of genes. - * The label with the highest score is considered the best representative across all references. - * - * This strategy is similar to using `Classifier::run()` without fine-tuning, - * except that we are choosing between the best labels from all references rather than between all labels from one reference. - * The main idea is to create a common feature set so that the correlations can be reasonably compared across references. - * Note that differences in the feature sets across references are tolerated by simply ignoring missing genes when computing the correlations. - * This reduces the comparability of the scores as the effective feature set will vary a little (or a lot, depending) across references; - * nonetheless, it is preferred to taking the intersection, which is liable to leave us with very few genes. - * - * Our approach avoids any direct comparison between the expression profiles of different references, - * allowing us to side-step the question of how to deal with the batch effects. - * Similarly, we defer responsibility on solving the issue of label heterogeneity, - * by just passing along the existing labels and leaving it to the user's interpretation. - */ -class IntegratedScorer { -public: - /** - * @brief Default parameters. - */ - struct Defaults { - /** - * See `set_quantile()` for details. - */ - static constexpr double quantile = Classifier::Defaults::quantile; - - /** - * See `set_num_threads()` for details. - */ - static constexpr int num_threads = 1; - }; - - /** - * @param q Quantile to use to compute a per-label score from the correlations. - * - * @return A reference to this `IntegratedScorer` object. - * - * See `Classifier::set_quantile()` for more details. - */ - IntegratedScorer& set_quantile(double q = Defaults::quantile) { - quantile = q; - return *this; - } - - /** - * @param n Number of threads to use. - * By default, this is inherited from the parent `IntegratedBuilder` object. - * - * @return A reference to this `IntegratedScorer` object. - */ - IntegratedScorer& set_num_threads(int n = Defaults::num_threads) { - nthreads = n; - return *this; - } - -private: - double quantile = Defaults::quantile; - int nthreads = Defaults::num_threads; - -private: - /* Here, we've split out some of the functions for easier reading. - * Otherwise everything would land in a single mega-function. - */ - - static void build_miniverse(int cell, - const std::vector& assigned, - const IntegratedReferences& built, - std::unordered_set& uset, - std::vector& uvec) - { - uset.clear(); - size_t nref = built.num_references(); - - for (size_t r = 0; r < nref; ++r) { - auto best = assigned[r][cell]; - const auto& markers = built.markers[r][best]; - uset.insert(markers.begin(), markers.end()); - } - - uvec.clear(); - uvec.insert(uvec.end(), uset.begin(), uset.end()); - std::sort(uvec.begin(), uvec.end()); - return; - } - - template - static void fill_ranks( - Extractor_* wrk, - const std::vector& miniverse, - int cell, - std::vector& buffer, - RankedVector& data_ranked) - { - data_ranked.clear(); - if (miniverse.empty()) { - return; - } - - auto ptr = wrk->fetch(cell, buffer.data()); - for (auto u : miniverse) { - data_ranked.emplace_back(ptr[u], u); - } - - std::sort(data_ranked.begin(), data_ranked.end()); - return; - } - - static void prepare_mapping( - const IntegratedReferences& built, - size_t ref, - const std::vector& miniverse, - std::unordered_map& mapping) - { - mapping.clear(); - if (built.check_availability[ref]) { - const auto& cur_available = built.available[ref]; - - // If we need to check availability, we reconstruct the mapping - // for the intersection of features available in this reference. - // We then calculate the scaled ranks for the data. - int counter = 0; - for (auto c : miniverse) { - if (cur_available.find(c) != cur_available.end()) { - mapping[c] = counter; - ++counter; - } - } - } else { - // If we don't need to check availability, the mapping is - // much simpler. Technically, we could do this in the outer - // loop if none of the references required checks, but this - // seems like a niche optimization in practical settings. - for (size_t s = 0; s < miniverse.size(); ++s) { - auto u = miniverse[s]; - mapping[u] = s; - } - } - return; - } - -public: - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * The identity of the rows should be consistent with the arguments used in `IntegratedBuilder::add()`. - * @param[in] assigned Vector of pointers of length equal to the number of references. - * Each pointer should point to an array of length equal to the number of columns in `mat`, - * containing the assigned label for each column in each reference. - * @param built Set of integrated references produced by `IntegratedBuilder::finish()`. - * @param[out] best Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the index of the reference with the best label for each cell. - * @param[out] scores Vector of pointers of length equal to the number of references. - * Each pointer should point to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the (non-fine-tuned) score for the best label of that reference for each cell. - * Any pointer may be `NULL` in which case the scores for that label will not be reported. - * @param[out] delta Pointer to an array of length equal to the number of columns in `mat`. - * On output, this is filled with the difference between the highest and second-highest scores. - * This may also be `NULL` in which case the deltas are not reported. - */ - void run( - const tatami::Matrix* mat, - const std::vector& assigned, - const IntegratedReferences& built, - int* best, - std::vector& scores, - double* delta) - const { - auto NR = mat->nrow(); - auto nref = built.num_references(); - - tatami::parallelize([&](int, int start, int len) -> void { - // We perform an indexed extraction, so all subsequent indices - // will refer to indices into this subset (i.e., 'built.universe'). - auto wrk = tatami::consecutive_extractor(mat, false, start, len, built.universe); - std::vector buffer(built.universe.size()); - - RankedVector data_ranked, data_ranked2; - data_ranked.reserve(NR); - data_ranked2.reserve(NR); - RankedVector ref_ranked; - ref_ranked.reserve(NR); - - std::vector scaled_data(NR); - std::vector scaled_ref(NR); - std::unordered_set miniverse_tmp; - std::vector miniverse; - std::unordered_map mapping; - std::vector all_correlations; - - for (int i = start, end = start + len; i < end; ++i) { - build_miniverse(i, assigned, built, miniverse_tmp, miniverse); - fill_ranks(wrk.get(), miniverse, i, buffer, data_ranked); - - // Scanning through each reference and computing the score for the best group. - double best_score = -1000, next_best = -1000; - int best_ref = 0; - - for (size_t r = 0; r < nref; ++r) { - prepare_mapping(built, r, miniverse, mapping); - - scaled_ref.resize(mapping.size()); - scaled_data.resize(mapping.size()); - - data_ranked2.clear(); - subset_ranks(data_ranked, data_ranked2, mapping); - scaled_ranks(data_ranked2, scaled_data.data()); - - // Now actually calculating the score for the best group for - // this cell in this reference. This assumes that 'ref.ranked' - // already contains sorted pairs where the indices refer to the - // rows of the original data matrix. - auto best = assigned[r][i]; - const auto& best_ranked = built.ranked[r][best]; - all_correlations.clear(); - - for (size_t s = 0; s < best_ranked.size(); ++s) { - ref_ranked.clear(); - subset_ranks(best_ranked[s], ref_ranked, mapping); - scaled_ranks(ref_ranked, scaled_ref.data()); - double cor = distance_to_correlation(scaled_ref.size(), scaled_data, scaled_ref); - all_correlations.push_back(cor); - } - - double score = correlations_to_scores(all_correlations, quantile); - if (scores[r]) { - scores[r][i] = score; - } - if (score > best_score) { - next_best = best_score; - best_score = score; - best_ref = r; - } else if (score > next_best) { - next_best = score; - } - } - - if (best) { - best[i] = best_ref; - } - if (delta && nref > 1) { - delta[i] = best_score - next_best; - } - } - - }, mat->ncol(), nthreads); - } - -public: - /** - * @brief Results of the integrated annotation. - */ - struct Results { - /** - * @cond - */ - Results(size_t ncells, size_t nrefs) : best(ncells), scores(nrefs, std::vector(ncells)), delta(ncells) {} - - std::vector scores_to_pointers() { - std::vector output(scores.size()); - for (size_t s = 0; s < scores.size(); ++s) { - output[s] = scores[s].data(); - } - return output; - }; - /** - * @endcond - */ - - /** - * Vector of length equal to the number of cells in the test dataset, - * containing the index of the reference with the top-scoring label for each cell. - */ - std::vector best; - - /** - * Vector of length equal to the number of references, - * containing vectors of length equal to the number of cells in the test dataset. - * Each vector corresponds to a reference and contains the score for the best label in that reference for each cell. - */ - std::vector > scores; - - /** - * Vector of length equal to the number of cells in the test dataset. - * This contains the difference between the highest and second-highest scores for each cell. - */ - std::vector delta; - }; - - /** - * @param mat Expression matrix of the test dataset, where rows are genes and columns are cells. - * The identity of the rows should be consistent with the arguments used in `IntegratedBuilder::add()`. - * @param[in] assigned Vector of pointers of length equal to the number of references. - * Each pointer should point to an array of length equal to the number of columns in `mat`, - * containing the assigned label for each column in each reference. - * @param built Set of integrated references produced by `IntegratedBuilder::finish()`. - * - * @return A `Results` object containing the assigned labels and scores. - */ - Results run(const tatami::Matrix* mat, const std::vector& assigned, const IntegratedReferences& built) const { - Results output(mat->ncol(), built.num_references()); - auto scores = output.scores_to_pointers(); - run(mat, assigned, built, output.best.data(), scores, output.delta.data()); - return output; - } -}; - -} - -#endif diff --git a/inst/include/singlepp/Markers.hpp b/inst/include/singlepp/Markers.hpp deleted file mode 100644 index 9228ec6..0000000 --- a/inst/include/singlepp/Markers.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef SINGLEPP_MARKERS_HPP -#define SINGLEPP_MARKERS_HPP - -#include "macros.hpp" - -#include - -/** - * @file Markers.hpp - * - * @brief Define the `Markers` typedef. - */ - -namespace singlepp { - -/** - * A vector of vectors of ranked marker lists, used to determine which features should be used to compute correlations in `Classifier`. - * - * For a `Markers` object `markers`, let us consider the vector at `markers[0][1]`. - * This vector is expected to contain the ranked indices of the marker genes for label 0 compared to label 1. - * Most typically, this is generated by identifying the genes that are upregulated in label 0 compared to 1 and sorting by decreasing effect size. - * Indices should refer to the rows of the reference expression matrices (i.e., `ref` in the various `Classifier::run()` or `BasicBuilder::run()` methods). - * So, for example, `markers[0][1][0]` should contain the row index of the most upregulated gene in label 0 compared to 1. - * - * For a given reference dataset, the corresponding `Markers` object should have length equal to the number of labels in that reference. - * Each middle vector (i.e., `markers[i]` for non-negative `i` less than the number of labels) should also have length equal to the number of labels. - * The innermost vectors that are not on the diagonal (i.e., `markers[i][j]` for `i != j`) may be of any positive length and should contain unique row indices. - * Any innermost vector along the "diagonal" (i.e., `markers[i][i]`) is typically of zero length. - * - * Ideally, the non-diagonal innermost vectors should be at least as long as the setting of `Classifier::set_top()`. - * The cell type annotation will still work with shorter (non-empty) vectors but it will not be possible to achieve the user-specified `set_top()`. - * In cases involving feature intersections in `run()`, the vectors should be long enough to achieve the specified `set_top()` after removing non-shared genes. - * For longer vectors, the annotation will safely ignore the indices after the `set_top()` specification. - * - * As mentioned previously, the diagonal innermost vectors are typically empty, given that it makes little sense to identify upregulated markers in a label compared to itself. - * That said, any genes stored on the diagonal will be respected and used in all feature subsets for the corresponding label. - * This can be exploited by advanced users to efficiently store "universal" markers for a label, i.e., markers that are applicable in all comparisons to other labels. - */ -typedef std::vector > > Markers; - -} - -#endif diff --git a/inst/include/singlepp/annotate_cells.hpp b/inst/include/singlepp/annotate_cells.hpp deleted file mode 100644 index 1ae2b9d..0000000 --- a/inst/include/singlepp/annotate_cells.hpp +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef SINGLEPP_ANNOTATE_CELLS_HPP -#define SINGLEPP_ANNOTATE_CELLS_HPP - -#include "macros.hpp" - -#include "tatami/tatami.hpp" - -#include "scaled_ranks.hpp" -#include "process_features.hpp" -#include "build_indices.hpp" -#include "fine_tune.hpp" - -#include -#include -#include - -namespace singlepp { - -inline void annotate_cells_simple( - const tatami::Matrix* mat, - size_t num_subset, - const int* subset, - const std::vector& ref, - const Markers& markers, - double quantile, - bool fine_tune, - double threshold, - int* best, - std::vector& scores, - double* delta, - int nthreads) -{ - // Figuring out how many neighbors to keep and how to compute the quantiles. - const size_t NL = ref.size(); - std::vector search_k(NL); - std::vector > coeffs(NL); - for (size_t r = 0; r < NL; ++r) { - double denom = ref[r].index->nobs() - 1; - double prod = denom * (1 - quantile); - auto k = std::ceil(prod) + 1; - search_k[r] = k; - - // `(1 - quantile) - (k - 2) / denom` represents the gap to the smaller quantile, - // while `(k - 1) / denom - (1 - quantile)` represents the gap from the larger quantile. - // The size of the gap is used as the weight for the _other_ quantile, i.e., - // the closer you are to a quantile, the higher the weight. - // We convert these into proportions by dividing by their sum, i.e., `1/denom`. - coeffs[r].first = static_cast(k - 1) - prod; - coeffs[r].second = prod - static_cast(k - 2); - } - - std::vector subcopy(subset, subset + num_subset); - SubsetSorter subsorted(subcopy); - - tatami::parallelize([&](int, int start, int length) -> void { - auto wrk = tatami::consecutive_extractor(mat, false, start, length, subsorted.extraction_subset()); - RankedVector vec(num_subset); - std::vector buffer(num_subset); - - FineTuner ft; - std::vector curscores(NL); - - for (int c = start, end = start + length; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - subsorted.fill_ranks(ptr, vec); - scaled_ranks(vec, buffer.data()); // 'buffer' can be written to, as all data is extracted to 'vec'. - - curscores.resize(NL); - for (size_t r = 0; r < NL; ++r) { - size_t k = search_k[r]; - auto current = ref[r].index->find_nearest_neighbors(buffer.data(), k); - - double last = current[k - 1].second; - last = 1 - 2 * last * last; - if (k == 1) { - curscores[r] = last; - } else { - double next = current[k - 2].second; - next = 1 - 2 * next * next; - curscores[r] = coeffs[r].first * next + coeffs[r].second * last; - } - - if (scores[r]) { - scores[r][c] = curscores[r]; - } - } - - if (!fine_tune) { - auto top = std::max_element(curscores.begin(), curscores.end()); - best[c] = top - curscores.begin(); - if (delta) { - if (curscores.size() > 1) { - double topscore = *top; - *top = -100; - delta[c] = topscore - *std::max_element(curscores.begin(), curscores.end()); - } else { - delta[c] = std::numeric_limits::quiet_NaN(); - } - } - } else { - auto tuned = ft.run(vec, ref, markers, curscores, quantile, threshold); - best[c] = tuned.first; - if (delta) { - delta[c] = tuned.second; - } - } - } - - }, mat->ncol(), nthreads); - - return; -} - -} - -#endif diff --git a/inst/include/singlepp/build_indices.hpp b/inst/include/singlepp/build_indices.hpp deleted file mode 100644 index 0dbe22b..0000000 --- a/inst/include/singlepp/build_indices.hpp +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef SINGLEPP_BUILD_INDICES_HPP -#define SINGLEPP_BUILD_INDICES_HPP - -#include "macros.hpp" - -#include "knncolle/knncolle.hpp" -#include "tatami/tatami.hpp" - -#include "process_features.hpp" -#include "scaled_ranks.hpp" - -#include -#include -#include - -namespace singlepp { - -inline size_t get_nlabels(size_t n, const int* labels) { - if (n == 0) { - throw std::runtime_error("reference dataset must have at least one column"); - } - return *std::max_element(labels, labels + n) + 1; -} - -struct Reference { - std::vector > ranked; - std::shared_ptr > index; -}; - -template -std::vector build_indices(const tatami::Matrix* ref, const int* labels, const std::vector& subset, const Builder& build, int nthreads) { - size_t NC = ref->ncol(); - size_t nlabels = get_nlabels(NC, labels); - std::vector label_count(nlabels); - for (size_t i = 0; i < NC; ++i) { - ++label_count[labels[i]]; - } - - size_t NR = subset.size(); - std::vector nnrefs(nlabels); - std::vector > nndata(nlabels); - for (size_t l = 0; l < nlabels; ++l) { - if (label_count[l] == 0) { - throw std::runtime_error(std::string("no entries for label ") + std::to_string(l)); - } - nnrefs[l].ranked.resize(label_count[l]); - nndata[l].resize(label_count[l] * NR); - } - - std::vector offsets(NC); - { - std::vector counter(nlabels); - for (size_t c = 0; c < NC; ++c) { - auto& o = counter[labels[c]]; - offsets[c] = o; - ++o; - } - } - - SubsetSorter subsorter(subset); - - tatami::parallelize([&](int, int start, int len) -> void { - RankedVector ranked(NR); - std::vector buffer(ref->nrow()); - auto wrk = tatami::consecutive_extractor(ref, false, start, len, subsorter.extraction_subset()); - - for (int c = start, end = start + len; c < end; ++c) { - auto ptr = wrk->fetch(c, buffer.data()); - subsorter.fill_ranks(ptr, ranked); - - auto curlab = labels[c]; - auto curoff = offsets[c]; - auto scaled = nndata[curlab].data() + curoff * NR; - scaled_ranks(ranked, scaled); - - // Storing as a pair of ints to save space; as long - // as we respect ties, everything should be fine. - auto& stored_ranks = nnrefs[curlab].ranked[curoff]; - stored_ranks.reserve(ranked.size()); - simplify_ranks(ranked, stored_ranks); - } - }, ref->ncol(), nthreads); - -#ifndef SINGLEPP_CUSTOM_PARALLEL - #pragma omp parallel for num_threads(nthreads) - for (size_t l = 0; l < nlabels; ++l) { -#else - SINGLEPP_CUSTOM_PARALLEL([&](int, size_t start, size_t len) -> void { - for (size_t l = start, end = start + len; l < end; ++l) { -#endif - nnrefs[l].index = build(NR, label_count[l], nndata[l].data()); - - // Trying to free the memory as we go along, to offset the copying - // of nndata into the memory store owned by the knncolle index. - nndata[l].clear(); - nndata[l].shrink_to_fit(); - -#ifndef SINGLEPP_CUSTOM_PARALLEL - } -#else - } - }, nlabels, nthreads); -#endif - - return nnrefs; -} - -} - -#endif diff --git a/inst/include/singlepp/compute_scores.hpp b/inst/include/singlepp/compute_scores.hpp deleted file mode 100644 index 8ad93fe..0000000 --- a/inst/include/singlepp/compute_scores.hpp +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef SINGLEPP_COMPUTE_SCORES_HPP -#define SINGLEPP_COMPUTE_SCORES_HPP - -#include "macros.hpp" - -#include -#include -#include -#include - -namespace singlepp { - -inline double correlations_to_scores (std::vector& correlations, double quantile) { - const size_t ncells=correlations.size(); - if (ncells==0) { - return std::numeric_limits::quiet_NaN(); - } else if (quantile==1 || ncells==1) { - return *std::max_element(correlations.begin(), correlations.end()); - } else { - const double denom = ncells - 1; - const double prod = denom * quantile; - const size_t left = std::floor(prod); - const size_t right = std::ceil(prod); - - std::nth_element(correlations.begin(), correlations.begin() + right, correlations.end()); - const double rightval=correlations[right]; - if (right == left) { - return rightval; - } - - // Do NOT be tempted to do the second nth_element with the end at begin()+right; - // this does not handle ties properly. - std::nth_element(correlations.begin(), correlations.begin() + left, correlations.end()); - const double leftval=correlations[left]; - - // `quantile - left / denom` represents the gap to the smaller quantile, - // while `right / denom - quantile` represents the gap from the larger quantile. - // The size of the gap is used as the weight for the _other_ quantile, i.e., - // the closer you are to a quantile, the higher the weight. - // We convert these into proportions by dividing by their sum, i.e., `1/denom`. - const double leftweight = right - prod; - const double rightweight = prod - left; - - return rightval * rightweight + leftval * leftweight; - } -} - -template -double distance_to_correlation(size_t n, const Vector& p1, const Vector& p2) { - double d2 = 0; - for (size_t i = 0; i < n; ++i) { - double tmp = p1[i] - p2[i]; - d2 += tmp * tmp; - } - return 1 - 2 * d2; -} - -} - -#endif diff --git a/inst/include/singlepp/fine_tune.hpp b/inst/include/singlepp/fine_tune.hpp deleted file mode 100644 index b7675c0..0000000 --- a/inst/include/singlepp/fine_tune.hpp +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef TATAMI_FINE_TUNE_HPP -#define TATAMI_FINE_TUNE_HPP - -#include "macros.hpp" - -#include "scaled_ranks.hpp" -#include "process_features.hpp" -#include "compute_scores.hpp" -#include "build_indices.hpp" - -#include -#include -#include - -namespace singlepp { - -inline std::pair fill_labels_in_use(const std::vector& scores, double threshold, std::vector& in_use) { - auto it = std::max_element(scores.begin(), scores.end()); - int best_label = it - scores.begin(); - double max_score = *it; - - in_use.clear(); - constexpr double DUMMY = -1000; - double next_score = DUMMY; - const double bound = max_score - threshold; - - for (size_t i = 0; i < scores.size(); ++i) { - const auto& val = scores[i]; - if (val >= bound) { - in_use.push_back(i); - } - if (i != best_label && next_score < val) { - next_score = val; - } - } - - return std::make_pair(best_label, max_score - next_score); -} - -inline std::pair replace_labels_in_use(const std::vector& scores, double threshold, std::vector& in_use) { - auto it = std::max_element(scores.begin(), scores.end()); - int best_index = it - scores.begin(); - double max_score = *it; - - int best_label = in_use[best_index]; - size_t counter = 0; - - constexpr double DUMMY = -1000; - double next_score = DUMMY; - const double bound = max_score - threshold; - - for (size_t i = 0; i < scores.size(); ++i) { - const auto& val = scores[i]; - if (val >= bound) { - in_use[counter] = in_use[i]; - ++counter; - } - if (i != best_index && next_score < val) { - next_score = val; - } - } - - in_use.resize(counter); - return std::make_pair(best_label, max_score - next_score); -} - -class FineTuner { - std::vector labels_in_use; - - std::unordered_map gene_subset; - - std::vector scaled_left, scaled_right; - - std::vector all_correlations; - - RankedVector input_sub; - - RankedVector ref_sub; - -public: - template - std::pair run( - const RankedVector& input, - const std::vector& ref, - const Markers& markers, - std::vector& scores, - double quantile, - double threshold) - { - if (scores.size() <= 1) { - return std::make_pair(0, std::numeric_limits::quiet_NaN()); - } - - auto candidate = fill_labels_in_use(scores, threshold, labels_in_use); - - // If there's only one top label, we don't need to do anything else. - if (labels_in_use.size() == 1) { - return candidate; - } - - // We also give up if every label is in range, because any subsequent - // calculations would use all markers and just give the same result. - // The 'test' parameter allows us to skip this bypass for testing. - if constexpr(!test) { - if (labels_in_use.size() == ref.size()) { - return candidate; - } - } - - while (labels_in_use.size() > 1) { - gene_subset.clear(); - gene_subset.reserve(input.size()); // known maximum. - for (auto l : labels_in_use) { - for (auto l2 : labels_in_use){ - const auto& current = markers[l][l2]; - for (auto c : current) { - if (gene_subset.find(c) == gene_subset.end()) { - const int counter = gene_subset.size(); - gene_subset[c] = counter; - } - } - } - } - - input_sub.clear(); - subset_ranks(input, input_sub, gene_subset); - scaled_left.resize(gene_subset.size()); - scaled_ranks(input_sub, scaled_left.data()); - - scaled_right.resize(gene_subset.size()); - scores.clear(); - - for (size_t i = 0; i < labels_in_use.size(); ++i) { - auto curlab = labels_in_use[i]; - - all_correlations.clear(); - const auto& curref = ref[curlab]; - size_t NR = curref.index->ndim(); - size_t NC = curref.index->nobs(); - - for (size_t c = 0; c < NC; ++c) { - // Technically we could be faster if we remembered the - // subset from the previous fine-tuning iteration, but this - // requires us to (possibly) make a copy of the entire - // reference set; we can't afford to do this in each thread. - ref_sub.clear(); - subset_ranks(curref.ranked[c], ref_sub, gene_subset); - scaled_ranks(ref_sub, scaled_right.data()); - - double cor = distance_to_correlation(scaled_left.size(), scaled_left, scaled_right); - all_correlations.push_back(cor); - } - - double score = correlations_to_scores(all_correlations, quantile); - scores.push_back(score); - } - - candidate = replace_labels_in_use(scores, threshold, labels_in_use); - if (labels_in_use.size() == scores.size()) { // i.e., unchanged. - break; - } - } - - return candidate; - } -}; - -} - -#endif diff --git a/inst/include/singlepp/load_references.hpp b/inst/include/singlepp/load_references.hpp deleted file mode 100644 index 412e329..0000000 --- a/inst/include/singlepp/load_references.hpp +++ /dev/null @@ -1,650 +0,0 @@ -#ifndef SINGLEPP_LOADERS_HPP -#define SINGLEPP_LOADERS_HPP - -#include "macros.hpp" - -#include "byteme/PerByte.hpp" -#include "byteme/RawFileReader.hpp" -#include "tatami/tatami.hpp" - -#ifdef SINGLEPP_USE_ZLIB -#include "byteme/GzipFileReader.hpp" -#include "byteme/ZlibBufferReader.hpp" -#endif - -#include "Markers.hpp" - -#include -#include -#include -#include - -/** - * @file load_references.hpp - * - * @brief Load reference datasets from a few expected formats. - */ - -namespace singlepp { - -/** - * @cond - */ -template -using SuperPerByte = typename std::conditional, byteme::PerByteParallel >::type; - -template -std::vector load_labels_internal(Reader_& reader) { - bool non_empty = false; - int current = 0; - std::vector labels; - bool remaining = true; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\n') { - if (!non_empty) { - throw std::runtime_error("label index must be an integer"); - } - labels.push_back(current); - current = 0; - non_empty = false; - } else if (std::isdigit(x)) { - non_empty = true; - current *= 10; - current += (x - '0'); - } else { - throw std::runtime_error("label index must be an integer"); - } - } - - if (non_empty) { - labels.push_back(current); - } - - return labels; -} -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the labels. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector containing the label index for each reference profile. - * - * The file should contain one line per profile, containing an integer label index for that profile. - * Label indices refer to another array containing the actual names of the labels. - * The total number of lines should be equal to the number of profiles in the dataset. - * The file should not contain any header. - */ -template -std::vector load_labels_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_labels_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the labels. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector containing the label index for each reference profile. - * - * See `load_labels_from_text_file()` for details about the format. - */ -template -std::vector load_labels_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_labels_internal(reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string of labels. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when decompressing the buffer. - * - * @return Vector containing the label index for each reference profile. - * - * See `load_labels_from_text_file()` for details about the format. - */ -template -inline std::vector load_labels_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_labels_internal(reader); -} - -#endif - -/** - * @cond - */ -template -std::vector load_names_internal(Reader_& reader) { - std::string current; - std::vector names; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\n') { - names.emplace_back(std::move(current)); - current.clear(); - } else { - current += x; - } - } - - if (!current.empty()) { // absence of trailing newline is okay. - names.emplace_back(std::move(current)); - } - - return names; -}; -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the label names. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector of strings containing the name for each reference profile. - * - * The file should contain one line per label, containing a (non-quoted) string with the full name for that label. - * The total number of lines should be equal to the number of unique labels in the dataset. - * The file should not contain any header. - */ -template -std::vector load_label_names_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_names_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the label names. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Vector of strings containing the name for each label. - * - * See `load_label_names_from_text_file()` for details about the format. - */ -template -std::vector load_label_names_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_names_internal(reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string of label names. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when decompressing the buffer. - * - * @return Vector of strings containing the name for each label. - * - * See `load_label_names_from_text_file()` for details about the format. - */ -template -std::vector load_label_names_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_names_internal(reader); -} - -#endif - -/** - * @cond - */ -template -std::pair, std::vector > load_features_internal(Reader_& reader) { - std::string current; - std::vector ensembl, symbols; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - current.clear(); - - // Pulling out the Ensembl ID. - do { - char x = pb.get(); - if (x == ',') { // don't advance yet, so that okay check below doesn't trigger if the symbol is empty and file is not newline terminated. - break; - } else if (x == '\n') { - okay = false; // hit the error below. - break; - } - current += x; - okay = pb.advance(); - } while (okay); - - if (!okay) { - throw std::runtime_error("two comma-separated fields (Ensembl ID and symbol) expected on each line"); - } - okay = pb.advance(); - - ensembl.push_back(std::move(current)); - current.clear(); - - // Now pulling out the gene symbol. - while (okay) { - char x = pb.get(); - okay = pb.advance(); - if (x == '\n') { - break; - } - current += x; - } - - symbols.push_back(std::move(current)); // still gets added if file is not newline terminated. - current.clear(); - } - - return std::make_pair(std::move(ensembl), std::move(symbols)); -} -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the feature annotation. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Pair of vectors, each of length equal to the number of features. - * The first contains Ensembl IDs while the second contains gene symbols. - * - * The file should contain one line per feature, with total number of lines equal to the number of features in the dataset. - * Each line should contain two strings separated by a comma. - * The first string should be the Ensembl ID while the second string should be the gene symbol; either string may be empty. - * The file should not contain any header. - */ -template -std::pair, std::vector > load_features_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_features_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the feature annotation. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return Pair of vectors, each of length equal to the number of features. - * The first contains Ensembl IDs while the second contains gene symbols. - * - * See `load_features_from_text_file()` for details about the format. - */ -template -std::pair, std::vector > load_features_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_features_internal(reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string containing the feature annotation. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when decompressing the buffer. - * - * @return Pair of vectors, each of length equal to the number of features. - * The first contains Ensembl IDs while the second contains gene symbols. - * - * See `load_features_from_text_file()` for details about the format. - */ -template -std::pair, std::vector > load_features_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_features_internal(reader); -} - -#endif - -/** - * Matrix of ranks as a dense column-major matrix. - * Each column corresponds to a sample while each row corresponds to a feature. - * Each column contains the ranked expression values for all features. - * - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - */ -template -using RankMatrix = tatami::DenseColumnMatrix >; - -/** - * @cond - */ -template -RankMatrix load_rankings_internal(Reader& reader) { - size_t nfeatures = 0; - size_t line = 0; - std::vector values; - - int field = 0; - bool non_empty = false; - int current = 0; - - bool has_nfeatures = false; - auto check_nfeatures = [&]() -> void { - if (!has_nfeatures) { - has_nfeatures = true; - nfeatures = field + 1; - } else if (field + 1 != nfeatures) { - throw std::runtime_error("number of fields on each line should be equal to the number of features"); - } - }; - - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\n') { - check_nfeatures(); - if (!non_empty) { - throw std::runtime_error("fields should not be empty"); - } - values.push_back(current); - current = 0; - field = 0; - non_empty = false; - ++line; - - } else if (x == ',') { - if (!non_empty) { - throw std::runtime_error("fields should not be empty"); - } - values.push_back(current); - current = 0; - ++field; - non_empty = false; - - } else if (std::isdigit(x)) { - non_empty = true; - current *= 10; - current += (x - '0'); - - } else { - throw std::runtime_error("fields should only contain integer ranks"); - } - } - - if (field || non_empty) { // aka no terminating newline. - check_nfeatures(); - if (!non_empty) { - throw std::runtime_error("fields should not be empty"); - } - values.push_back(current); - ++line; - } - - return RankMatrix(nfeatures, line, std::move(values)); -}; -/** - * @endcond - */ - -/** - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the ranking matrix. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `RankMatrix` containing the feature rankings for each reference profile. - * Each column corresponds to a reference profile while each row corresponds to a feature. - * - * The file should contain one line per reference profile, with the total number of lines equal to the number of profiles in the dataset. - * Each line should contain the rank of each feature's expression within that profile, separated by commas. - * The number of comma-separated fields on each line should be equal to the number of features. - * Ranks should be strictly integer - tied ranks should default to the minimum rank among the index set of ties. - */ -template -RankMatrix load_rankings_from_text_file(const char* path, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_rankings_internal(reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the ranking matrix. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `RankMatrix` containing the feature rankings for each reference profile. - * Each column corresponds to a reference profile while each row corresponds to a feature. - * - * See `load_rankings_from_text_file()` for details about the format. - */ -template -RankMatrix load_rankings_from_gzip_file(const char* path, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_rankings_internal(reader); -} - -/** - * @tparam Data Numeric type for data in the matrix interface. - * @tparam Index Integer type for indices in the matrix interface. - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string containing the ranking matrix. - * @param len Length of the array for `buffer`. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `RankMatrix` containing the feature rankings for each reference profile. - * Each column corresponds to a reference profile while each row corresponds to a feature. - * - * See `load_rankings_from_text_file()` for details about the format. - */ -template -RankMatrix load_rankings_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_rankings_internal(reader); -} - -#endif - -/** - * @cond - */ -template -Markers load_markers_internal(size_t nfeatures, size_t nlabels, Reader& reader) { - Markers markers(nlabels); - for (auto& m : markers) { - m.resize(nlabels); - } - - std::vector values; - SuperPerByte pb(&reader); - bool okay = pb.valid(); - while (okay) { - - // Processing the label IDs. - size_t first = 0, second = 0; - for (int l = 0; l < 2; ++l) { - auto& current = (l == 0 ? first : second); - bool non_empty = false; - - do { - char x = pb.get(); - okay = pb.advance(); - - if (x == '\t') { - if (!non_empty) { - throw std::runtime_error("empty field detected in the label indices"); - } - break; - } else if (x == '\n') { - okay = false; // hit the error below. - break; - } else if (!std::isdigit(x)) { - throw std::runtime_error("label indices should be integers"); - } - - non_empty = true; - current *= 10; - current += (x - '0'); - } while (okay); - - if (!okay) { - throw std::runtime_error("expected at least three tab-separated fields on each line"); - } - if (current >= markers.size()) { - throw std::runtime_error("label index out of range"); - } - } - - // Processing the actual gene indices. - bool non_empty = false; - int current = 0; - while (okay) { - char x = pb.get(); - okay = pb.advance(); - - if (std::isdigit(x)) { - non_empty = true; - current *= 10; - current += (x - '0'); - - } else if (x == '\t') { - if (!non_empty) { - throw std::runtime_error("gene index fields should not be empty"); - } - values.push_back(current); - current = 0; - non_empty = false; - - } else if (x == '\n') { - break; - - } else { - throw std::runtime_error("gene index fields should be integers"); - } - } - - // Adding the last element. We don't do this inside the newline check, - // as we need to account for cases where the file is not newline-terminated. - if (!non_empty) { - throw std::runtime_error("gene index fields should not be empty"); - } - values.push_back(current); - - for (auto v : values) { - if (static_cast(v) >= nfeatures) { - throw std::runtime_error("gene index out of range"); - } - } - - auto& store = markers[first][second]; - if (!store.empty()) { - throw std::runtime_error("multiple marker sets listed for a single pairwise comparison"); - } - store.swap(values); // implicit clear of 'values', as store is empty. - } - - return markers; -} -/** - * @endcond - */ - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a text file containing the marker lists. - * @param nfeatures Total number of features in the dataset. - * @param nlabels Number of labels in the dataset. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `Markers` object containing the markers from each pairwise comparison between labels. - * - * The file should contain one line per pairwise comparison between labels. - * Each line should at least 3 tab-delimited fields - the index of the first label, the index of the second label, - * and then the indices of the features selected as marker genes for the first label relative to the second. - * Any (non-zero) number of marker indices may be reported provided they are ordered by marker strength. - * The total number of lines in this file should be equal to the total number of pairwise comparisons between different labels, including permutations. - */ -template -Markers load_markers_from_text_file(const char* path, size_t nfeatures, size_t nlabels, size_t buffer_size = 65536) { - byteme::RawFileReader reader(path, buffer_size); - return load_markers_internal(nfeatures, nlabels, reader); -} - -#ifdef SINGLEPP_USE_ZLIB - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param path Path to a Gzip-compressed file containing the marker lists. - * @param nfeatures Total number of features in the dataset. - * @param nlabels Number of labels in the dataset. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `Markers` object containing the markers from each pairwise comparison between labels. - * - * See `load_markers_from_text_file()` for details about the format. - */ -template -Markers load_markers_from_gzip_file(const char* path, size_t nfeatures, size_t nlabels, size_t buffer_size = 65536) { - byteme::GzipFileReader reader(path, buffer_size); - return load_markers_internal(nfeatures, nlabels, reader); -} - -/** - * @tparam parallel_ Whether file loading and parsing should be parallelized. - * - * @param[in] buffer Pointer to an array containing a Zlib/Gzip-compressed string containing the marker lists. - * @param len Length of the array for `buffer`. - * @param nfeatures Total number of features in the dataset. - * @param nlabels Number of labels in the dataset. - * @param buffer_size Size of the buffer to use when reading the file. - * - * @return A `Markers` object containing the markers from each pairwise comparison between labels. - * - * See `load_markers_from_text_file()` for details about the format. - */ -template -Markers load_markers_from_zlib_buffer(const unsigned char* buffer, size_t len, size_t nfeatures, size_t nlabels, size_t buffer_size = 65536) { - byteme::ZlibBufferReader reader(buffer, len, 3, buffer_size); - return load_markers_internal(nfeatures, nlabels, reader); -} - -#endif - -} - -#endif diff --git a/inst/include/singlepp/macros.hpp b/inst/include/singlepp/macros.hpp deleted file mode 100644 index 8ac1ddd..0000000 --- a/inst/include/singlepp/macros.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef SINGLEPP_MACROS_HPP -#define SINGLEPP_MACROS_HPP - -/** - * @file macros.hpp - * - * @brief Set common macros used throughout **singlepp**. - * - * @details - * The `SINGLEPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme. - * This function should be a template that accept three arguments: - * - * - `fun`, a lambda that accepts three arguments: - * - `thread`, the thread number. - * - `start`, the index of the first job for this thread. - * - `length`, the number of jobs for this thread. - * - `njobs`, an integer specifying the number of jobs. - * - `nthreads`, an integer specifying the number of threads to use. - * - * The function should split `[0, njobs)` into any number of contiguous, non-overlapping intervals, and call `fun` on each interval, possibly in different threads. - * The details of the splitting and evaluation are left to the discretion of the developer defining the macro. - * The function should only return once all evaluations of `fun` are complete. - * - * If `SINGLEPP_CUSTOM_PARALLEL` is set, the following macros are also set (if they are not already defined): - * - * - `TATAMI_CUSTOM_PARALLEL`, from the [**tatami**](https://ltla.github.io/tatami) library. - * - * This ensures that any custom parallelization scheme is propagated to all of **singlepp**'s dependencies. - * If these libraries are used outside of **singlepp**, some care is required to ensure that the macros are consistently defined through the client library/application; - * otherwise, developers may observe ODR compilation errors. - */ - -// Synchronizing all parallelization schemes. -#ifdef SINGLEPP_CUSTOM_PARALLEL - -#ifndef TATAMI_CUSTOM_PARALLEL -#define TATAMI_CUSTOM_PARALLEL SINGLEPP_CUSTOM_PARALLEL -#endif - -#endif - -#endif diff --git a/inst/include/singlepp/process_features.hpp b/inst/include/singlepp/process_features.hpp deleted file mode 100644 index a8e533e..0000000 --- a/inst/include/singlepp/process_features.hpp +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef SINGLEPP_PROCESS_FEATURES_HPP -#define SINGLEPP_PROCESS_FEATURES_HPP - -#include "macros.hpp" - -#include "Markers.hpp" - -#include -#include -#include -#include - -namespace singlepp { - -typedef std::vector > Intersection; - -template -Intersection intersect_features(size_t mat_n, const Id* mat_id, size_t ref_n, const Id* ref_id) { - std::unordered_map > intersection; - intersection.reserve(mat_n); - for (size_t i = 0; i < mat_n; ++i) { - if (intersection.find(mat_id[i]) == intersection.end()) { // only using the first occurrence of each ID in mat_id. - intersection[mat_id[i]] = std::make_pair(i, -1); - } - } - - for (size_t i = 0; i < ref_n; ++i) { - auto it = intersection.find(ref_id[i]); - if (it != intersection.end()) { // only using the first occurrence of each ID in ref_id. - auto& target = (it->second).second; - if (target < 0) { - target = i; - } - } - } - - Intersection pairings; - pairings.reserve(intersection.size()); - for (const auto& x : intersection) { - if (x.second.second >= 0) { - pairings.push_back(x.second); - } - } - - return pairings; -} - -inline void subset_markers(Intersection& intersection, Markers& markers, int top) { - std::unordered_set available; - available.reserve(intersection.size()); - for (const auto& in : intersection) { - available.insert(in.second); - } - - // Figuring out the top markers to retain, that are _also_ in the intersection. - std::unordered_set all_markers; - all_markers.reserve(intersection.size()); - - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - - std::vector replacement; - size_t upper_bound = static_cast(top >= 0 ? top : -1); // in effect, no upper bound if top = -1. - replacement.reserve(top >= 0 ? static_cast(top) : current.size()); - - for (size_t k = 0; k < current.size() && replacement.size() < upper_bound; ++k) { - if (available.find(current[k]) != available.end()) { - all_markers.insert(current[k]); - replacement.push_back(current[k]); - } - } - current.swap(replacement); - } - } - - // Subsetting the intersection down to the chosen set of markers. - std::unordered_map mapping; - mapping.reserve(intersection.size()); - { - size_t counter = 0; - for (size_t i = 0; i < intersection.size(); ++i) { - if (all_markers.find(intersection[i].second) != all_markers.end()) { - intersection[counter] = intersection[i]; - mapping[intersection[i].second] = counter; - ++counter; - } - } - intersection.resize(counter); - } - - // Reindexing the markers. - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - for (size_t k = 0; k < current.size(); ++k) { - auto it = mapping.find(current[k]); - current[k] = it->second; - } - } - } - - return; -} - -// Use this method when the feature spaces are already identical. -inline std::vector subset_markers(Markers& markers, int top) { - std::unordered_set available; - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - if (top >= 0) { - current.resize(std::min(current.size(), static_cast(top))); - } - available.insert(current.begin(), current.end()); - } - } - - std::vector subset(available.begin(), available.end()); - std::sort(subset.begin(), subset.end()); - - std::unordered_map mapping; - mapping.reserve(subset.size()); - for (size_t i = 0; i < subset.size(); ++i) { - mapping[subset[i]] = i; - } - - // Reindexing the markers. - for (size_t i = 0; i < markers.size(); ++i) { - for (size_t j = 0; j < markers[i].size(); ++j) { - auto& current = markers[i][j]; - for (size_t k = 0; k < current.size(); ++k) { - auto it = mapping.find(current[k]); - current[k] = it->second; - } - } - } - - return subset; -} - -inline std::pair, std::vector > unzip(const Intersection& intersection) { - std::vector left(intersection.size()), right(intersection.size()); - for (size_t i = 0; i < intersection.size(); ++i) { - left[i] = intersection[i].first; - right[i] = intersection[i].second; - } - return std::make_pair(std::move(left), std::move(right)); -} - -} - -#endif diff --git a/inst/include/singlepp/scaled_ranks.hpp b/inst/include/singlepp/scaled_ranks.hpp deleted file mode 100644 index f5eb381..0000000 --- a/inst/include/singlepp/scaled_ranks.hpp +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef SINGLEPP_SCALED_RANKS_HPP -#define SINGLEPP_SCALED_RANKS_HPP - -#include "macros.hpp" - -#include -#include -#include -#include - -namespace singlepp { - -template -using RankedVector = std::vector >; - -// This class sanitizes any user-provided subsets so that we can provide a -// sorted and unique subset to the tatami extractor. We then undo the sorting -// to use the original indices in the rank filler. This entire thing is -// necessary as the behavior of the subsets isn't something that the user can -// easily control (e.g., if the reference/test datasets do not use the same -// feature ordering, in which case the subset is necessarily unsorted). -struct SubsetSorter { - bool use_sorted_subset = false; - const std::vector* original_subset; - std::vector sorted_subset, original_indices; - - SubsetSorter(const std::vector& sub) : original_subset(&sub) { - size_t num_subset = sub.size(); - for (size_t i = 1; i < num_subset; ++i) { - if (sub[i] <= sub[i-1]) { - use_sorted_subset = true; - break; - } - } - - if (use_sorted_subset) { - std::vector > store; - store.reserve(num_subset); - for (size_t i = 0; i < num_subset; ++i) { - store.emplace_back(sub[i], i); - } - - std::sort(store.begin(), store.end()); - sorted_subset.reserve(num_subset); - original_indices.resize(num_subset); - for (const auto& s : store) { - if (sorted_subset.empty() || sorted_subset.back() != s.first) { - sorted_subset.push_back(s.first); - } - original_indices[s.second] = sorted_subset.size() - 1; - } - } - } - - const std::vector& extraction_subset() const { - if (use_sorted_subset) { - return sorted_subset; - } else { - return *original_subset; - } - } - - void fill_ranks(const double* ptr, RankedVector& vec) const { - if (use_sorted_subset) { - size_t num = original_indices.size(); - for (size_t s = 0; s < num; ++s) { - vec[s].first = ptr[original_indices[s]]; - vec[s].second = s; - } - } else { - size_t num = original_subset->size(); - for (size_t s = 0; s < num; ++s, ++ptr) { - vec[s].first = *ptr; - vec[s].second = s; - } - } - std::sort(vec.begin(), vec.end()); - } -}; - -template -void scaled_ranks(const RankedVector& collected, double* outgoing) { - // Computing tied ranks. - size_t cur_rank = 0; - auto cIt = collected.begin(); - - while (cIt != collected.end()) { - auto copy = cIt; - ++copy; - double accumulated_rank = cur_rank; - ++cur_rank; - - while (copy != collected.end() && copy->first == cIt->first) { - accumulated_rank += cur_rank; - ++cur_rank; - ++copy; - } - - double mean_rank= accumulated_rank / (copy - cIt); - while (cIt!=copy) { - outgoing[cIt->second] = mean_rank; - ++cIt; - } - } - - // Mean-adjusting and converting to cosine values. - double sum_squares = 0; - size_t N = collected.size(); - const double center_rank = static_cast(N - 1)/2; - for (size_t i = 0 ; i < N; ++i) { - auto& o = outgoing[i]; - o -= center_rank; - sum_squares += o*o; - } - - // Special behaviour for no-variance cells; these are left as all-zero scaled ranks. - sum_squares = std::max(sum_squares, 0.00000001); - sum_squares = std::sqrt(sum_squares)*2; - for (size_t i = 0; i < N; ++i) { - outgoing[i] /= sum_squares; - } - - return; -} - -template -void subset_ranks(const RankedVector& x, RankedVector& output, const std::unordered_map& subset) { - for (size_t i = 0; i < x.size(); ++i) { - auto it = subset.find(x[i].second); - if (it != subset.end()) { - output.emplace_back(x[i].first, it->second); - } - } - return; -} - -template -void simplify_ranks(const RankedVector& x, RankedVector& output) { - if (x.size()) { - Index counter = 0; - auto last = x[0].first; - for (const auto& r : x) { - if (r.first != last) { - ++counter; - last = r.first; - } - output.emplace_back(counter, r.second); - } - } - return; -} - -} - -#endif diff --git a/inst/include/singlepp/singlepp.hpp b/inst/include/singlepp/singlepp.hpp deleted file mode 100644 index 21e35fa..0000000 --- a/inst/include/singlepp/singlepp.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef SINGLEPP_UMBRELLA_HPP -#define SINGLEPP_UMBRELLA_HPP - -/** - * @file singlepp.hpp - * - * @brief Umbrella header for all includes. - */ - -#include "Classifier.hpp" -#include "Markers.hpp" -#include "IntegratedScorer.hpp" -#include "IntegratedBuilder.hpp" -#include "ChooseClassicMarkers.hpp" -#include "load_references.hpp" - -/** - * @namespace singlepp - * Cell type classification using the SingleR algorithm in C++. - */ - -#endif diff --git a/inst/include/source.sh b/inst/include/source.sh deleted file mode 100755 index b36c255..0000000 --- a/inst/include/source.sh +++ /dev/null @@ -1,16 +0,0 @@ -# Fetches all of the header files. - -if [ ! -e source-singlepp ] -then - git clone https://github.com/LTLA/singlepp source-singlepp - cd source-singlepp -else - cd source-singlepp - git pull -fi - -git checkout 53bc74819c367db9d26c785de66206fe6c4f3890 -rm -rf ../singlepp -cp -r include/singlepp/ ../singlepp -git checkout master -cd - diff --git a/man/SingleR.Rd b/man/SingleR.Rd index ddc9c18..d706e0f 100644 --- a/man/SingleR.Rd +++ b/man/SingleR.Rd @@ -45,7 +45,7 @@ Row names may be different across entries but only the intersection will be used \item{labels}{A character vector or factor of known labels for all samples in \code{ref}. Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}.} +Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}.} \item{method}{Deprecated.} diff --git a/man/classifySingleR.Rd b/man/classifySingleR.Rd index 51a71f8..13b8b9e 100644 --- a/man/classifySingleR.Rd +++ b/man/classifySingleR.Rd @@ -13,17 +13,20 @@ classifySingleR( sd.thresh = NULL, prune = TRUE, assay.type = "logcounts", - check.missing = TRUE, + check.missing = FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) } \arguments{ \item{test}{A numeric matrix of single-cell expression values where rows are genes and columns are cells. +Each row should be named with the gene name. Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix.} \item{trained}{A \linkS4class{List} containing the output of the \code{\link{trainSingleR}} function. +If the row names of \code{test} are not exactly the same as the reference dataset, the call to \code{trainSingleR} should set \code{test.genes=rownames(test)}. + Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multiple references.} \item{quantile}{A numeric scalar specifying the quantile of the correlation distribution to use to compute the score for each label.} @@ -38,8 +41,7 @@ Alternatively, a List of Lists produced by \code{\link{trainSingleR}} for multip \item{assay.type}{Integer scalar or string specifying the matrix of expression values to use if \code{test} is a \linkS4class{SummarizedExperiment}.} -\item{check.missing}{Logical scalar indicating whether rows should be checked for missing values. -If true and any missing values are found, the rows containing these values are silently removed.} +\item{check.missing}{Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.} \item{num.threads}{Integer scalar specifying the number of threads to use for classification.} diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd index 19920e9..356b9da 100644 --- a/man/combineRecomputedResults.Rd +++ b/man/combineRecomputedResults.Rd @@ -10,9 +10,9 @@ combineRecomputedResults( trained, quantile = 0.8, assay.type.test = "logcounts", - check.missing = TRUE, - allow.lost = FALSE, + check.missing = FALSE, warn.lost = TRUE, + allow.lost = FALSE, num.threads = bpnworkers(BPPARAM), BPPARAM = SerialParam() ) @@ -32,11 +32,11 @@ or (ii) running \code{trainSingleR} on each reference separately and manually ma \item{assay.type.test}{An integer scalar or string specifying the assay of \code{test} containing the relevant expression matrix, if \code{test} is a \linkS4class{SummarizedExperiment} object.} -\item{check.missing}{Logical scalar indicating whether rows should be checked for missing values (and if found, removed).} +\item{check.missing}{Deprecated and ignored, as any row filtering will cause mismatches with the \code{test.genes=} used in \code{\link{trainSingleR}}.} -\item{allow.lost}{Deprecated.} +\item{warn.lost}{Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are absent in other references.} -\item{warn.lost}{Logical scalar indicating whether to emit a warning if markers from one reference in \code{trained} are \dQuote{lost} in other references.} +\item{allow.lost}{Deprecated.} \item{num.threads}{Integer scalar specifying the number of threads to use for index building and classification.} @@ -51,7 +51,7 @@ This mimics the output of \code{\link{classifySingleR}} and contains the followi For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference; scores are not recomputed for the other labels. \item \code{labels}, a character vector containing the per-cell combined label across references. -\item \code{references}, an integer vector specifying the reference from which the combined label was derived. +\item \code{reference}, an integer vector specifying the reference from which the combined label was derived. \item \code{orig.results}, a DataFrame containing \code{results}. } It may also contain \code{pruned.labels} if these were also present in \code{results}. diff --git a/man/getClassicMarkers.Rd b/man/getClassicMarkers.Rd index a04662a..54a7927 100644 --- a/man/getClassicMarkers.Rd +++ b/man/getClassicMarkers.Rd @@ -21,13 +21,12 @@ In general, the expression values are expected to be log-transformed, see Detail Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. -Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references, -in which case the row names are expected to be the same across all objects.} +Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references.} \item{labels}{A character vector or factor of known labels for all samples in \code{ref}. Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}.} +Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}.} \item{assay.type}{An integer scalar or string specifying the assay of \code{ref} containing the relevant expression matrix, if \code{ref} is a \linkS4class{SummarizedExperiment} object (or is a list that contains one or more such objects).} diff --git a/man/trainSingleR.Rd b/man/trainSingleR.Rd index af34530..ca17f58 100644 --- a/man/trainSingleR.Rd +++ b/man/trainSingleR.Rd @@ -7,6 +7,7 @@ trainSingleR( ref, labels, + test.genes = NULL, genes = "de", sd.thresh = NULL, de.method = c("classic", "wilcox", "t"), @@ -31,13 +32,15 @@ In general, the expression values are expected to be log-transformed, see Detail Alternatively, a \linkS4class{SummarizedExperiment} object containing such a matrix. -Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references, -in which case the row names are expected to be the same across all objects.} +Alternatively, a list or \linkS4class{List} of SummarizedExperiment objects or numeric matrices containing multiple references.} \item{labels}{A character vector or factor of known labels for all samples in \code{ref}. Alternatively, if \code{ref} is a list, \code{labels} should be a list of the same length. -Each element should contain a character vector or factor specifying the label for the corresponding entry of \code{ref}.} +Each element should contain a character vector or factor specifying the labels for the columns of the corresponding element of \code{ref}.} + +\item{test.genes}{Character vector of the names of the genes in the test dataset, i.e., the row names of \code{test} in \code{\link{classifySingleR}}. +If \code{NULL}, it is assumed that the test dataset and \code{ref} have the same genes in the same row order.} \item{genes}{A string containing \code{"de"}, indicating that markers should be calculated from \code{ref}. For back compatibility, other string values are allowed but will be ignored with a deprecation warning. @@ -83,11 +86,11 @@ if \code{ref} is a \linkS4class{SummarizedExperiment} object (or is a list that \item{check.missing}{Logical scalar indicating whether rows should be checked for missing values. If true and any missing values are found, the rows containing these values are silently removed.} -\item{approximate}{Logical scalar indicating whether a faster approximate method should be used to compute the quantile.} +\item{approximate}{Deprecated, use \code{BNPARAM} instead.} \item{num.threads}{Integer scalar specifying the number of threads to use for index building.} -\item{BNPARAM}{Deprecated and ignored.} +\item{BNPARAM}{A \linkS4class{BiocNeighborParam} object specifying how the neighbor search index should be constructed.} \item{BPPARAM}{A \linkS4class{BiocParallelParam} object specifying how parallelization should be performed. Relevant for marker detection if \code{genes = NULL}, aggregation if \code{aggr.ref = TRUE}, and \code{NA} checking if \code{check.missing = TRUE}.} @@ -120,9 +123,15 @@ This is done by identifying the median expression within each label, and computi For each label, the top \code{de.n} genes with the largest differences compared to another label are chosen as markers to distinguish the two labels. The expression values are expected to be log-transformed and normalized. -If \code{restrict} is specified, \code{ref} is subsetted to only include the rows with names that are in \code{restrict}. -Marker selection and all subsequent classification will be performed using this restrictive subset of genes. -This can be convenient for ensuring that only appropriate genes are used (e.g., not pseudogenes or predicted genes). +Classification with \code{classifySingleR} assumes that the test dataset contains all marker genes that were detected from the reference. +If the test and reference datasets do not have the same genes in the same order, we can set \code{test.genes} to the row names of the test dataset. +This will instruct \code{trainSingleR} to only consider markers that are present in the test dataset. +Any subsequent call to \code{classifySingleR} will also check that \code{test.genes} is consistent with \code{rownames(test)}. + +On a similar note, if \code{restrict} is specified, marker selection will only be performed using the specified subset of genes. +This can be convenient for ignoring inappropriate genes like pseudogenes or predicted genes. +It has the same effect as filtering out undesirable rows from \code{ref} prior to calling \code{trainSingleR}. +Unlike \code{test.genes}, setting \code{restrict} does not introduce further checks on \code{rownames(test)} in \code{classifySingleR}. } \section{Custom feature specification}{ diff --git a/src/Makevars b/src/Makevars deleted file mode 100644 index 4f42038..0000000 --- a/src/Makevars +++ /dev/null @@ -1 +0,0 @@ -PKG_CPPFLAGS=-I../inst/include/ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6b96b6d..f6bdc17 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -10,6 +10,35 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// classify_integrated +SEXP classify_integrated(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads); +RcppExport SEXP _SingleR_classify_integrated(SEXP testSEXP, SEXP resultsSEXP, SEXP integrated_buildSEXP, SEXP quantileSEXP, SEXP nthreadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); + Rcpp::traits::input_parameter< Rcpp::List >::type results(resultsSEXP); + Rcpp::traits::input_parameter< SEXP >::type integrated_build(integrated_buildSEXP); + Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); + Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); + rcpp_result_gen = Rcpp::wrap(classify_integrated(test, results, integrated_build, quantile, nthreads)); + return rcpp_result_gen; +END_RCPP +} +// classify_single +SEXP classify_single(Rcpp::RObject test, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads); +RcppExport SEXP _SingleR_classify_single(SEXP testSEXP, SEXP prebuiltSEXP, SEXP quantileSEXP, SEXP use_fine_tuneSEXP, SEXP fine_tune_thresholdSEXP, SEXP nthreadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); + Rcpp::traits::input_parameter< SEXP >::type prebuilt(prebuiltSEXP); + Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); + Rcpp::traits::input_parameter< bool >::type use_fine_tune(use_fine_tuneSEXP); + Rcpp::traits::input_parameter< double >::type fine_tune_threshold(fine_tune_thresholdSEXP); + Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); + rcpp_result_gen = Rcpp::wrap(classify_single(test, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)); + return rcpp_result_gen; +END_RCPP +} // find_classic_markers Rcpp::List find_classic_markers(int nlabels, int ngenes, Rcpp::List labels, Rcpp::List ref, int de_n, int nthreads); RcppExport SEXP _SingleR_find_classic_markers(SEXP nlabelsSEXP, SEXP ngenesSEXP, SEXP labelsSEXP, SEXP refSEXP, SEXP de_nSEXP, SEXP nthreadsSEXP) { @@ -38,9 +67,9 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// integrate_build -SEXP integrate_build(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads); -RcppExport SEXP _SingleR_integrate_build(SEXP test_featuresSEXP, SEXP referencesSEXP, SEXP ref_idsSEXP, SEXP labelsSEXP, SEXP prebuiltSEXP, SEXP nthreadsSEXP) { +// train_integrated +SEXP train_integrated(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads); +RcppExport SEXP _SingleR_train_integrated(SEXP test_featuresSEXP, SEXP referencesSEXP, SEXP ref_idsSEXP, SEXP labelsSEXP, SEXP prebuiltSEXP, SEXP nthreadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type test_features(test_featuresSEXP); @@ -49,45 +78,33 @@ BEGIN_RCPP Rcpp::traits::input_parameter< Rcpp::List >::type labels(labelsSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type prebuilt(prebuiltSEXP); Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(integrate_build(test_features, references, ref_ids, labels, prebuilt, nthreads)); + rcpp_result_gen = Rcpp::wrap(train_integrated(test_features, references, ref_ids, labels, prebuilt, nthreads)); return rcpp_result_gen; END_RCPP } -// integrate_run -SEXP integrate_run(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads); -RcppExport SEXP _SingleR_integrate_run(SEXP testSEXP, SEXP resultsSEXP, SEXP integrated_buildSEXP, SEXP quantileSEXP, SEXP nthreadsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); - Rcpp::traits::input_parameter< Rcpp::List >::type results(resultsSEXP); - Rcpp::traits::input_parameter< SEXP >::type integrated_build(integrated_buildSEXP); - Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); - Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(integrate_run(test, results, integrated_build, quantile, nthreads)); - return rcpp_result_gen; -END_RCPP -} -// prebuild -SEXP prebuild(Rcpp::RObject ref, Rcpp::IntegerVector labels, Rcpp::List markers, bool approximate, int nthreads); -RcppExport SEXP _SingleR_prebuild(SEXP refSEXP, SEXP labelsSEXP, SEXP markersSEXP, SEXP approximateSEXP, SEXP nthreadsSEXP) { +// train_single +SEXP train_single(Rcpp::IntegerVector test_features, Rcpp::RObject ref, Rcpp::IntegerVector ref_features, Rcpp::IntegerVector labels, Rcpp::List markers, Rcpp::RObject builder, int nthreads); +RcppExport SEXP _SingleR_train_single(SEXP test_featuresSEXP, SEXP refSEXP, SEXP ref_featuresSEXP, SEXP labelsSEXP, SEXP markersSEXP, SEXP builderSEXP, SEXP nthreadsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type test_features(test_featuresSEXP); Rcpp::traits::input_parameter< Rcpp::RObject >::type ref(refSEXP); + Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type ref_features(ref_featuresSEXP); Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type labels(labelsSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type markers(markersSEXP); - Rcpp::traits::input_parameter< bool >::type approximate(approximateSEXP); + Rcpp::traits::input_parameter< Rcpp::RObject >::type builder(builderSEXP); Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(prebuild(ref, labels, markers, approximate, nthreads)); + rcpp_result_gen = Rcpp::wrap(train_single(test_features, ref, ref_features, labels, markers, builder, nthreads)); return rcpp_result_gen; END_RCPP } -// get_subset -Rcpp::IntegerVector get_subset(SEXP built); -RcppExport SEXP _SingleR_get_subset(SEXP builtSEXP) { +// get_ref_subset +Rcpp::IntegerVector get_ref_subset(SEXP built); +RcppExport SEXP _SingleR_get_ref_subset(SEXP builtSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::traits::input_parameter< SEXP >::type built(builtSEXP); - rcpp_result_gen = Rcpp::wrap(get_subset(built)); + rcpp_result_gen = Rcpp::wrap(get_ref_subset(built)); return rcpp_result_gen; END_RCPP } @@ -101,32 +118,16 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// run -SEXP run(Rcpp::RObject test, Rcpp::IntegerVector subset, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads); -RcppExport SEXP _SingleR_run(SEXP testSEXP, SEXP subsetSEXP, SEXP prebuiltSEXP, SEXP quantileSEXP, SEXP use_fine_tuneSEXP, SEXP fine_tune_thresholdSEXP, SEXP nthreadsSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::traits::input_parameter< Rcpp::RObject >::type test(testSEXP); - Rcpp::traits::input_parameter< Rcpp::IntegerVector >::type subset(subsetSEXP); - Rcpp::traits::input_parameter< SEXP >::type prebuilt(prebuiltSEXP); - Rcpp::traits::input_parameter< double >::type quantile(quantileSEXP); - Rcpp::traits::input_parameter< bool >::type use_fine_tune(use_fine_tuneSEXP); - Rcpp::traits::input_parameter< double >::type fine_tune_threshold(fine_tune_thresholdSEXP); - Rcpp::traits::input_parameter< int >::type nthreads(nthreadsSEXP); - rcpp_result_gen = Rcpp::wrap(run(test, subset, prebuilt, quantile, use_fine_tune, fine_tune_threshold, nthreads)); - return rcpp_result_gen; -END_RCPP -} static const R_CallMethodDef CallEntries[] = { + {"_SingleR_classify_integrated", (DL_FUNC) &_SingleR_classify_integrated, 5}, + {"_SingleR_classify_single", (DL_FUNC) &_SingleR_classify_single, 6}, {"_SingleR_find_classic_markers", (DL_FUNC) &_SingleR_find_classic_markers, 6}, {"_SingleR_grouped_medians", (DL_FUNC) &_SingleR_grouped_medians, 4}, - {"_SingleR_integrate_build", (DL_FUNC) &_SingleR_integrate_build, 6}, - {"_SingleR_integrate_run", (DL_FUNC) &_SingleR_integrate_run, 5}, - {"_SingleR_prebuild", (DL_FUNC) &_SingleR_prebuild, 5}, - {"_SingleR_get_subset", (DL_FUNC) &_SingleR_get_subset, 1}, + {"_SingleR_train_integrated", (DL_FUNC) &_SingleR_train_integrated, 6}, + {"_SingleR_train_single", (DL_FUNC) &_SingleR_train_single, 7}, + {"_SingleR_get_ref_subset", (DL_FUNC) &_SingleR_get_ref_subset, 1}, {"_SingleR_is_valid_built", (DL_FUNC) &_SingleR_is_valid_built, 1}, - {"_SingleR_run", (DL_FUNC) &_SingleR_run, 7}, {NULL, NULL, 0} }; diff --git a/src/integrate_run.cpp b/src/classify_integrated.cpp similarity index 61% rename from src/integrate_run.cpp rename to src/classify_integrated.cpp index b787468..9fbc20d 100644 --- a/src/integrate_run.cpp +++ b/src/classify_integrated.cpp @@ -3,9 +3,9 @@ #include //[[Rcpp::export(rng=false)]] -SEXP integrate_run(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads) { +SEXP classify_integrated(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build, double quantile, int nthreads) { Rtatami::BoundNumericPointer curtest(test); - IntegratedXPtr iptr(integrated_build); + TrainedIntegratedPointer iptr(integrated_build); // Setting up the previous results. std::vector previous_results_vec; @@ -25,27 +25,25 @@ SEXP integrate_run(Rcpp::RObject test, Rcpp::List results, SEXP integrated_build Rcpp::IntegerVector best(ncells); Rcpp::NumericVector delta(ncells); + singlepp::ClassifyIntegratedBuffers buffers; + buffers.best = static_cast(best.begin()); + buffers.delta = static_cast(delta.begin()); + size_t nrefs = iptr->num_references(); Rcpp::NumericMatrix scores(ncells, nrefs); - std::vector scores_ptr(nrefs); if (nrefs) { - scores_ptr[0] = static_cast(scores.begin()); + buffers.scores.resize(nrefs); + buffers.scores[0] = static_cast(scores.begin()); for (size_t l = 1; l < nrefs; ++l) { - scores_ptr[l] = scores_ptr[l-1] + ncells; + buffers.scores[l] = buffers.scores[l-1] + ncells; } } // Running the integrated scoring. - singlepp::IntegratedScorer scorer; - scorer.set_num_threads(nthreads).set_quantile(quantile); - scorer.run( - curtest->ptr.get(), - previous_results, - *iptr, - static_cast(best.begin()), - scores_ptr, - static_cast(delta.begin()) - ); + singlepp::ClassifyIntegratedOptions opts; + opts.num_threads = nthreads; + opts.quantile = quantile; + singlepp::classify_integrated(*(curtest->ptr), previous_results, *iptr, buffers, opts); return Rcpp::List::create( Rcpp::Named("best") = best, diff --git a/src/classify_single.cpp b/src/classify_single.cpp new file mode 100644 index 0000000..56275c1 --- /dev/null +++ b/src/classify_single.cpp @@ -0,0 +1,44 @@ +#include "utils.h" // must be before raticate, singlepp includes. + +#include + +//' @importFrom Rcpp sourceCpp +//' @useDynLib SingleR +//[[Rcpp::export(rng=false)]] +SEXP classify_single(Rcpp::RObject test, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) { + Rtatami::BoundNumericPointer parsed(test); + TrainedSingleIntersectPointer built(prebuilt); + + // Setting up outputs. + size_t ncells = parsed->ptr->ncol(); + Rcpp::IntegerVector best(ncells); + Rcpp::NumericVector delta(ncells); + + singlepp::ClassifySingleBuffers buffers; + buffers.best = static_cast(best.begin()); + buffers.delta = static_cast(delta.begin()); + + size_t nlabels = built->num_labels(); + Rcpp::NumericMatrix scores(ncells, nlabels); + if (nlabels) { + buffers.scores.resize(nlabels); + buffers.scores[0] = static_cast(scores.begin()); + for (size_t l = 1; l < nlabels; ++l) { + buffers.scores[l] = buffers.scores[l-1] + ncells; + } + } + + // Running the analysis. + singlepp::ClassifySingleOptions opts; + opts.num_threads = nthreads; + opts.quantile = quantile; + opts.fine_tune = use_fine_tune; + opts.fine_tune_threshold = fine_tune_threshold; + singlepp::classify_single_intersect(*(parsed->ptr), *built, buffers, opts); + + return Rcpp::List::create( + Rcpp::Named("best") = best, + Rcpp::Named("scores") = scores, + Rcpp::Named("delta") = delta + ); +} diff --git a/src/find_classic_markers.cpp b/src/find_classic_markers.cpp index 87f8835..51e75f1 100644 --- a/src/find_classic_markers.cpp +++ b/src/find_classic_markers.cpp @@ -34,9 +34,10 @@ Rcpp::List find_classic_markers(int nlabels, int ngenes, Rcpp::List labels, Rcpp lab_ptrs.push_back(static_cast(lab_vec.back().begin())); } - singlepp::ChooseClassicMarkers mrk; - mrk.set_number(de_n).set_num_threads(nthreads); - auto store = mrk.run(ref_ptrs, lab_ptrs); + singlepp::ChooseClassicMarkersOptions opts; + opts.number = de_n; + opts.num_threads = nthreads; + auto store = singlepp::choose_classic_markers(ref_ptrs, lab_ptrs, opts); // Returning everything in R space. Rcpp::List output(nlabels); diff --git a/src/integrate_build.cpp b/src/integrate_build.cpp deleted file mode 100644 index 7f05490..0000000 --- a/src/integrate_build.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include "utils.h" // must be before all other includes. - -#include -#include - -//[[Rcpp::export(rng=false)]] -SEXP integrate_build(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads) { - singlepp::IntegratedBuilder builder; - builder.set_num_threads(nthreads); - - size_t nrefs = references.size(); - std::vector holding_labs; - holding_labs.reserve(nrefs); - - for (size_t r = 0; r < nrefs; ++r) { - Rcpp::RObject curref(references[r]); - auto parsed = Rtatami::BoundNumericPointer(curref); - - Rcpp::IntegerVector curids(ref_ids[r]); - holding_labs.emplace_back(labels[r]); - Rcpp::RObject built = prebuilt[r]; - PrebuiltXPtr curbuilt(built); - - builder.add( - test_features.size(), - static_cast(test_features.begin()), - parsed->ptr.get(), - static_cast(curids.begin()), - static_cast(holding_labs.back().begin()), - *curbuilt - ); - } - - auto finished = builder.finish(); - return IntegratedXPtr(new singlepp::IntegratedReferences(std::move(finished)), true); -} diff --git a/src/prebuild.cpp b/src/prebuild.cpp deleted file mode 100644 index 444d4a3..0000000 --- a/src/prebuild.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include "utils.h" - -#include -#include - -//' @importFrom Rcpp sourceCpp -//' @useDynLib SingleR -//[[Rcpp::export(rng=false)]] -SEXP prebuild(Rcpp::RObject ref, Rcpp::IntegerVector labels, Rcpp::List markers, bool approximate, int nthreads) { - singlepp::BasicBuilder builder; - builder.set_num_threads(nthreads); - - // Use all available markers; assume subsetting was applied on the R side. - builder.set_top(-1).set_approximate(approximate); - - // Setting up the markers. - singlepp::Markers markers2(markers.size()); - for (size_t m = 0; m < markers.size(); ++m) { - Rcpp::List curmarkers(markers[m]); - auto& curmarkers2 = markers2[m]; - curmarkers2.resize(curmarkers.size()); - - for (size_t n = 0; n < curmarkers.size(); ++n) { - Rcpp::IntegerVector seq(curmarkers[n]); - auto& seq2 = curmarkers2[n]; - seq2.insert(seq2.end(), seq.begin(), seq.end()); - } - } - - // Building the indices. - auto parsed = Rtatami::BoundNumericPointer(ref); - auto built = builder.run(parsed->ptr.get(), static_cast(labels.begin()), std::move(markers2)); - - // Moving it into the external pointer. - return PrebuiltXPtr(new singlepp::BasicBuilder::Prebuilt(std::move(built)), true); -} - -//[[Rcpp::export(rng=false)]] -Rcpp::IntegerVector get_subset(SEXP built) { - PrebuiltXPtr ptr(built); - return Rcpp::IntegerVector(ptr->subset.begin(), ptr->subset.end()); -} - -//[[Rcpp::export(rng=false)]] -Rcpp::LogicalVector is_valid_built(SEXP built) { - return Rf_ScalarLogical(!!R_ExternalPtrAddr(built)); -} diff --git a/src/run.cpp b/src/run.cpp deleted file mode 100644 index b490897..0000000 --- a/src/run.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "utils.h" // must be before raticate, singlepp includes. - -#include - -//' @importFrom Rcpp sourceCpp -//' @useDynLib SingleR -//[[Rcpp::export(rng=false)]] -SEXP run(Rcpp::RObject test, Rcpp::IntegerVector subset, SEXP prebuilt, double quantile, bool use_fine_tune, double fine_tune_threshold, int nthreads) { - auto parsed = Rtatami::BoundNumericPointer(test); - PrebuiltXPtr built(prebuilt); - - // Setting up outputs. - size_t ncells = parsed->ptr->ncol(); - Rcpp::IntegerVector best(ncells); - Rcpp::NumericVector delta(ncells); - - size_t nlabels = built->num_labels(); - Rcpp::NumericMatrix scores(ncells, nlabels); - std::vector scores_ptr(nlabels); - if (nlabels) { - scores_ptr[0] = static_cast(scores.begin()); - for (size_t l = 1; l < nlabels; ++l) { - scores_ptr[l] = scores_ptr[l-1] + ncells; - } - } - - // Running the analysis. - singlepp::BasicScorer runner; - runner.set_num_threads(nthreads); - runner.set_quantile(quantile).set_fine_tune(use_fine_tune).set_fine_tune_threshold(fine_tune_threshold); - runner.run( - parsed->ptr.get(), - *built, - static_cast(subset.begin()), - static_cast(best.begin()), - scores_ptr, - static_cast(delta.begin()) - ); - - return Rcpp::List::create( - Rcpp::Named("best") = best, - Rcpp::Named("scores") = scores, - Rcpp::Named("delta") = delta - ); -} diff --git a/src/train_integrated.cpp b/src/train_integrated.cpp new file mode 100644 index 0000000..0705b33 --- /dev/null +++ b/src/train_integrated.cpp @@ -0,0 +1,39 @@ +#include "utils.h" // must be before all other includes. + +#include +#include + +//[[Rcpp::export(rng=false)]] +SEXP train_integrated(Rcpp::IntegerVector test_features, Rcpp::List references, Rcpp::List ref_ids, Rcpp::List labels, Rcpp::List prebuilt, int nthreads) { + size_t nrefs = references.size(); + + std::vector > inputs; + inputs.reserve(nrefs); + std::vector holding_labs; + holding_labs.reserve(nrefs); + + for (size_t r = 0; r < nrefs; ++r) { + Rcpp::RObject curref(references[r]); + Rtatami::BoundNumericPointer parsed(curref); + + Rcpp::IntegerVector curids(ref_ids[r]); + holding_labs.emplace_back(labels[r]); + Rcpp::RObject built = prebuilt[r]; + TrainedSingleIntersectPointer curbuilt(built); + + inputs.push_back(singlepp::prepare_integrated_input_intersect( + static_cast(test_features.size()), + static_cast(test_features.begin()), + *(parsed->ptr), + static_cast(curids.begin()), + static_cast(holding_labs.back().begin()), + *curbuilt + )); + } + + singlepp::TrainIntegratedOptions opts; + opts.num_threads = nthreads; + auto finished = singlepp::train_integrated(std::move(inputs), opts); + + return TrainedIntegratedPointer(new TrainedIntegrated(std::move(finished)), true); +} diff --git a/src/train_single.cpp b/src/train_single.cpp new file mode 100644 index 0000000..77aa41a --- /dev/null +++ b/src/train_single.cpp @@ -0,0 +1,57 @@ +#include "utils.h" +#include "BiocNeighbors.h" + +#include +#include + +//' @importFrom Rcpp sourceCpp +//' @useDynLib SingleR +//[[Rcpp::export(rng=false)]] +SEXP train_single(Rcpp::IntegerVector test_features, Rcpp::RObject ref, Rcpp::IntegerVector ref_features, Rcpp::IntegerVector labels, Rcpp::List markers, Rcpp::RObject builder, int nthreads) { + singlepp::TrainSingleOptions opts; + opts.num_threads = nthreads; + opts.top = -1; // Use all available markers; assume subsetting was applied on the R side. + + BiocNeighbors::BuilderPointer bptr(builder); + opts.trainer = std::shared_ptr(std::shared_ptr{}, bptr.get()); // make a no-op shared pointer. + + // Setting up the markers. We assume that these are already 0-indexed on the R side. + singlepp::Markers markers2(markers.size()); + for (size_t m = 0; m < markers.size(); ++m) { + Rcpp::List curmarkers(markers[m]); + auto& curmarkers2 = markers2[m]; + curmarkers2.resize(curmarkers.size()); + + for (size_t n = 0; n < curmarkers.size(); ++n) { + Rcpp::IntegerVector seq(curmarkers[n]); + auto& seq2 = curmarkers2[n]; + seq2.insert(seq2.end(), seq.begin(), seq.end()); + } + } + + // Building the indices. + Rtatami::BoundNumericPointer parsed(ref); + auto built = singlepp::train_single_intersect( + static_cast(test_features.size()), + static_cast(test_features.begin()), + *(parsed->ptr), + static_cast(ref_features.begin()), + static_cast(labels.begin()), + std::move(markers2), + opts + ); + + return TrainedSingleIntersectPointer(new TrainedSingleIntersect(std::move(built)), true); +} + +//[[Rcpp::export(rng=false)]] +Rcpp::IntegerVector get_ref_subset(SEXP built) { + TrainedSingleIntersectPointer ptr(built); + const auto& rsub = ptr->get_ref_subset(); + return Rcpp::IntegerVector(rsub.begin(), rsub.end()); +} + +//[[Rcpp::export(rng=false)]] +Rcpp::LogicalVector is_valid_built(SEXP built) { + return Rf_ScalarLogical(!!R_ExternalPtrAddr(built)); +} diff --git a/src/utils.h b/src/utils.h index 4cd1173..3e9f97e 100644 --- a/src/utils.h +++ b/src/utils.h @@ -2,16 +2,15 @@ #define UTILS_H #include "Rcpp.h" -#include "Rtatami.h" +#include "Rtatami.h" // before singlepp includes to ensure the tatami_r::parallelize() override is set. +#include "singlepp/singlepp.hpp" -// must be before singlepp includes. -#define SINGLEPP_CUSTOM_PARALLEL tatami_r::parallelize -#define __ERROR_PRINTER_OVERRIDE__ REprintf // avoid R CMD check warnings about stderr in Annoy. +typedef singlepp::TrainedSingleIntersect TrainedSingleIntersect; -#include "singlepp/singlepp.hpp" +typedef Rcpp::XPtr TrainedSingleIntersectPointer; -typedef Rcpp::XPtr PrebuiltXPtr; +typedef singlepp::TrainedIntegrated TrainedIntegrated; -typedef Rcpp::XPtr IntegratedXPtr; +typedef Rcpp::XPtr TrainedIntegratedPointer; #endif diff --git a/tests/testthat/test-SingleR.R b/tests/testthat/test-SingleR.R index 5b0b06a..37f6e43 100644 --- a/tests/testthat/test-SingleR.R +++ b/tests/testthat/test-SingleR.R @@ -70,10 +70,11 @@ test_that("SingleR handles DelayedArray inputs", { }) test_that("SingleR works with multiple references", { - # Handles mismatching row names. - chosen0 <- sample(rownames(training), 900) - chosen1 <- sample(rownames(training), 900) - chosen2 <- sample(rownames(training), 900) + # Handles mismatching row names. Note that the sorting is necessary + # to ensure that tied genes are handled in a consistent way. + chosen0 <- sort(sample(rownames(training), 900)) + chosen1 <- sort(sample(rownames(training), 900)) + chosen2 <- sort(sample(rownames(training), 900)) # Works with recomputation. out <- SingleR(test[chosen0,], list(training[chosen1,], training[chosen2,]), diff --git a/tests/testthat/test-classify.R b/tests/testthat/test-classify.R index 4ad16e0..2537ef5 100644 --- a/tests/testthat/test-classify.R +++ b/tests/testthat/test-classify.R @@ -79,18 +79,6 @@ test_that("classifySingleR behaves with no-variance cells", { expect_identical(out$labels[-(1:10)], ref$labels[-(1:10)]) }) -test_that("classifySingleR behaves with missing values", { - # Can't just set the first entry to NA, as we need to ensure - # that the test set contains a superset of genes in the training set. - sce <- BiocGenerics::rbind(test[1,], test) - logcounts(sce)[1,1] <- NA - - Q <- 0.8 - out <- classifySingleR(sce, trained, fine.tune=FALSE, quantile=Q) - ref <- classifySingleR(test, trained, fine.tune=FALSE, quantile=Q) - expect_identical(out, ref) -}) - test_that("classifySingleR works with multiple references", { training1 <- training2 <- training training1 <- training1[sample(nrow(training1)),] @@ -98,15 +86,23 @@ test_that("classifySingleR works with multiple references", { mtrain <- trainSingleR(list(training1, training2), list(training1$label, training2$label)) out <- classifySingleR(test, mtrain) + expect_identical(names(out$orig.results), c("ref1", "ref2")) + expect_true(all(out$reference %in% 1:2)) ref1 <- classifySingleR(test, mtrain[[1]]) ref2 <- classifySingleR(test, mtrain[[2]]) expect_identical(out, combineRecomputedResults(list(ref1, ref2), test, mtrain)) + + # Preserves names of the references themselves. + mtrain <- trainSingleR(list(foo=training1, bar=training2), list(training1$label, training2$label)) + out <- classifySingleR(test, mtrain) + expect_identical(names(out$orig.results), c("foo", "bar")) + expect_true(all(out$reference %in% 1:2)) }) test_that("classifySingleR behaves with silly inputs", { out <- classifySingleR(test[,0], trained, fine.tune=FALSE) expect_identical(nrow(out$scores), 0L) expect_identical(length(out$labels), 0L) - expect_error(classifySingleR(test[0,], trained, fine.tune=FALSE), "does not contain") + expect_error(classifySingleR(test[0,], trained, fine.tune=FALSE), "expected 'rownames(test)' to be the same", fixed=TRUE) }) diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R index 4c75c3f..be75d4c 100644 --- a/tests/testthat/test-recomputed.R +++ b/tests/testthat/test-recomputed.R @@ -19,11 +19,11 @@ test <- .mockTestData(ref) test <- scuttle::logNormCounts(test) ref1 <- scuttle::logNormCounts(ref1) -train1 <- trainSingleR(ref1, labels=ref1$label) +train1 <- trainSingleR(ref1, labels=ref1$label, test.genes=rownames(test)) pred1 <- classifySingleR(test, train1) ref2 <- scuttle::logNormCounts(ref2) -train2 <- trainSingleR(ref2, labels=ref2$label) +train2 <- trainSingleR(ref2, labels=ref2$label, test.genes=rownames(test)) pred2 <- classifySingleR(test, train2) test_that("combineRecomputedResults works as expected (light check)", { @@ -100,47 +100,28 @@ test_that("combineRecomputedResults handles mismatches to rows and cells", { trained=list(train1, train2)), "not identical") colnames(test) <- NULL - # Correctly reorders the gene universes. - ref <- combineRecomputedResults( - results=list(pred1, pred2), - test=test, - trained=list(train1, train2)) - + # Responds to mismatches in the genes. s <- sample(nrow(test)) - out <- combineRecomputedResults( + expect_error(combineRecomputedResults( results=list(pred1, pred2), test=test[s,], - trained=list(train1, train2)) - expect_equal(ref, out) + trained=list(train1, train2)), "test.genes") }) test_that("combineRecomputedResults emits warnings when missing genes are present", { + half <- nrow(test) / 2 + # Spiking in some missing genes. - ref1b <- ref1[c(1, seq_len(nrow(ref1))),] - rownames(ref1b)[1] <- "BLAH" - markers1 <- train1$markers$full - markers1$A$B <- c(markers1$A$B, "BLAH") - train1b <- trainSingleR(ref1b, labels=ref1$label, genes=markers1) - - ref2b <- ref2[c(1, seq_len(nrow(ref2))),] - rownames(ref2b)[1] <- "WHEE" - markers2 <- train2$markers$full - markers2$A$B <- c(markers2$a$b, "WHEE") - train2b <- trainSingleR(ref2b, labels=ref2$label, genes=markers2) - - expect_error(out <- combineRecomputedResults( - results=list(pred1, pred2), - test=test, - trained=list(train1b, train2b)), "should be present") + ref1b <- ref1[seq_len(half),,drop=FALSE] + train1b <- trainSingleR(ref1b, labels=ref1$label, test.genes=rownames(test)) - test2 <- test[c(1,seq_len(nrow(test)),1),] - rownames(test2)[1] <- "WHEE" - rownames(test2)[length(rownames(test2))] <- "BLAH" + ref2b <- ref2[half + seq_len(half),] + train2b <- trainSingleR(ref2b, labels=ref2$label, test.genes=rownames(test)) expect_warning(out <- combineRecomputedResults( results=list(pred1, pred2), - test=test2, - trained=list(train1b, train2b)), "differ in the universe") + test=test, + trained=list(train1b, train2b)), "available in each reference") }) test_that("combineRecomputedResults is invariant to ordering", { diff --git a/tests/testthat/test-train.R b/tests/testthat/test-train.R index 03ead0f..f216fdd 100644 --- a/tests/testthat/test-train.R +++ b/tests/testthat/test-train.R @@ -128,7 +128,8 @@ test_that("trainSingleR strips out NAs", { out <- trainSingleR(sce, sce$label) ref <- trainSingleR(sce[-1,], sce$label) - expect_identical(out$ref, ref$ref) + expect_identical(as.matrix(out$ref), ref$ref) + expect_identical(out$markers, ref$markers) }) test_that("trainSingleR behaves with multiple references, plus recomputation", { @@ -136,16 +137,17 @@ test_that("trainSingleR behaves with multiple references, plus recomputation", { training1 <- training1[sample(nrow(training1)),] rownames(training1) <- rownames(training) - set.seed(1000) ref1 <- trainSingleR(training1, training1$label) ref2 <- trainSingleR(training2, training2$label) - - set.seed(1000) - out <- trainSingleR(list(training1, training2), list(training1$label, training2$label), recompute=TRUE) + out <- trainSingleR(list(training1, training2), list(training1$label, training2$label)) except.built <- setdiff(names(ref1), "built") expect_identical(ref1[except.built], out[[1]][except.built]) expect_identical(ref2[except.built], out[[2]][except.built]) + + # Same result with names. + out <- trainSingleR(list(foo=training1, bar=training2), list(training1$label, training2$label)) + expect_identical(names(out), c("foo", "bar")) }) test_that("trainSingleR behaves with aggregation turned on", {