SydneyBioX · DarioS · Aug 2, 2024 · Aug 2, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,7 +11,7 @@ LazyData: false
 Depends: R (>= 4.0)
 Imports:
     S4Vectors,
-    limma,
+    scrapper,
     ggraph,
     igraph,
     methods,
@@ -30,7 +30,7 @@ Imports:
     graphics,
     statmod,
     Cepo
-RoxygenNote: 7.1.1
+RoxygenNote: 7.3.2
 Suggests: 
     knitr,
     rmarkdown,

diff --git a/NAMESPACE b/NAMESPACE
@@ -20,7 +20,6 @@ exportClasses(scClassifyTrainModel)
 exportClasses(scClassifyTrainModelList)
 import(ggplot2)
 import(igraph)
-import(limma)
 import(statmod)
 importClassesFrom(S4Vectors,DataFrame)
 importClassesFrom(S4Vectors,SimpleList)
@@ -45,8 +44,6 @@ importFrom(hopach,hdist)
 importFrom(hopach,improveordering)
 importFrom(hopach,is.hdist)
 importFrom(hopach,vectmatrix)
-importFrom(limma,eBayes)
-importFrom(limma,lmFit)
 importFrom(methods,as)
 importFrom(methods,is)
 importFrom(methods,new)
@@ -57,6 +54,7 @@ importFrom(mixtools,normalmixEM)
 importFrom(proxy,as.dist)
 importFrom(proxy,dist)
 importFrom(proxyC,simil)
+importFrom(scrapper,scoreMarkers)
 importFrom(stats,coef)
 importFrom(stats,cutree)
 importFrom(stats,dnorm)

diff --git a/R/featureSelection.R b/R/featureSelection.R
@@ -1,17 +1,17 @@
 #' @importFrom methods new
 #' @importFrom Cepo Cepo topGenes
-#' @import limma
-
+#' @importFrom scrapper scoreMarkers
 
 featureSelection <- function(exprsMat,
                              trainClass,
-                             feature = c("limma", "DV", "DD", "chisq", "BI",
+                             sampleID = NULL,
+                             feature = c("DM", "DV", "DD", "chisq", "BI",
                                          "Cepo"),
                              topN = 50,
                              pSig = 0.001
 ){
 
-    feature <- match.arg(feature, c("limma", "DV", "DD", "chisq", "BI", "Cepo"),
+    feature <- match.arg(feature, c("DM", "DV", "DD", "chisq", "BI", "Cepo"),
                          several.ok = FALSE)
 
 
@@ -40,69 +40,19 @@ featureSelection <- function(exprsMat,
         tt <- Cepo::Cepo(as.matrix(exprsMat), trainClass, exprsPct = 0.05)
         res <- Reduce(union, Cepo::topGenes(tt, n = topN))
     } else{
-        tt <- doLimma(exprsMat, trainClass)
-        res <- Reduce(union, lapply(tt, function(t)
-            rownames(t[t$logFC > 0 & (t$meanPct.2 - t$meanPct.1) > 0.05 &
-                           t$adj.P.Val < pSig,])[seq_len(topN)]))
+        effectSizes <- scrapper::scoreMarkers(exprsMat, trainClass, sampleID, block.weight.policy = "equal")
+        res <- Reduce(union, mapply(function(typeD, propDetect)
+        {
+          bestNames <- rownames(exprsMat)[order(typeD$mean, decreasing = TRUE)]
+          bestNames <- bestNames[propDetect$mean > 0.05]
+          bestNames[seq_len(topN)]
+        }, effectSizes[["cohens.d"]], effectSizes[["delta.detected"]], SIMPLIFY = FALSE))
 
     }
 
     return(res)
 }
 
-
-#' @importFrom limma eBayes lmFit
-#' @importFrom methods new
-
-doLimma <- function(exprsMat, cellTypes, exprs_pct = 0.05){
-
-    cellTypes <- droplevels(as.factor(cellTypes))
-    tt <- list()
-    for (i in seq_len(nlevels(cellTypes))) {
-        tmp_celltype <- (ifelse(cellTypes == levels(cellTypes)[i], 1, 0))
-        design <- stats::model.matrix(~tmp_celltype)
-
-
-        meanExprs <- do.call(cbind, lapply(c(0,1), function(i){
-            Matrix::rowMeans(exprsMat[, tmp_celltype == i, drop = FALSE])
-        }))
-
-        meanPct <- do.call(cbind, lapply(c(0,1), function(i){
-            Matrix::rowSums(exprsMat[, tmp_celltype == i,
-                                     drop = FALSE] > 0)/sum(tmp_celltype == i)
-        }))
-
-        keep <- meanPct[,2] > exprs_pct
-
-        y <- methods::new("EList")
-        y$E <- exprsMat[keep, ]
-        fit <- limma::lmFit(y, design = design)
-        fit <- limma::eBayes(fit, trend = TRUE, robust = TRUE)
-        tt[[i]] <- limma::topTable(fit, n = Inf, adjust.method = "BH", coef = 2)
-
-
-
-        if (!is.null(tt[[i]]$ID)) {
-            tt[[i]] <- tt[[i]][!duplicated(tt[[i]]$ID),]
-            rownames(tt[[i]]) <- tt[[i]]$ID
-        }
-
-        tt[[i]]$meanExprs.1 <- meanExprs[rownames(tt[[i]]), 1]
-        tt[[i]]$meanExprs.2 <- meanExprs[rownames(tt[[i]]), 2]
-        tt[[i]]$meanPct.1 <- meanPct[rownames(tt[[i]]), 1]
-        tt[[i]]$meanPct.2 <- meanPct[rownames(tt[[i]]), 2]
-    }
-
-
-
-    return(tt)
-
-
-}
-
-
-
-
 doDV <- function(exprsMat, cellTypes){
 
 

diff --git a/R/predict_scClassify.R b/R/predict_scClassify.R
@@ -14,8 +14,8 @@
 #' @param cor_threshold_high A numeric indicates the highest
 #' correlation threshold
 #' @param features A vector indicates the gene selection method,
-#' set as "limma" by default.
-#' This should be one or more of "limma", "DV", "DD", "chisq", "BI".
+#' set as "DM" (Difference in Means) by default.
+#' This should be one or more of "DM", "DV", "DD", "chisq", "BI".
 #' @param algorithm A vector indicates the KNN method that are used,
 #' set as "WKNN" by default.
 #' This should be one or more of "WKNN", "KNN", "DWKNN".
@@ -46,7 +46,7 @@
 #' trainRes = trainClassExample_xin,
 #' cellTypes_test = wang_cellTypes,
 #' algorithm = "WKNN",
-#' features = c("limma"),
+#' features = "DM",
 #' similarity = c("pearson"),
 #' prob_threshold = 0.7,
 #' verbose = TRUE)
@@ -67,7 +67,7 @@ predict_scClassify <- function(exprsMat_test,
                                prob_threshold = 0.7,
                                cor_threshold_static = 0.5,
                                cor_threshold_high = 0.7,
-                               features = "limma",
+                               features = "DM",
                                algorithm = "WKNN",
                                similarity = "pearson",
                                cutoff_method = c("dynamic", "static"),
@@ -238,8 +238,8 @@ predict_scClassify <- function(exprsMat_test,
 #' @param prob_threshold A numeric indicates the probability threshold for KNN/WKNN/DWKNN.
 #' @param cor_threshold_static A numeric indicates the static correlation threshold.
 #' @param cor_threshold_high A numeric indicates the highest correlation threshold
-#' @param features A vector indicates the method to select features, set as "limma" by default.
-#' This should be one or more of "limma", "DV", "DD", "chisq", "BI".
+#' @param features A vector indicates the method to select features, set as "DM" (Difference in Means) by default.
+#' This should be one or more of "DM", "DV", "DD", "chisq", "BI".
 #' @param algorithm A vector indicates the KNN method that are used, set as "WKNN" by default.
 #' This should be one or more of "WKNN", "KNN", "DWKNN".
 #' @param similarity A vector indicates the similarity measure that are used,
@@ -269,7 +269,7 @@ predict_scClassify <- function(exprsMat_test,
 #' trainRes = trainClassExampleJoint,
 #' cellTypes_test = wang_cellTypes,
 #' algorithm = "WKNN",
-#' features = c("limma"),
+#' features = "DM",
 #' similarity = c("pearson"),
 #' prob_threshold = 0.7,
 #' verbose = FALSE)
@@ -287,7 +287,7 @@ predict_scClassifyJoint <- function(exprsMat_test,
                                     prob_threshold = 0.7,
                                     cor_threshold_static = 0.5,
                                     cor_threshold_high = 0.7,
-                                    features = "limma",
+                                    features = "DM",
                                     algorithm = "WKNN",
                                     similarity = "pearson",
                                     cutoff_method = c("dynamic", "static"),
@@ -363,7 +363,7 @@ predict_scClassifySingle <- function(exprsMat_test,
                                      prob_threshold = 0.7,
                                      cor_threshold_static = 0.5,
                                      cor_threshold_high = 0.7,
-                                     features = "limma",
+                                     features = "DM",
                                      algorithm = c("WKNN", "KNN", "DWKNN"),
                                      similarity = c("pearson",  "spearman",
                                                     "cosine", "jaccard",

diff --git a/R/sampleSizeCal.R b/R/sampleSizeCal.R
@@ -170,7 +170,7 @@ runSubSampling <- function(exprsMat,
                                test = cellTypes_test),
                            ...)
 
-    acc_cls <- trainRes$testRes$test$pearson_WKNN_limma$classifyRes
+    acc_cls <- trainRes$testRes$test$pearson_WKNN_DM$classifyRes
     return(acc_cls)
 }
 

diff --git a/R/scClassify.R b/R/scClassify.R
@@ -2,12 +2,13 @@
 #'
 #' @param exprsMat_train A matrix of log-transformed expression matrix of reference dataset
 #' @param cellTypes_train A vector of cell types of reference dataset
+#' @param sampleID_train Default: \code{NULL}. A vector of sample IDs, if the cells come from more than one individual.
 #' @param exprsMat_test A list or a matrix indicates the expression matrices of the query datasets
 #' @param cellTypes_test A list or a vector indicates cell types of the query datasets (Optional).
 #' @param tree A vector indicates the method to build hierarchical tree, set as "HOPACH" by default.
 #' This should be one of "HOPACH" and "HC" (using hclust).
-#' @param selectFeatures A vector indicates the gene selection method, set as "limma" by default.
-#' This should be one or more of "limma", "DV", "DD", "chisq", "BI" and "Cepo".
+#' @param selectFeatures A vector indicates the gene selection method, set as "DM" (Difference in Means) by default.
+#' This should be one or more of "DM", "DV", "DD", "chisq", "BI" and "Cepo".
 #' @param algorithm A vector indicates the KNN method that are used, set as
 #' "WKNN" by default. Thisshould be one or more of "WKNN", "KNN", "DWKNN".
 #' @param similarity A vector indicates the similarity measure that are used,
@@ -56,7 +57,7 @@
 #' cellTypes_test = list(wang = wang_cellTypes),
 #' tree = "HOPACH",
 #' algorithm = "WKNN",
-#' selectFeatures = c("limma"),
+#' selectFeatures = "DM",
 #' similarity = c("pearson"),
 #' returnList = FALSE,
 #' verbose = FALSE)
@@ -69,11 +70,12 @@
 
 scClassify <- function(exprsMat_train = NULL,
                        cellTypes_train = NULL,
+                       sampleID_train = NULL,
                        exprsMat_test = NULL,
                        cellTypes_test = NULL,
                        tree = "HOPACH",
                        algorithm = "WKNN",
-                       selectFeatures = "limma",
+                       selectFeatures = "DM",
                        similarity = "pearson",
                        cutoff_method = c("dynamic", "static"),
                        weighted_ensemble = FALSE,
@@ -147,7 +149,7 @@ scClassify <- function(exprsMat_train = NULL,
 
     tree <- match.arg(tree, c("HOPACH", "HC"), several.ok = FALSE)
     selectFeatures <- match.arg(selectFeatures,
-                                c("limma", "DV", "DD", "chisq", "BI", "Cepo"),
+                                c("DM", "DV", "DD", "chisq", "BI", "Cepo"),
                                 several.ok = TRUE)
 
     algorithm <- match.arg(algorithm,
@@ -231,6 +233,7 @@ scClassify <- function(exprsMat_train = NULL,
     ### train_scClassify
     trainRes <- train_scClassify(exprsMat_train,
                                  cellTypes_train,
+                                 sampleID_train,
                                  tree = tree,
                                  selectFeatures = selectFeatures,
                                  topN = topN,

diff --git a/R/train_scClassify.R b/R/train_scClassify.R
@@ -2,12 +2,13 @@
 #'
 #' @param exprsMat_train A matrix of log-transformed expression matrix of reference dataset
 #' @param cellTypes_train A vector of cell types of reference dataset
+#' @param sampleID_train Default: \code{NULL}. A vector of sample IDs, if the cells come from more than one individual.
 #' @param tree A vector indicates the method to build hierarchical tree,
 #' set as "HOPACH" by default.
 #' This should be one of "HOPACH" and "HC" (using stats::hclust).
 #' @param selectFeatures A vector indicates the gene selection method,
-#' set as "limma" by default.
-#' This should be one or more of "limma", "DV", "DD", "chisq", "BI", "Cepo".
+#' set as "DM" (Difference in Means) by default.
+#' This should be one or more of "DM", "DV", "DD", "chisq", "BI", "Cepo".
 #' @param topN An integer indicates the top number of features that are selected
 #' @param hopach_kmax An integer between 1 and 9 specifying the maximum number of
 #' children at each node in the HOPACH tree.
@@ -32,7 +33,7 @@
 #' exprsMat_xin_subset <- scClassify_example$exprsMat_xin_subset
 #' trainClass <- train_scClassify(exprsMat_train = exprsMat_xin_subset,
 #' cellTypes_train = xin_cellTypes,
-#' selectFeatures = c("limma", "BI"),
+#' selectFeatures = c("DM", "BI"),
 #' returnList = FALSE
 #' )
 #'
@@ -46,8 +47,9 @@
 
 train_scClassify <- function(exprsMat_train,
                              cellTypes_train,
+                             sampleID_train = NULL,
                              tree = "HOPACH",
-                             selectFeatures = "limma",
+                             selectFeatures = "DM",
                              topN = 50,
                              hopach_kmax = 5,
                              pSig = 0.05,
@@ -69,7 +71,7 @@ train_scClassify <- function(exprsMat_train,
 
   # Matching the argument of feature selection method
   selectFeatures <- match.arg(selectFeatures,
-                              c("limma", "DV", "DD", "chisq", "BI", "Cepo"),
+                              c("DM", "DV", "DD", "chisq", "BI", "Cepo"),
                               several.ok = TRUE)
 
 
@@ -133,6 +135,7 @@ train_scClassify <- function(exprsMat_train,
     for (train_list_idx in seq_len(length(exprsMat_train))) {
       trainRes[[train_list_idx]] <- train_scClassifySingle(exprsMat_train[[train_list_idx]],
                                                            cellTypes_train[[train_list_idx]],
+                                                           sampleID_train[[train_list_idx]],
                                                            tree = tree,
                                                            selectFeatures = selectFeatures,
                                                            topN = topN,
@@ -149,6 +152,7 @@ train_scClassify <- function(exprsMat_train,
 
     trainRes <- train_scClassifySingle(exprsMat_train,
                                        cellTypes_train,
+                                       sampleID_train,
                                        tree = tree,
                                        selectFeatures = selectFeatures,
                                        topN = topN,
@@ -211,8 +215,9 @@ train_scClassify <- function(exprsMat_train,
 
 train_scClassifySingle <- function(exprsMat_train,
                                    cellTypes_train,
+                                   sampleID_train = NULL,
                                    tree = "HOPACH",
-                                   selectFeatures = "limma",
+                                   selectFeatures = "DM",
                                    topN = 50,
                                    hopach_kmax = 5,
                                    pSig = 0.05,
@@ -247,7 +252,7 @@ train_scClassifySingle <- function(exprsMat_train,
 
   # Matching the argument of feature selection method
   selectFeatures <- match.arg(selectFeatures,
-                              c("limma", "DV", "DD", "chisq", "BI", "Cepo"),
+                              c("DM", "DV", "DD", "chisq", "BI", "Cepo"),
                               several.ok = TRUE)
 
 
@@ -257,10 +262,12 @@ train_scClassifySingle <- function(exprsMat_train,
 
 
   # Select the features to construct tree
-  tt <- doLimma(exprsMat_train, cellTypes_train)
-  de <- Reduce(union, lapply(tt, function(t)
-    rownames(t)[seq_len(max(min(50, sum(t$adj.P.Val < 0.001)), 30))]))
-  de <- na.omit(de)
+  effectSizes <- scrapper::scoreMarkers(exprsMat_train, cellTypes_train, sampleID_train, block.weight.policy = "equal")
+  de <- Reduce(union, mapply(function(typeD, propDetect)
+  {
+    bestNames <- rownames(exprsMat_train)[order(typeD$mean, decreasing = TRUE)]
+    bestNames[max(30, nrow(exprsMat_train))]
+  }, effectSizes[["cohens.d"]], effectSizes[["delta.detected"]], SIMPLIFY = FALSE))
   if (verbose) {
     print(paste("Number of genes selected to construct HOPACH tree",
                 length(de)))
@@ -420,11 +427,11 @@ currentClass <- function(cellTypes, cutree_res){
 hierarchyKNNcor <- function(exprsMat,
                             cellTypes,
                             cutree_list,
-                            feature = c("limma", "DV", "DD", "chisq", "BI"),
+                            feature = c("DM", "DV", "DD", "chisq", "BI"),
                             topN = 50,
                             pSig = 0.001,
                             verbose= TRUE){
-  feature <- match.arg(feature, c("limma", "DV", "DD", "chisq", "BI", "Cepo"))
+  feature <- match.arg(feature, c("DM", "DV", "DD", "chisq", "BI", "Cepo"))
   numHierchy <- length(cutree_list)
   levelModel <- list()
   levelHVG <- list()