diff --git a/.github/workflows/test_action.yml b/.github/workflows/test_action.yml index 0eae6c3..85c3a7e 100644 --- a/.github/workflows/test_action.yml +++ b/.github/workflows/test_action.yml @@ -15,6 +15,11 @@ jobs: channels: conda-forge, bioconda - name: Install Nextflow uses: nf-core/setup-nextflow@v1 + - name: Set up Python to install gdown + uses: actions/setup-python@v5 + with: + python-version: "3.9" + cache: "pip" - name: Download test dataset run: bash ${GITHUB_WORKSPACE}/test_data/download_data.sh - name: Run pipeline with test data diff --git a/.gitignore b/.gitignore index b2b6327..a6a5c48 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,7 @@ result*/ testing/ testing* *.pyc -docs/build/ \ No newline at end of file +docs/build/ +test_data/ +res_gx12/ +res_gx38/ \ No newline at end of file diff --git a/README.md b/README.md index dbb0c56..e6f6c45 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -![Caption](docs/source/_static/images/pipeline_v2.png) +![Caption](docs/source/_static/images/pipeline.png) Read the [documentation](https://hadge.readthedocs.io/en/latest/) and the associated [manuscript](https://www.biorxiv.org/content/10.1101/2023.07.23.550061v1) diff --git a/bin/HTODemux-visualisation.R b/bin/HTODemux-visualisation.R index c6b3cc5..b05d5d1 100755 --- a/bin/HTODemux-visualisation.R +++ b/bin/HTODemux-visualisation.R @@ -1,40 +1,40 @@ #!/usr/bin/env Rscript -#Libraries +# Libraries library(Seurat) library(argparse) library(ggplot2) # Create a parser parser <- ArgumentParser("Parameters for HTODemux Visualisation") -parser$add_argument("--hashtagPath",help="folder where rds object was saved from the first part of HTODemux") -parser$add_argument("--assay",help="Name of the Hashtag assay HTO by default", default = "HTO") -#Output graphs - Ridge Plot +parser$add_argument("--hashtagPath", help = "folder where rds object was saved from the first part of HTODemux") +parser$add_argument("--assay", help = "Name of the Hashtag assay HTO by default", default = "HTO") +# Output graphs - Ridge Plot parser$add_argument("--ridgePlot", help = "Generates a ridge plot from the results, True to generate", default = "True") parser$add_argument("--ridgeNCol", help = "Number of columns for ridgePlot", default = 3, type = "integer") -#Output graphs - Scatter Feature -parser$add_argument("--featureScatter",help = "Generates a ridge plot from the results, True to generate", default = "True") +# Output graphs - Scatter Feature +parser$add_argument("--featureScatter", help = "Generates a ridge plot from the results, True to generate", default = "True") parser$add_argument("--scatterFeat1", help = "Feature 1 for Feature Scatter Plot", default = "hto_HTO-A") parser$add_argument("--scatterFeat2", help = "Feature 2 for Feature Scatter Plot", default = "hto_HTO-B") -#Output graphs - Violin Plot +# Output graphs - Violin Plot parser$add_argument("--vlnPlot", help = "Generates a violin plot from the results, True to generate", default = "True") parser$add_argument("--vlnFeatures", help = "Features to plot (gene expression, metrics, PC scores, anything that can be retreived by FetchData)", default = "nCount_RNA") parser$add_argument("--vlnLog", help = "plot the feature axis on log scale", action = "store_true") -#Output graphs - tSNE +# Output graphs - tSNE parser$add_argument("--tSNE", help = "Generate a two dimensional tSNE embedding for HTOs", default = "True") parser$add_argument("--tSNEIdents", help = "What should we remove from the object (we have Singlet,Doublet and Negative)", default = "Negative") parser$add_argument("--tSNEInvert", action = "store_true") parser$add_argument("--tSNEVerbose", action = "store_true") parser$add_argument("--tSNEApprox", action = "store_true") -parser$add_argument("--tSNEDimMax", help = "max number of donors ",type = "integer", default = 1) -parser$add_argument("--tSNEPerplexity", help = "value for perplexity", type = "integer", default = 100) +parser$add_argument("--tSNEDimMax", help = "max number of donors ", type = "integer", default = 1) +parser$add_argument("--tSNEPerplexity", help = "value for perplexity", type = "integer", default = 100) -#Output graphs - Heatmap +# Output graphs - Heatmap parser$add_argument("--heatMap", help = "Generate a Heatmap", default = "False") -parser$add_argument("--heatMapNcells", help ="value for number of cells", type = "integer", default = 500) -parser$add_argument("--outputdir", help='Output directory') +parser$add_argument("--heatMapNcells", help = "value for number of cells", type = "integer", default = 500) +parser$add_argument("--outputdir", help = "Output directory") args <- parser$parse_args() @@ -45,13 +45,13 @@ params <- data.frame(Argument, Value) print(paste0("Loading RDS object in ", args$hashtagPath)) -if (!endsWith(args$hashtagPath, ".rds")){ - hash_file <- list.files(args$hashtagPath, pattern = "\\.rds$", full.names = TRUE)[1] -}else{ - hash_file <- args$hashtagPath +if (!endsWith(args$hashtagPath, ".rds")) { + hash_file <- list.files(args$hashtagPath, pattern = "\\.rds$", full.names = TRUE)[1] +} else { + hash_file <- args$hashtagPath } print(hash_file) -hashtag <-readRDS(hash_file) +hashtag <- readRDS(hash_file) # Ridge Plot @@ -59,39 +59,36 @@ hashtag <-readRDS(hash_file) if (args$ridgePlot == "True") { Idents(hashtag) <- paste0(args$assay, "_maxID") RidgePlot(hashtag, assay = args$assay, features = rownames(hashtag[[args$assay]]), ncol = args$ridgeNCol) - ggsave(paste0(args$outputdir, '/ridge.jpeg'), device = 'jpeg', dpi = 500) # height = 10, width = 10 + ggsave(paste0(args$outputdir, "/ridge.jpeg"), device = "jpeg", dpi = 500) # height = 10, width = 10 } if (args$featureScatter == "True") { FeatureScatter(hashtag, feature1 = args$scatterFeat1, feature2 = args$scatterFeat2) - ggsave(paste0(args$outputdir, '/featureScatter.jpeg'), device = 'jpeg',dpi = 500) + ggsave(paste0(args$outputdir, "/featureScatter.jpeg"), device = "jpeg", dpi = 500) } if (args$vlnPlot == "True") { Idents(hashtag) <- paste0(args$assay, "_classification.global") VlnPlot(hashtag, features = args$vlnFeatures, pt.size = 0.1, log = args$vlnLog) - ggsave(paste0(args$outputdir, '/violinPlot.jpeg'), device = 'jpeg', dpi = 500) + ggsave(paste0(args$outputdir, "/violinPlot.jpeg"), device = "jpeg", dpi = 500) } if (args$tSNE == "True") { hashtag.subset <- subset(hashtag, idents = args$tSNEIdents, invert = args$tSNEInvert) DefaultAssay(hashtag.subset) <- args$assay - hashtag.subset <- ScaleData(hashtag.subset, features = rownames(hashtag.subset), - verbose = args$tSNEVerbose) + hashtag.subset <- ScaleData(hashtag.subset, + features = rownames(hashtag.subset), + verbose = args$tSNEVerbose + ) hashtag.subset <- RunPCA(hashtag.subset, features = rownames(hashtag.subset), approx = args$tSNEApprox) hashtag.subset <- RunTSNE(hashtag.subset, dims = 1:args$tSNEDimMax, perplexity = args$tSNEPerplexity, check_duplicates = FALSE) DimPlot(hashtag.subset) - ggsave(paste0(args$outputdir, '/tSNE.jpeg'), device = 'jpeg', dpi = 500) + ggsave(paste0(args$outputdir, "/tSNE.jpeg"), device = "jpeg", dpi = 500) } if (args$heatMap == "True") { HTOHeatmap(hashtag, assay = args$assay, ncells = args$heatMapNcells) - ggsave(paste0(args$outputdir, '/heatMap.jpeg'), device = 'jpeg', dpi = 500) + ggsave(paste0(args$outputdir, "/heatMap.jpeg"), device = "jpeg", dpi = 500) } write.csv(params, paste0(args$outputdir, "/visual_params.csv")) - - - - - diff --git a/bin/HTODemux.R b/bin/HTODemux.R index e7a9e9c..fdd86d9 100755 --- a/bin/HTODemux.R +++ b/bin/HTODemux.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript -#Libraries +# Libraries library(Seurat) library(argparse) library(ggplot2) @@ -15,54 +15,52 @@ parser$add_argument("--seed", help = "Sets the random seed.", type = "integer", parser$add_argument("--init", help = "Initial number of clusters for hashtags.", default = NULL, type = "integer") parser$add_argument("--assay", help = "Assay name", default = "HTO") parser$add_argument("--objectOutHTOdemux", help = "Prefix name for the object containing the output of HTODemux object", type = "character", default = "htodemux") -parser$add_argument("--assignmentOutHTOdemux", help="Prefeix name for the file containing the output of HTODemux assignment", type = "character", default = "htodemux") -parser$add_argument("--outputdir", help='Output directory') +parser$add_argument("--assignmentOutHTOdemux", help = "Prefeix name for the file containing the output of HTODemux assignment", type = "character", default = "htodemux") +parser$add_argument("--outputdir", help = "Output directory") args <- parser$parse_args() -if (!endsWith(args$seuratObject, ".rds")){ - seuratObj <- list.files(args$seuratObject, pattern = "\\.rds$", full.names = TRUE)[1] -}else{ - seuratObj <- args$seuratObject +if (!endsWith(args$seuratObject, ".rds")) { + seuratObj <- list.files(args$seuratObject, pattern = "\\.rds$", full.names = TRUE)[1] +} else { + seuratObj <- args$seuratObject } init <- args$init -if(is.null(init)){ - init <- "NULL" +if (is.null(init)) { + init <- "NULL" } Argument <- c("seuratObject", "quantile", "kfunc", "nstarts", "nsamples", "seed", "init", "assay") Value <- c(seuratObj, args$quantile, args$kfunc, args$nstarts, args$nsamples, args$seed, init, args$assay) params <- data.frame(Argument, Value) # Loading Seurat object -hashtag <-readRDS(seuratObj) +hashtag <- readRDS(seuratObj) # Demultiplex cells based on HTO enrichment -if(args$kfunc == "clara"){ - hashtag <- HTODemux(hashtag, assay = args$assay, positive.quantile = args$quantile, init = args$init, nsamples = args$nsamples, kfunc = "clara", seed = args$seed) -}else{ - hashtag <- HTODemux(hashtag, assay = args$assay, positive.quantile = args$quantile, init = args$init, nstarts = args$nstarts, kfunc = "kmeans", seed = args$seed) +if (args$kfunc == "clara") { + hashtag <- HTODemux(hashtag, assay = args$assay, positive.quantile = args$quantile, init = args$init, nsamples = args$nsamples, kfunc = "clara", seed = args$seed) +} else { + hashtag <- HTODemux(hashtag, assay = args$assay, positive.quantile = args$quantile, init = args$init, nstarts = args$nstarts, kfunc = "kmeans", seed = args$seed) } # Global classification results -#table(hashtag[[paste0(args$assay,"_classification.global")]]) +# table(hashtag[[paste0(args$assay,"_classification.global")]]) -#table(hashtag[[paste0(args$assay,"_classification")]]) +# table(hashtag[[paste0(args$assay,"_classification")]]) # Saving results donors <- rownames(hashtag[[args$assay]]) -assignment <- hashtag[[paste0(args$assay,"_classification")]] -assignment[[paste0(args$assay,"_classification")]][!assignment[[paste0(args$assay,"_classification")]] %in% c(donors, 'Negative')] <- "Doublet" +assignment <- hashtag[[paste0(args$assay, "_classification")]] +assignment[[paste0(args$assay, "_classification")]][!assignment[[paste0(args$assay, "_classification")]] %in% c(donors, "Negative")] <- "Doublet" write.csv(params, paste0(args$outputdir, "/params.csv")) write.csv(assignment, paste0(args$outputdir, "/", args$assignmentOutHTOdemux, "_assignment_htodemux.csv")) -#write.csv(hashtag[[paste0(args$assay,"_classification")]], paste0(args$outputdir, "/", args$assignmentOutHTOdemux, "_assignment_htodemux.csv")) -write.csv(hashtag[[paste0(args$assay,"_classification.global")]], paste0(args$outputdir, "/", args$assignmentOutHTOdemux, "_classification_htodemux.csv")) -saveRDS(hashtag, file=paste0(args$outputdir, "/", args$objectOutHTOdemux,".rds")) - - +# write.csv(hashtag[[paste0(args$assay,"_classification")]], paste0(args$outputdir, "/", args$assignmentOutHTOdemux, "_assignment_htodemux.csv")) +write.csv(hashtag[[paste0(args$assay, "_classification.global")]], paste0(args$outputdir, "/", args$assignmentOutHTOdemux, "_classification_htodemux.csv")) +saveRDS(hashtag, file = paste0(args$outputdir, "/", args$objectOutHTOdemux, ".rds")) diff --git a/bin/MultiSeq.R b/bin/MultiSeq.R index e9224f9..d54f5e6 100755 --- a/bin/MultiSeq.R +++ b/bin/MultiSeq.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript -#Libraries +# Libraries library(Seurat) library(argparse) library(ggplot2) @@ -9,21 +9,21 @@ parser <- ArgumentParser("Parameters for MultiSeqDemux") parser$add_argument("--seuratObjectPath", help = "Seurat object or the folder containing the seurat object") parser$add_argument("--quantile", help = "The quantile to use for classification", type = "double", default = 0.7) parser$add_argument("--autoThresh", help = "Whether to perform automated threshold finding to define the best quantile", action = "store_true") -parser$add_argument("--maxiter", help = "Maximum number of iterations if autoThresh = TRUE ", default = 5, type="integer") +parser$add_argument("--maxiter", help = "Maximum number of iterations if autoThresh = TRUE ", default = 5, type = "integer") parser$add_argument("--qrangeFrom", help = "A range of possible quantile values to try if autoThresh is TRUE", type = "double", default = 0.1) parser$add_argument("--qrangeTo", help = "A range of possible quantile values to try if autoThresh is TRUE", type = "double", default = 0.9) parser$add_argument("--qrangeBy", help = "A range of possible quantile values to try if autoThresh is TRUE", type = "double", default = 0.05) parser$add_argument("--verbose", help = "Prints the output", action = "store_true") -parser$add_argument("--assay", help = "Name of the multiplexing assay", default="HTO") -parser$add_argument("--assignmentOutMulti",help = "Prefix name for the file containing the output of MULTI-Seq Demux object", default = "mutliseq") +parser$add_argument("--assay", help = "Name of the multiplexing assay", default = "HTO") +parser$add_argument("--assignmentOutMulti", help = "Prefix name for the file containing the output of MULTI-Seq Demux object", default = "mutliseq") parser$add_argument("--objectOutMulti", help = "Prefix name for the object containing the output of MULTI-Seq Demux object", default = "multiseq") -parser$add_argument("--outputdir", help='Output directory') +parser$add_argument("--outputdir", help = "Output directory") args <- parser$parse_args() -if (!endsWith(args$seuratObjectPath, ".rds")){ - seuratObj <- list.files(args$seuratObjectPath, pattern = "\\.rds$", full.names = TRUE)[1] -}else{ - seuratObj <- args$seuratObjectPath +if (!endsWith(args$seuratObjectPath, ".rds")) { + seuratObj <- list.files(args$seuratObjectPath, pattern = "\\.rds$", full.names = TRUE)[1] +} else { + seuratObj <- args$seuratObjectPath } Argument <- c("seuratObjectPath", "quantile", "autoThresh", "maxiter", "qrangeFrom", "qrangeTo", "qrangeBy", "verbose", "assay") @@ -31,12 +31,11 @@ Value <- c(seuratObj, args$quantile, args$autoThresh, args$maxiter, args$qrangeF params <- data.frame(Argument, Value) -hashtag <-readRDS(seuratObj) +hashtag <- readRDS(seuratObj) if (args$autoThresh == TRUE) { - hashtag <- MULTIseqDemux(hashtag, assay = args$assay, quantile = args$quantile, autoThresh = TRUE, maxiter = args$maxiter, qrange=seq(from = args$qrangeFrom, to = args$qrangeTo, by = args$qrangeBy), verbose = args$verbose) -}else{ - hashtag <- MULTIseqDemux(hashtag, assay = args$assay, quantile = args$quantile, verbose = args$verbose) - + hashtag <- MULTIseqDemux(hashtag, assay = args$assay, quantile = args$quantile, autoThresh = TRUE, maxiter = args$maxiter, qrange = seq(from = args$qrangeFrom, to = args$qrangeTo, by = args$qrangeBy), verbose = args$verbose) +} else { + hashtag <- MULTIseqDemux(hashtag, assay = args$assay, quantile = args$quantile, verbose = args$verbose) } table(hashtag$MULTI_ID) diff --git a/bin/demuxmix.R b/bin/demuxmix.R index 6552507..77732a8 100755 --- a/bin/demuxmix.R +++ b/bin/demuxmix.R @@ -10,51 +10,49 @@ parser <- ArgumentParser("Parameters for Demuxmix") parser$add_argument("--fileUmi", help = "Path to file UMI count matrix.") parser$add_argument("--fileHto", help = "Path to file HTO count matrix.") parser$add_argument("--ndelim", help = "For the initial identity calss for each cell, delimiter for the cell's column name", default = "_") -parser$add_argument("--pAcpt", help='Acceptance probability that must be reached in order to assign a droplet to a hashtag. ') -parser$add_argument("--assay", help='Assay name') -parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type="integer", default = 2) -parser$add_argument("--rna_available", help='TRUE if RNA assay is available',default = FALSE) -parser$add_argument("--correctTails", help='If TRUE, droplets meeting the threshold defined by alpha (beta) are classified as "negative" ("positive") even if the mixture model suggests a different classification',default = TRUE) -parser$add_argument("--model", help='A character specifying the type of mixture model to be used. Either "naive", "regpos", "reg" or "auto".', default = 'naive') -parser$add_argument("--alpha_demuxmix", help='Threshold defining the left tail of the mixture distribution where droplets should not be classified as "positive".',type = "double",default=0.9) -parser$add_argument("--beta_demuxmix", help='Threshold for defining the right tail of the mixture distribution where droplets should not be classified as "negative".',type = "double",default=0.9) -parser$add_argument("--tol_demuxmix", help='Convergence criterion for the EM algorithm used to fit the mixture models.', type="double") -parser$add_argument("--maxIter_demuxmix", help='Maximum number of iterations for the EM algorithm',type = "integer",default=100) -parser$add_argument("--k_hto", help='Factor to define outliers in the HTO counts.',type = "double",default=1.5) -parser$add_argument("--k_rna", help='Factor to define outliers in the numbers of detected genes.',type = "double",default=1.5) -parser$add_argument("--assignmentOutDemuxmix", help="Prefix name for the file containing the output of Demuxmix assignment", type = "character", default = "demuxmix") -parser$add_argument("--outputdir", help='Output directory') +parser$add_argument("--pAcpt", help = "Acceptance probability that must be reached in order to assign a droplet to a hashtag. ") +parser$add_argument("--assay", help = "Assay name") +parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type = "integer", default = 2) +parser$add_argument("--rna_available", help = "TRUE if RNA assay is available", default = FALSE) +parser$add_argument("--correctTails", help = 'If TRUE, droplets meeting the threshold defined by alpha (beta) are classified as "negative" ("positive") even if the mixture model suggests a different classification', default = TRUE) +parser$add_argument("--model", help = 'A character specifying the type of mixture model to be used. Either "naive", "regpos", "reg" or "auto".', default = "naive") +parser$add_argument("--alpha_demuxmix", help = 'Threshold defining the left tail of the mixture distribution where droplets should not be classified as "positive".', type = "double", default = 0.9) +parser$add_argument("--beta_demuxmix", help = 'Threshold for defining the right tail of the mixture distribution where droplets should not be classified as "negative".', type = "double", default = 0.9) +parser$add_argument("--tol_demuxmix", help = "Convergence criterion for the EM algorithm used to fit the mixture models.", type = "double") +parser$add_argument("--maxIter_demuxmix", help = "Maximum number of iterations for the EM algorithm", type = "integer", default = 100) +parser$add_argument("--k_hto", help = "Factor to define outliers in the HTO counts.", type = "double", default = 1.5) +parser$add_argument("--k_rna", help = "Factor to define outliers in the numbers of detected genes.", type = "double", default = 1.5) +parser$add_argument("--assignmentOutDemuxmix", help = "Prefix name for the file containing the output of Demuxmix assignment", type = "character", default = "demuxmix") +parser$add_argument("--outputdir", help = "Output directory") args <- parser$parse_args() -if(as.logical(args$rna_available)){ - - umi <- Read10X(data.dir = args$fileUmi, gene.column=args$gene_col, strip.suffix = TRUE) - +if (as.logical(args$rna_available)) { + umi <- Read10X(data.dir = args$fileUmi, gene.column = args$gene_col, strip.suffix = TRUE) +} +counts <- Read10X(data.dir = args$fileHto, gene.column = args$gene_col, strip.suffix = TRUE) + + +if (as.logical(args$rna_available)) { + Argument <- c("fileUmi", "fileHto") + Value <- c(args$fileUmi, args$fileHto) + params <- data.frame(Argument, Value) + # demuxmix requires a Seurat object created from raw data, + # without any type of pre-processing (Normalisation, ScaleData, etc) + # Setup Seurat object + hashtag <- CreateSeuratObject(counts = umi, names.delim = args$ndelim) + hashtag[[args$assay]] <- CreateAssayObject(counts = counts) +} else { + # If RNA data is not available, a Seurat object containing the HTO data is created + hashtag <- CreateSeuratObject(counts = counts, names.delim = args$ndelim, assay = args$assay) + Argument <- c("fileHto") + Value <- c(args$fileHto) + params <- data.frame(Argument, Value) } -counts <- Read10X(data.dir = args$fileHto, gene.column=args$gene_col, strip.suffix = TRUE) - - -if(as.logical(args$rna_available)){ - Argument <- c("fileUmi", "fileHto") - Value <- c(args$fileUmi, args$fileHto) - params <- data.frame(Argument, Value) - #demuxmix requires a Seurat object created from raw data, - #without any type of pre-processing (Normalisation, ScaleData, etc) - # Setup Seurat object - hashtag <- CreateSeuratObject(counts = umi, names.delim = args$ndelim) - hashtag[[args$assay]] <- CreateAssayObject(counts = counts) - }else{ - #If RNA data is not available, a Seurat object containing the HTO data is created - hashtag <- CreateSeuratObject(counts = counts, names.delim = args$ndelim, assay = args$assay) - Argument <- c( "fileHto") - Value <- c(args$fileHto) - params <- data.frame(Argument, Value) - } -Argument <- c("fileUmi", "fileHto","assay", "model", "alpha_demuxmix", "beta_demuxmix", "tol_demuxmix", "maxIter_demuxmix", "k_hto","k_rna","correct_tails") -Value <- c(args$fileUmi, args$fileHto, args$assay, args$model, args$alpha_demuxmix, args$beta_demuxmix, args$tol_demuxmix, args$maxIter_demuxmix, args$k_hto, args$k_rna,args$correctTails) +Argument <- c("fileUmi", "fileHto", "assay", "model", "alpha_demuxmix", "beta_demuxmix", "tol_demuxmix", "maxIter_demuxmix", "k_hto", "k_rna", "correct_tails") +Value <- c(args$fileUmi, args$fileHto, args$assay, args$model, args$alpha_demuxmix, args$beta_demuxmix, args$tol_demuxmix, args$maxIter_demuxmix, args$k_hto, args$k_rna, args$correctTails) params <- data.frame(Argument, Value) @@ -62,43 +60,35 @@ params <- data.frame(Argument, Value) ### Demuxmix hto_counts <- as.matrix(GetAssayData(hashtag[[args$assay]], slot = "counts")) hashtags_object <- GetAssayData(hashtag[[args$assay]], slot = "counts") -hashtag_list<-hashtags_object@Dimnames +hashtag_list <- hashtags_object@Dimnames tryCatch({ -#Demultiplexing process - if(args$model != 'naive' && as.logical(args$rna_available)){ - rna_counts <- hashtag$nCount_RNA - demuxmix_demul <- demuxmix(hto_counts,rna= rna_counts, model = args$model, alpha= args$alpha_demuxmix,beta= args$beta_demuxmix,maxIter=args$maxIter_demuxmix,k.hto=args$k_hto, correctTails = as.logical(args$correctTails)) - - }else{ - print("Executing naive mode for Demuxmix") - demuxmix_demul <- demuxmix(hto_counts, model = args$model, alpha= args$alpha_demuxmix,beta= args$beta_demuxmix,maxIter=args$maxIter_demuxmix,k.hto=args$k_hto) - } - demuxmix_classify <- dmmClassify(demuxmix_demul) - sumary_demuxmix <- summary(demuxmix_demul) - demuxmix_classify$Barcode <- hashtag_list[[2]] - - res_dt <- as.data.table(demuxmix_classify) - res_dt[, Classification := Type] - res_dt[Classification == "multiplet", Classification := "doublet"] - res_dt[Classification == "uncertain", Classification := "negative"] - - res_dt$HTO <- gsub(',', '_', res_dt$HTO) - write.csv(res_dt, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_assignment_demuxmix.csv"), row.names=FALSE) - write.csv(sumary_demuxmix, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_summary_results.csv"), row.names=FALSE) - + # Demultiplexing process + if (args$model != "naive" && as.logical(args$rna_available)) { + rna_counts <- hashtag$nCount_RNA + demuxmix_demul <- demuxmix(hto_counts, rna = rna_counts, model = args$model, alpha = args$alpha_demuxmix, beta = args$beta_demuxmix, maxIter = args$maxIter_demuxmix, k.hto = args$k_hto, correctTails = as.logical(args$correctTails)) + } else { + print("Executing naive mode for Demuxmix") + demuxmix_demul <- demuxmix(hto_counts, model = args$model, alpha = args$alpha_demuxmix, beta = args$beta_demuxmix, maxIter = args$maxIter_demuxmix, k.hto = args$k_hto) + } + demuxmix_classify <- dmmClassify(demuxmix_demul) + sumary_demuxmix <- summary(demuxmix_demul) + demuxmix_classify$Barcode <- hashtag_list[[2]] + + res_dt <- as.data.table(demuxmix_classify) + res_dt[, Classification := Type] + res_dt[Classification == "multiplet", Classification := "doublet"] + res_dt[Classification == "uncertain", Classification := "negative"] + + res_dt$HTO <- gsub(",", "_", res_dt$HTO) + write.csv(res_dt, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_assignment_demuxmix.csv"), row.names = FALSE) + write.csv(sumary_demuxmix, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_summary_results.csv"), row.names = FALSE) }, error = function(e) { - print("Error for Demuxmix") - error_message <- conditionMessage(e) - writeLines(e, "error_log.txt") - df <- data.frame() - write.csv(df, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_assignment_demuxmix.csv"), row.names=FALSE) -},finally = { - write.csv(params, paste0(args$outputdir, "/params.csv")) -}) - - - - - - + print("Error for Demuxmix") + error_message <- conditionMessage(e) + writeLines(e, "error_log.txt") + df <- data.frame() + write.csv(df, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_assignment_demuxmix.csv"), row.names = FALSE) +}, finally = { + write.csv(params, paste0(args$outputdir, "/params.csv")) +}) diff --git a/bin/donor_match.R b/bin/donor_match.R index 99a68eb..574b46f 100755 --- a/bin/donor_match.R +++ b/bin/donor_match.R @@ -2,7 +2,6 @@ library(pheatmap) library(data.table) library(ComplexUpset) -library(ggplot2) library(tidyverse) library(vcfR) library(argparse) @@ -10,49 +9,67 @@ library(argparse) parser <- ArgumentParser("Parameters for donor matching based on Pearson coorelation. ") parser$add_argument("--result_csv", help = "The path to the csv file or the directory containing the demultiplexing assignment. ") parser$add_argument("--barcode", help = "The path to the tsv file containing the white list of barcodes. ", default = NULL) -parser$add_argument("--ndonor", help = "The number of donors in the cell mixture. ", type="integer") -parser$add_argument("--method1", help="The Name of the first method in the assignment file to map donors. ", default = NULL) -parser$add_argument("--method2", help="The Name of the second method in the assignment file. We will use its donor names as reference.", default=NULL) -parser$add_argument("--findVariants", help='How to find representative variants for each donor. ', default='default') -parser$add_argument("--cell_genotype", help="The path to the VCF file containing the genotype of the cells.") -parser$add_argument("--variant_count", help='The Minimal count of a variant for filtering', type="integer", default = 0) -parser$add_argument("--variant_pct", help="The Minimal percentage of a variant for filtering", type="double", default = 0.5) -parser$add_argument("--vireo_parent_dir", help="The parent folder for searching the output of the best vireo trial, e.g. +parser$add_argument("--ndonor", help = "The number of donors in the cell mixture. ", type = "integer") +parser$add_argument("--method1", help = "The Name of the first method in the assignment file to map donors. ", default = NULL) +parser$add_argument("--method2", help = "The Name of the second method in the assignment file. We will use its donor names as reference.", default = NULL) +parser$add_argument("--findVariants", help = "How to find representative variants for each donor. ", default = "default") +parser$add_argument("--cell_genotype", help = "The path to the VCF file containing the genotype of the cells.") +parser$add_argument("--variant_count", help = "The Minimal count of a variant for filtering", type = "integer", default = 0) +parser$add_argument("--variant_pct", help = "The Minimal percentage of a variant for filtering", type = "double", default = 0.5) +parser$add_argument("--vireo_parent_dir", help = "The parent folder for searching the output of the best vireo trial, e.g. donor genotype and discriminatory variants. ") -parser$add_argument("--outputdir", help="Output directory. ") +parser$add_argument("--outputdir", help = "Output directory. ") args <- parser$parse_args() -convert2binary <- function(result_csv, method_name){ +convert2binary <- function(result_csv, method_name, min_cell) { method_assign <- result_csv %>% select("Barcode", method_name) - donor_id <- setdiff(unique(method_assign[[method_name]]), - c(NA, "negative", "doublet")) - method_assign <- method_assign[method_assign[[method_name]] %in% donor_id,] - method_assign_binary = data.frame(model.matrix(~ method_assign[[method_name]]-1, data=method_assign)) - names(method_assign_binary) <- sort(donor_id) - rownames(method_assign_binary) <- method_assign$Barcode + donor_id <- setdiff( + unique(method_assign[[method_name]]), + c(NA, "negative", "doublet") + ) + method_assign <- method_assign[method_assign[[method_name]] %in% donor_id, ] + if (nrow(method_assign) < min_cell) { + return(NULL) + } + if (length(unique(method_assign[[method_name]])) == 1) { + method_assign_binary <- as.data.frame(matrix(0, nrow = nrow(result_csv), ncol = 1), + row.names = result_csv$Barcode + ) + colnames(method_assign_binary) <- c(unique(method_assign[[method_name]])) + method_assign_binary[rownames(method_assign_binary) %in% method_assign$Barcode, ] <- 1 + } else { + method_assign_binary <- data.frame(model.matrix(~ method_assign[[method_name]] - 1, data = method_assign)) + names(method_assign_binary) <- sort(donor_id) + rownames(method_assign_binary) <- method_assign$Barcode + } return(method_assign_binary) } result_csv <- NULL - +min_cell <- 0 if (file.exists(args$result_csv) && !dir.exists(args$result_csv)) { result_csv <- fread(args$result_csv, stringsAsFactors = FALSE, na.strings = c(NA_character_, "")) } if (dir.exists(args$result_csv)) { result_csv <- list.files(args$result_csv, - pattern = "assignment_all", full.names = TRUE) + pattern = "assignment_all", full.names = TRUE + ) result_csv <- fread(result_csv[1], stringsAsFactors = FALSE, na.strings = c(NA_character_, "")) } if (!is.null(args$barcode)) { - barcode_whitelist <- fread(args$barcode, header = FALSE, - stringsAsFactors = FALSE)$V1 + barcode_whitelist <- fread(args$barcode, + header = FALSE, + stringsAsFactors = FALSE + )$V1 result_csv <- result_csv[result_csv$Barcode %in% barcode_whitelist, ] + # min_cell <- floor(nrow(result_csv) * 0.01) } colname_with_singlet <- colnames(result_csv %>% select_if(~ any(. != "negative" & . != "doublet"))) colname_with_singlet <- colname_with_singlet[colname_with_singlet != "Barcode"] -if (length(colname_with_singlet) < 2){ + +if (length(colname_with_singlet) < 2) { stop("Please choose more methods to run donor matching!") } @@ -65,8 +82,7 @@ if (!is.null(args$method1) && !is.null(args$method2)) { method2_all <- sapply(all_methods_pair, "[", 2) } - -hashing_methods <- c("demuxem", "htodemux", "multiseq", "hashsolo", "hashedDrops") +hashing_methods <- c("demuxem", "htodemux", "multiseq", "hashsolo", "hashedDrops", "bff", "gmm_demux") genetic_methods <- c("demuxlet", "freemuxlet", "vireo", "scsplit", "souporcell") best_result <- 0 @@ -74,24 +90,26 @@ best_method1 <- "None" best_method2 <- "None" num_trial <- 1 -result_record <- data.frame(best_method1 = character(), - best_method2 = character(), - score = numeric(), - matched_donor = numeric(), - remain_na = logical(), - stringsAsFactors = FALSE) +result_record <- data.frame( + best_method1 = character(), + best_method2 = character(), + score = numeric(), + matched_donor = numeric(), + remain_na = logical(), + stringsAsFactors = FALSE +) if (is.null(method1_all) || is.null(method2_all)) { stop("No method is not found in the CSV file!") } -for (i in 1:length(method1_all)){ +for (i in 1:length(method1_all)) { method1 <- method1_all[i] method2 <- method2_all[i] # put the hashing method on the second place - if (grepl(paste(hashing_methods, collapse="|"), method1) && - (grepl(paste(genetic_methods, collapse="|"), method2))) { + if (grepl(paste(hashing_methods, collapse = "|"), method1) && + (grepl(paste(genetic_methods, collapse = "|"), method2))) { hash_method <- method1 method1 <- method2 method2 <- hash_method @@ -99,20 +117,30 @@ for (i in 1:length(method1_all)){ outputdir <- file.path(args$outputdir, paste0(method1, "_vs_", method2)) ifelse(!dir.exists(outputdir), dir.create(outputdir), FALSE) - print(method1) - print(method2) - method1_res <- convert2binary(result_csv, method1) - method2_res <- convert2binary(result_csv, method2) - + + method1_res <- convert2binary(result_csv, method1, min_cell) + method2_res <- convert2binary(result_csv, method2, min_cell) + if (is.null(method1_res) || is.null(method2_res)) { + next + } + intersect_barcode <- intersect(rownames(method1_res), rownames(method2_res)) - method1_res <- method1_res[rownames(method1_res) %in% intersect_barcode, ] - method2_res <- method2_res[rownames(method2_res) %in% intersect_barcode, ] + if (length(intersect_barcode) == 0) { + next + } + print(paste0("Comapring ", method1, " and ", method2)) + method1_res <- method1_res[rownames(method1_res) %in% intersect_barcode, , drop = FALSE] + method2_res <- method2_res[rownames(method2_res) %in% intersect_barcode, , drop = FALSE] - correlation_res <- apply(method1_res, 2, function(x){ - apply(method2_res, 2, function(y){ + correlation_res <- apply(method1_res, 2, function(x) { + apply(method2_res, 2, function(y) { return(cor.test(x, y)[["estimate"]][["cor"]]) }) }) + if (is.vector(correlation_res)) { + correlation_res <- t(as.data.frame(correlation_res)) + rownames(correlation_res) <- colnames(method2_res) + } write.csv(correlation_res, file.path(outputdir, "correlation_res.csv")) match_score <- 0 @@ -121,48 +149,53 @@ for (i in 1:length(method1_all)){ colnames(geno_match) <- c("Method1", "Method2", "Correlation") geno_match$Method1 <- colnames(correlation_res) - for (id in geno_match$Method1){ + for (id in geno_match$Method1) { if (!is.infinite(-max(correlation_res[, id], na.rm = TRUE)) && - max(correlation_res[, id], na.rm = TRUE) == + max(correlation_res[, id], na.rm = TRUE) == max(correlation_res[which.max(correlation_res[, id]), ], na.rm = TRUE)) { geno_match[which(geno_match$Method1 == id), 2:3] <- - c(rownames(correlation_res)[which.max(correlation_res[, id])], - max(correlation_res[, id], na.rm = TRUE)) + c( + rownames(correlation_res)[which.max(correlation_res[, id])], + max(correlation_res[, id], na.rm = TRUE) + ) match_score <- match_score + max(correlation_res[, id], na.rm = TRUE) matched_donor <- matched_donor + 1 } else { geno_match[which(geno_match$Cluster1_ID == id)] <- c("unassigned", NA) } } - write.table(geno_match[,1:2], - file.path(outputdir, "donor_match.csv"), - row.names = FALSE, col.names = FALSE, sep = " ", quote = FALSE) + write.table(geno_match[, 1:2], + file.path(outputdir, "donor_match.csv"), + row.names = FALSE, col.names = FALSE, sep = " ", quote = FALSE + ) if (!all(is.na(correlation_res))) { newCols <- colorRampPalette(grDevices::rainbow(nrow(geno_match))) annoCol <- newCols(nrow(geno_match)) names(annoCol) <- colnames(correlation_res) annoCol <- list(category = annoCol) - correlation_res <- correlation_res[!is.na(row.names(correlation_res)), ] - correlation_res <- correlation_res[order(as.numeric(row.names(correlation_res))), ] - pheatmap(correlation_res, treeheight_row = FALSE, - treeheight_col = FALSE, display_numbers = TRUE, angle_col = "45", - number_color = "white", fontsize = 12, cluster_rows = FALSE, - cluster_cols = FALSE, width = 7, height = 5, - filename = file.path(outputdir, "concordance_heatmap.png")) + correlation_res <- correlation_res[!is.na(row.names(correlation_res)), , drop = FALSE] + correlation_res <- correlation_res[order(as.numeric(row.names(correlation_res))), , drop = FALSE] + pheatmap(correlation_res, + treeheight_row = FALSE, + treeheight_col = FALSE, display_numbers = TRUE, angle_col = "45", + number_color = "white", fontsize = 12, cluster_rows = FALSE, + cluster_cols = FALSE, width = 7, height = 5, + filename = file.path(outputdir, "concordance_heatmap.png") + ) } - if(grepl(paste(hashing_methods, collapse = "|"), method2) && + if (grepl(paste(hashing_methods, collapse = "|"), method2) && grepl(paste(genetic_methods, collapse = "|"), method1)) { - remain_na <- (matched_donor != args$ndonor) match_score <- match_score / args$ndonor - if (match_score > best_result && !remain_na){ + if (match_score > best_result && !remain_na) { write.table(geno_match[, 1:2], - file.path(args$outputdir, "donor_match.csv"), - row.names = FALSE, col.names = FALSE, - sep = " ", quote = FALSE) + file.path(args$outputdir, "donor_match.csv"), + row.names = FALSE, col.names = FALSE, + sep = " ", quote = FALSE + ) } best_method1 <- ifelse(match_score > best_result & !remain_na, method1, best_method1) best_method2 <- ifelse(match_score > best_result & !remain_na, method2, best_method2) @@ -170,52 +203,58 @@ for (i in 1:length(method1_all)){ new_record <- c(method1, method2, match_score, matched_donor, remain_na) result_record[num_trial, ] <- new_record num_trial <- num_trial + 1 - + result_merge <- select(result_csv, "Barcode", method1, method2) result_merge_new <- result_merge for (i in 1:nrow(geno_match)) { - result_merge_new[[method1]] <- replace(result_merge_new[[method1]], - result_merge[[method1]] == geno_match$Method1[i], - geno_match$Method2[i]) + result_merge_new[[method1]] <- replace( + result_merge_new[[method1]], + result_merge[[method1]] == geno_match$Method1[i], + geno_match$Method2[i] + ) } write.csv(result_merge_new, - file.path(outputdir, "all_assignment_after_match.csv"), - row.names = FALSE) + file.path(outputdir, "all_assignment_after_match.csv"), + row.names = FALSE + ) if (best_result == match_score) { write.csv(result_merge_new, - file.path(args$outputdir, "all_assignment_after_match.csv"), - row.names = FALSE) + file.path(args$outputdir, "all_assignment_after_match.csv"), + row.names = FALSE + ) } result_merge_new <- result_merge_new[result_merge_new$Barcode %in% intersect_barcode, ] write.csv(result_merge_new, - file.path(outputdir, "intersect_assignment_after_match.csv"), - row.names = FALSE) + file.path(outputdir, "intersect_assignment_after_match.csv"), + row.names = FALSE + ) } - else{ - print(paste0("Skip comparision between ", method1, " and ", method2)) - } - + # else{ + # print(paste0("Skip comparision between ", method1, " and ", method2)) + # } } -if (best_method1 != "None" && best_method2 != "None"){ - print(paste0("Best method pair: ", best_method1, " and ", best_method2, " with score ", best_result)) +if (best_method1 != "None" && best_method2 != "None") { + print(paste0("Best method pair: ", best_method1, " and ", best_method2, " with score ", best_result)) print("------------------------------------------------------------------") } if (nrow(result_record) > 1) { - write.csv(result_record, row.names = FALSE, - file.path(args$outputdir, "score_record.csv")) + write.csv(result_record, + row.names = FALSE, + file.path(args$outputdir, "score_record.csv") + ) } if (args$findVariants == "True" || args$findVariants == "default") { if (startsWith(best_method1, "vireo")) { write.table(best_method1, - file.path(args$outputdir, "best_method_vireo.txt"), - sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) - + file.path(args$outputdir, "best_method_vireo.txt"), + sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE + ) } else { stop("Vireo is not the best method for donor matching!") } @@ -227,90 +266,103 @@ if (args$findVariants == "True" || args$findVariants == "default") { matched <- result_merge_new[result_merge_new$match, ] unmatched <- result_csv[!result_csv$Barcode %in% matched$Barcode, ]$Barcode cell_genotype_vcf <- read.vcfR(args$cell_genotype) - cell_genotype_vcf_gt <- extract.gt(cell_genotype_vcf, element = "GT", as.numeric=TRUE) + cell_genotype_vcf_gt <- extract.gt(cell_genotype_vcf, element = "GT", as.numeric = TRUE) donors <- sort(unique(matched[[best_method1]])) - representative_variant_list <- vector(mode = 'list', length=length(donors)) + representative_variant_list <- vector(mode = "list", length = length(donors)) representative_variant_list <- setNames(representative_variant_list, donors) - for (donorid in donors){ - matched_barcode <- matched[matched[[best_method1]]==donorid]$Barcode + for (donorid in donors) { + matched_barcode <- matched[matched[[best_method1]] == donorid]$Barcode matched_gt_list <- cell_genotype_vcf_gt[, matched_barcode] matched_gt_list <- matched_gt_list[rowSums(is.na(matched_gt_list)) != ncol(matched_gt_list), ] matched_gt <- as.data.frame(matrix(nrow = nrow(matched_gt_list))) matched_gt$ref <- rowSums(matched_gt_list == 0, na.rm = TRUE) matched_gt$alt <- rowSums(matched_gt_list != 0, na.rm = TRUE) matched_gt$V1 <- rownames(matched_gt_list) - matched_gt$count <- matched_gt$ref + matched_gt$alt - matched_gt$pct <- matched_gt$alt / (matched_gt$ref+matched_gt$alt) - matched_gt$dominant <- ifelse(matched_gt$pct>0.5, 1, 0) + matched_gt$count <- matched_gt$ref + matched_gt$alt + matched_gt$pct <- matched_gt$alt / (matched_gt$ref + matched_gt$alt) + matched_gt$dominant <- ifelse(matched_gt$pct > 0.5, 1, 0) matched_gt <- matched_gt[(matched_gt$pct >= args$variant_pct | - matched_gt$pct<=(1-args$variant_pct)), ] + matched_gt$pct <= (1 - args$variant_pct)), ] matched_gt <- matched_gt[matched_gt$count >= args$variant_count, ] unmatched_gt_list <- cell_genotype_vcf_gt[, unmatched] unmatched_gt_list <- unmatched_gt_list[rownames(unmatched_gt_list) %in% matched_gt$V1, ] unmatched_gt_list <- unmatched_gt_list[rowSums(is.na(unmatched_gt_list)) != ncol(unmatched_gt_list), ] unmatched_gt_list <- cbind(rownames(unmatched_gt_list), unmatched_gt_list) - unmatched_gt_list <- melt(data.table(unmatched_gt_list), id.vars = 'V1') - unmatched_gt_list <- unmatched_gt_list[!is.na(unmatched_gt_list$value),] + unmatched_gt_list <- melt(data.table(unmatched_gt_list), id.vars = "V1") + unmatched_gt_list <- unmatched_gt_list[!is.na(unmatched_gt_list$value), ] colnames(unmatched_gt_list) <- c("variant", "cell", "allele") write.csv(matched_gt, - file.path(outputdir_variant, paste0(donorid, "_matched_gt.csv")), - row.names = FALSE) + file.path(outputdir_variant, paste0(donorid, "_matched_gt.csv")), + row.names = FALSE + ) write.csv(unmatched_gt_list, - file.path(outputdir_variant, paste0(donorid, "_unmatched_gt.csv")), - row.names = FALSE) + file.path(outputdir_variant, paste0(donorid, "_unmatched_gt.csv")), + row.names = FALSE + ) informative_variants_cells <- merge(matched_gt, unmatched_gt_list, - by.x = c("V1", "dominant"), - by.y = c("variant", "allele")) + by.x = c("V1", "dominant"), + by.y = c("variant", "allele") + ) colnames(informative_variants_cells)[1] <- "variant" - num_informative_variants <- informative_variants_cells %>% group_by(cell) %>% summarise(matched=n()) - if (nrow(unmatched_gt_list[!unmatched_gt_list$cell %in% num_informative_variants$cell,]) > 0) { + num_informative_variants <- informative_variants_cells %>% + group_by(cell) %>% + summarise(matched = n()) + if (nrow(unmatched_gt_list[!unmatched_gt_list$cell %in% num_informative_variants$cell, ]) > 0) { print(unmatched_gt_list[!unmatched_gt_list$cell %in% num_informative_variants$cell, ]) } representative_variant_list[[donorid]] <- list(unique(informative_variants_cells$variant)) write.table(unique(informative_variants_cells$variant), - file.path(outputdir_variant, paste0(donorid, "_informative_variants.csv")), - row.names = FALSE, col.names = FALSE) - + file.path(outputdir_variant, paste0(donorid, "_informative_variants.csv")), + row.names = FALSE, col.names = FALSE + ) } - - representative_variant <- rbindlist(representative_variant_list, idcol = 'donor') + + representative_variant <- rbindlist(representative_variant_list, idcol = "donor") colnames(representative_variant)[2] <- "variant" representative_variant_df <- dcast(data = representative_variant, variant ~ donor, length) - write.csv(representative_variant_df, - file.path(args$outputdir, "all_representative_variant_df.csv")) + write.csv( + representative_variant_df, + file.path(args$outputdir, "all_representative_variant_df.csv") + ) upset <- ComplexUpset::upset(representative_variant_df, donors, - width_ratio = 0.45, height_ratio = 0.9, - stripes = "white", max_degree = 1, - name = "Number of donor-specific variants", - set_sizes = (upset_set_size() + - geom_text(aes(label = ..count.., size=3), - hjust = -0.1, stat = "count", - color = "white", size = 2.3) + - theme(axis.text.x = element_text(angle = 90), - text = element_text(size = 10))), - base_annotations=list('Intersection size'= intersection_size()) + width_ratio = 0.45, height_ratio = 0.9, + stripes = "white", max_degree = 1, + name = "Number of donor-specific variants", + set_sizes = (upset_set_size() + + geom_text(aes(label = ..count.., size = 3), + hjust = -0.1, stat = "count", + color = "white", size = 2.3 + ) + + theme( + axis.text.x = element_text(angle = 90), + text = element_text(size = 10) + )), + base_annotations = list("Intersection size" = intersection_size()) ) ggsave(file.path(args$outputdir, "donor_specific_variants_upset.png")) - representative_variant_single <- representative_variant_df[rowSums(representative_variant_df[,-1]) == 1,] + representative_variant_single <- representative_variant_df[rowSums(representative_variant_df[, -1]) == 1, ] representative_variant_single <- separate(representative_variant_single, - col= "variant", - into = c("chr", "pos"), sep = "_") + col = "variant", + into = c("chr", "pos"), sep = "_" + ) write.table(representative_variant_single[, c("chr", "pos")], - quote = FALSE, col.names = FALSE, sep = "\t", row.names = FALSE, - file.path(args$outputdir, "donor_specific_variants.csv")) + quote = FALSE, col.names = FALSE, sep = "\t", row.names = FALSE, + file.path(args$outputdir, "donor_specific_variants.csv") + ) } if (args$findVariants == "True" || args$findVariants == "vireo") { if (startsWith(best_method1, "vireo")) { - write.table(best_method1, - file.path(args$outputdir, "best_method_vireo.txt"), - sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(best_method1, + file.path(args$outputdir, "best_method_vireo.txt"), + sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE + ) } else { stop("Vireo is not the best method1 for donor matching, variants can not be filtered!") } @@ -318,14 +370,16 @@ if (args$findVariants == "True" || args$findVariants == "vireo") { vireo_result_dir <- file.path(args$vireo_parent_dir, best_method1) representative_variant <- list.files(vireo_result_dir, "filtered_variants.tsv", - full.names = TRUE, recursive = TRUE)[1] + full.names = TRUE, recursive = TRUE + )[1] representative_variant <- fread(representative_variant) - representative_variant <- separate(representative_variant, col = "variants", - into = c("chr", "pos"), - sep = "_", extra = "drop") - write.table(representative_variant[,c("chr", "pos")], - quote = FALSE, col.names = FALSE, sep = "\t", row.names = FALSE, - file.path(args$outputdir, "representative_variants_vireo.csv")) - - + representative_variant <- separate(representative_variant, + col = "variants", + into = c("chr", "pos"), + sep = "_", extra = "drop" + ) + write.table(representative_variant[, c("chr", "pos")], + quote = FALSE, col.names = FALSE, sep = "\t", row.names = FALSE, + file.path(args$outputdir, "representative_variants_vireo.csv") + ) } diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index d775e91..384e837 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -6,66 +6,113 @@ library(argparse) # for emptyDrops parser <- ArgumentParser("Parameters for Empty Drops cell identification") parser$add_argument("--raw_hto_matrix_dir", - help = "A numeric/integer matrix-like object containing UMI counts prior to any filtering or cell calling. Rows correspond to HTOs and columns correspond to cell barcodes. ") -parser$add_argument("--lower", default = 100, type = "double", - help = "A numeric scalar specifying the lower bound on the total UMI count, at or below which all barcodes are assumed to correspond to empty droplets.") -parser$add_argument("--niters", default = 10000, type = "integer", - help = "An integer scalar specifying the number of iterations to use for the Monte Carlo p-value calculations.") -parser$add_argument("--testAmbient", action = "store_true", - help = "A logical scalar indicating whether results should be returned for barcodes with totals less than or equal to lower.") -parser$add_argument("--ignore", default = NULL, type = "double", - help = "A numeric scalar specifying the lower bound on the total UMI count, at or below which barcodes will be ignored.") -parser$add_argument("--alpha", default = NULL, type = "double", - help = "A numeric scalar specifying the scaling parameter for the Dirichlet-multinomial sampling scheme.") -parser$add_argument("--round", action = "store_true", - help="Logical scalar indicating whether to check for non-integer values in m and, if present, round them for ambient profile estimation.") -parser$add_argument("--byRank", default = NULL, type = "integer", - help = "An integer scalar parametrizing an alternative method for identifying assumed empty droplets. If set, this is used to redefine lower and any specified value for lower is ignored.") -parser$add_argument("--isCellFDR", default = 0.01, type = "double", - help = "Threshold to filter the cells.") -parser$add_argument("--objectOutEmptyDrops", default = "emptyDroplets", - help = "Prefix name for the emptyDrops RDS file") -parser$add_argument("--assignmentOutEmptyDrops", default = "emptyDroplets", - help = "prefex name for emptyDrops assignment CSV file") - -#for hashedDrops -parser$add_argument("--ambient", action = "store_true", - help = "Whether to use the relative abundance of each HTO in the ambient solution from emtpyDrops, set TRUE only when test_ambient is TRUE.") -parser$add_argument("--minProp", default = 0.05, type = "double", - help = "Numeric scalar to be used to infer the ambient profile when ambient=NULL,") -parser$add_argument("--pseudoCount", default = 5, type = "double", - help = "A numeric scalar specifying the minimum pseudo-count when computing logfold changes.") -parser$add_argument("--constantAmbient", action = "store_true", - help = "Logical scalar indicating whether a constant level of ambient contamination should be used to estimate LogFC2 for all cells.") -parser$add_argument("--doubletNmads", default = 3, type = "double", - help = "A numeric scalar specifying the number of median absolute deviations (MADs) to use to identify doublets.") -parser$add_argument("--doubletMin", default = 2, type = "double", - help = "A numeric scalar specifying the minimum threshold on the log-fold change to use to identify doublets.") -parser$add_argument("--doubletMixture", action = "store_true", - help = "Logical scalar indicating whether to use a 2-component mixture model to identify doublets.") -parser$add_argument("--confidentNmads", default = 3, type = "double", - help = "A numeric scalar specifying the number of MADs to use to identify confidently assigned singlets.") -parser$add_argument("--confidenMin", default = 2, type = "double", - help = "A numeric scalar specifying the minimum threshold on the log-fold change to use to identify singlets.") -parser$add_argument("--combinations", default = NULL, type = "integer", - help = "An integer matrix specifying valid combinations of HTOs. Each row corresponds to a single sample and specifies the indices of rows in x corresponding to the HTOs used to label that sample.") -parser$add_argument("--objectOutHashedDrops", default = "hashedDrops", - help = "Prefix name for the hashedDrops RDS file") -parser$add_argument("--assignmentOutHashedDrops", default = "hashedDrops", - help = "prefex name for hashedDrops assignment CSV file") + help = "A numeric/integer matrix-like object containing UMI counts prior to any filtering or cell calling. Rows correspond to HTOs and columns correspond to cell barcodes. " +) +parser$add_argument("--lower", + default = 100, type = "double", + help = "A numeric scalar specifying the lower bound on the total UMI count, at or below which all barcodes are assumed to correspond to empty droplets." +) +parser$add_argument("--niters", + default = 10000, type = "integer", + help = "An integer scalar specifying the number of iterations to use for the Monte Carlo p-value calculations." +) +parser$add_argument("--testAmbient", + action = "store_true", + help = "A logical scalar indicating whether results should be returned for barcodes with totals less than or equal to lower." +) +parser$add_argument("--ignore", + default = NULL, type = "double", + help = "A numeric scalar specifying the lower bound on the total UMI count, at or below which barcodes will be ignored." +) +parser$add_argument("--alpha", + default = NULL, type = "double", + help = "A numeric scalar specifying the scaling parameter for the Dirichlet-multinomial sampling scheme." +) +parser$add_argument("--round", + action = "store_true", + help = "Logical scalar indicating whether to check for non-integer values in m and, if present, round them for ambient profile estimation." +) +parser$add_argument("--byRank", + default = NULL, type = "integer", + help = "An integer scalar parametrizing an alternative method for identifying assumed empty droplets. If set, this is used to redefine lower and any specified value for lower is ignored." +) +parser$add_argument("--isCellFDR", + default = 0.01, type = "double", + help = "Threshold to filter the cells." +) +parser$add_argument("--objectOutEmptyDrops", + default = "emptyDroplets", + help = "Prefix name for the emptyDrops RDS file" +) +parser$add_argument("--assignmentOutEmptyDrops", + default = "emptyDroplets", + help = "prefex name for emptyDrops assignment CSV file" +) + +# for hashedDrops +parser$add_argument("--ambient", + action = "store_true", + help = "Whether to use the relative abundance of each HTO in the ambient solution from emtpyDrops, set TRUE only when test_ambient is TRUE." +) +parser$add_argument("--minProp", + default = 0.05, type = "double", + help = "Numeric scalar to be used to infer the ambient profile when ambient=NULL," +) +parser$add_argument("--pseudoCount", + default = 5, type = "double", + help = "A numeric scalar specifying the minimum pseudo-count when computing logfold changes." +) +parser$add_argument("--constantAmbient", + action = "store_true", + help = "Logical scalar indicating whether a constant level of ambient contamination should be used to estimate LogFC2 for all cells." +) +parser$add_argument("--doubletNmads", + default = 3, type = "double", + help = "A numeric scalar specifying the number of median absolute deviations (MADs) to use to identify doublets." +) +parser$add_argument("--doubletMin", + default = 2, type = "double", + help = "A numeric scalar specifying the minimum threshold on the log-fold change to use to identify doublets." +) +parser$add_argument("--doubletMixture", + action = "store_true", + help = "Logical scalar indicating whether to use a 2-component mixture model to identify doublets." +) +parser$add_argument("--confidentNmads", + default = 3, type = "double", + help = "A numeric scalar specifying the number of MADs to use to identify confidently assigned singlets." +) +parser$add_argument("--confidenMin", + default = 2, type = "double", + help = "A numeric scalar specifying the minimum threshold on the log-fold change to use to identify singlets." +) +parser$add_argument("--combinations", + default = NULL, type = "integer", + help = "An integer matrix specifying valid combinations of HTOs. Each row corresponds to a single sample and specifies the indices of rows in x corresponding to the HTOs used to label that sample." +) +parser$add_argument("--objectOutHashedDrops", + default = "hashedDrops", + help = "Prefix name for the hashedDrops RDS file" +) +parser$add_argument("--assignmentOutHashedDrops", + default = "hashedDrops", + help = "prefex name for hashedDrops assignment CSV file" +) parser$add_argument("--outputdir", help = "Output directory") -parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type="integer", default = 2) +parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type = "integer", default = 2) args <- parser$parse_args() hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) -emptyDrops_out <- emptyDrops(hto, lower = args$lower, niters = args$niters, - test.ambient = args$testAmbient, - ignore = args$ignore, - alpha = args$alpha, round = args$round, - by.rank = args$byRank) +emptyDrops_out <- emptyDrops(hto, + lower = args$lower, niters = args$niters, + test.ambient = args$testAmbient, + ignore = args$ignore, + alpha = args$alpha, round = args$round, + by.rank = args$byRank +) print("------------------- emptyDrops finished ---------------------------------") @@ -74,59 +121,63 @@ print("-------- Following Files are saved in folder hashedDrops_out ------------ print(paste0(args$objectOutEmptyDrops, ".rds")) print(paste0(args$assignmentOutEmptyDrops, ".csv")) write.csv(emptyDrops_out, paste0(args$outputdir, "/", args$assignmentOutEmptyDrops, ".csv")) -saveRDS(emptyDrops_out, file=paste0(args$outputdir, "/", args$objectOutEmptyDrops, ".rds")) +saveRDS(emptyDrops_out, file = paste0(args$outputdir, "/", args$objectOutEmptyDrops, ".rds")) print("------------------- filtering empty droplets ----------------------------") is.cell <- emptyDrops_out$FDR <= args$isCellFDR colors <- ifelse(is.cell, "red", "black") png(paste0(args$outputdir, "/", "plot_emptyDrops.png")) -plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col=colors, xlab="Total UMI count", ylab="-Log Probability") +plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col = colors, xlab = "Total UMI count", ylab = "-Log Probability") dev.off() if (args$ambient == TRUE) { - hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = args$combinations) + hashedDrops_out <- hashedDrops(hto[, which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = args$combinations) } else { - hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = args$combinations) + hashedDrops_out <- hashedDrops(hto[, which(is.cell)], min.prop = args$minProp, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = args$combinations) } print("------------------- hashedDrops finished ---------------------------------") ignore <- args$ignore if (is.null(ignore)) { - ignore <- "NULL" + ignore <- "NULL" } alpha <- args$alpha if (is.null(alpha)) { - alpha <- "NULL" + alpha <- "NULL" } byRank <- args$byRank if (is.null(byRank)) { - byRank <- "NULL" + byRank <- "NULL" } minProp <- args$minProp if (is.null(minProp)) { - minProp <- "NULL" + minProp <- "NULL" } combinations <- args$combinations if (is.null(combinations)) { - combinations <- "NULL" + combinations <- "NULL" } -Argument <- c("raw_hto_matrix_dir", "lower", "niters", - "testAmbient", "ignore", "alpha", "round", - "byRank", "isCellFDR", "ambient", "minProp", - "pseudoCount", "constantAmbient", "doubletNmads", "doubletMin", - "doubletMixture", "confidentNmads", "confidenMin", "combinations") -Value <- c(args$raw_hto_matrix_dir, args$lower, args$niters, - args$testAmbient, ignore, alpha, args$round, - byRank, args$isCellFDR, args$ambient, minProp, - args$pseudoCount, args$constantAmbient, args$doubletNmads, - args$doubletMin, args$doubletMixture, args$confidentNmads, - args$confidenMin, combinations) +Argument <- c( + "raw_hto_matrix_dir", "lower", "niters", + "testAmbient", "ignore", "alpha", "round", + "byRank", "isCellFDR", "ambient", "minProp", + "pseudoCount", "constantAmbient", "doubletNmads", "doubletMin", + "doubletMixture", "confidentNmads", "confidenMin", "combinations" +) +Value <- c( + args$raw_hto_matrix_dir, args$lower, args$niters, + args$testAmbient, ignore, alpha, args$round, + byRank, args$isCellFDR, args$ambient, minProp, + args$pseudoCount, args$constantAmbient, args$doubletNmads, + args$doubletMin, args$doubletMixture, args$confidentNmads, + args$confidenMin, combinations +) params <- data.frame(Argument, Value) @@ -135,19 +186,24 @@ print(paste0(args$objectOutHashedDrops, ".rds")) print(paste0(args$assignmentOutHashedDrops, "_res.csv")) print("params.csv") write.csv(params, paste0(args$outputdir, "/params.csv")) -write.csv(hashedDrops_out, - paste0(args$outputdir, "/", args$assignmentOutHashedDrops, "_res.csv")) +write.csv( + hashedDrops_out, + paste0(args$outputdir, "/", args$assignmentOutHashedDrops, "_res.csv") +) saveRDS(hashedDrops_out, - file = paste0(args$outputdir, "/", args$objectOutHashedDrops, ".rds")) + file = paste0(args$outputdir, "/", args$objectOutHashedDrops, ".rds") +) colors <- ifelse(hashedDrops_out$Confident, "black", - ifelse(hashedDrops_out$Doublet, "red", "grey")) + ifelse(hashedDrops_out$Doublet, "red", "grey") +) png(paste0(args$outputdir, "/", "plot_hashed.png")) if (sum(is.na(hashedDrops_out$LogFC2)) != length(hashedDrops_out$LogFC2)) { - print(paste0(args$objectOutHashedDrops, "_LogFC.png")) - plot(hashedDrops_out$LogFC, hashedDrops_out$LogFC2, col = colors, - xlab = "Log-fold change between best and second HTO", - ylab = "Log-fold change between second HTO and ambient") - dev.off() - -} \ No newline at end of file + print(paste0(args$objectOutHashedDrops, "_LogFC.png")) + plot(hashedDrops_out$LogFC, hashedDrops_out$LogFC2, + col = colors, + xlab = "Log-fold change between best and second HTO", + ylab = "Log-fold change between second HTO and ambient" + ) + dev.off() +} diff --git a/bin/pre_processing.R b/bin/pre_processing.R index 71794c8..6d582d1 100755 --- a/bin/pre_processing.R +++ b/bin/pre_processing.R @@ -1,5 +1,5 @@ #!/usr/bin/env Rscript -#Libraries +# Libraries library(Seurat) library(argparse) @@ -14,21 +14,19 @@ parser$add_argument("--selectMethod", help = "Selection method", default = "mean parser$add_argument("--numberFeatures", help = "Number of features to be used when finding variable features", type = "integer", default = 2000) parser$add_argument("--assay", help = "Assay name for hashing modality", default = "HTO") parser$add_argument("--normalisationMethod", help = "Normalisation method", default = "CLR") -parser$add_argument("--margin", help = "Margin for normalisation", type="integer", default = 2) -parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type="integer", default = 2) -parser$add_argument( "--OutputFile", help="Prefix of output files containing the output of HTODemux hashtag", default = "preprocessed") -parser$add_argument("--outputdir", help='Output directory') +parser$add_argument("--margin", help = "Margin for normalisation", type = "integer", default = 2) +parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type = "integer", default = 2) +parser$add_argument("--OutputFile", help = "Prefix of output files containing the output of HTODemux hashtag", default = "preprocessed") +parser$add_argument("--outputdir", help = "Output directory") args <- parser$parse_args() -#Check if RNA data is available -if(args$rdsObject){ +# Check if RNA data is available +if (args$rdsObject) { umi <- readRDS(args$fileUmi) counts <- readRDS(args$fileHto) - -}else{ - +} else { umi <- Read10X(data.dir = args$fileUmi, gene.column = args$gene_col) counts <- Read10X(data.dir = args$fileHto, gene.column = args$gene_col) } @@ -53,13 +51,8 @@ hashtag <- ScaleData(hashtag, features = VariableFeatures(hashtag)) hashtag[[args$assay]] <- CreateAssayObject(counts = counts) # Normalize HTO data hashtag <- NormalizeData(hashtag, assay = args$assay, normalization.method = args$normalisationMethod, margin = args$margin) - -#Save Results + +# Save Results saveRDS(hashtag, file = paste0(args$outputdir, "/", args$OutputFile, ".rds")) write.csv(params, paste0(args$outputdir, "/params.csv")) - - - - - diff --git a/conda/donor_match.yml b/conda/donor_match.yml index a11f26d..77cbb4c 100644 --- a/conda/donor_match.yml +++ b/conda/donor_match.yml @@ -3,13 +3,11 @@ channels: - conda-forge - bioconda dependencies: - - r-base=4.1 + - r-base - r-tidyverse - r-pheatmap - - r-magrittr - r-complexupset - r-data.table - r-argparse - r-vcfr - - r-ggplot2 - bcftools diff --git a/docs/source/_static/images/genotype-based.png b/docs/source/_static/images/genotype-based.png deleted file mode 100644 index 0f38bed..0000000 Binary files a/docs/source/_static/images/genotype-based.png and /dev/null differ diff --git a/docs/source/_static/images/genotype.png b/docs/source/_static/images/genotype.png new file mode 100644 index 0000000..7baeaa4 Binary files /dev/null and b/docs/source/_static/images/genotype.png differ diff --git a/docs/source/_static/images/hashing-based.png b/docs/source/_static/images/hashing-based.png deleted file mode 100644 index 74afb15..0000000 Binary files a/docs/source/_static/images/hashing-based.png and /dev/null differ diff --git a/docs/source/_static/images/hashing.png b/docs/source/_static/images/hashing.png new file mode 100644 index 0000000..8256f1c Binary files /dev/null and b/docs/source/_static/images/hashing.png differ diff --git a/docs/source/_static/images/pipeline.png b/docs/source/_static/images/pipeline.png index bcc1877..b8a555e 100644 Binary files a/docs/source/_static/images/pipeline.png and b/docs/source/_static/images/pipeline.png differ diff --git a/docs/source/_static/images/pipeline_v2.png b/docs/source/_static/images/pipeline_v2.png deleted file mode 100644 index 0b3305e..0000000 Binary files a/docs/source/_static/images/pipeline_v2.png and /dev/null differ diff --git a/docs/source/general.md b/docs/source/general.md index ad9d6d9..5a0d9cb 100644 --- a/docs/source/general.md +++ b/docs/source/general.md @@ -1,6 +1,10 @@ # General -![Caption](_static/images/pipeline_v2.png) +## **hadge: a comprehensive pipeline for donor deconvolution in single cell** + +Preprint manuscript is available [here](https://www.biorxiv.org/content/10.1101/2023.07.23.550061v2) + +![Caption](_static/images/pipeline.png) ## **Pipeline overview:** diff --git a/docs/source/genetic.md b/docs/source/genetic.md index aafb2bc..005cea5 100644 --- a/docs/source/genetic.md +++ b/docs/source/genetic.md @@ -2,15 +2,48 @@ Genotyped-based deconvolution leverages the unique genetic composition of individual samples to guarantee that the final cell mixture can be deconvolved. This can be conducted with genotype of origin or in a genotype-free mode using a genomic reference from unmatched donors, for example the 1000 genome project genotypes in a genotype-free. The result of this approach is a table of SNP assignment to cells that can be used to computationally infer the donors. One limitation of this approach is the need to produce additional data to genotype the individual donors in order to correctly assign the cell mixtures. -## **Genetics-based deconvolution (gene_demulti) in hadge** +## **gene_demulti in hadge** -![Caption](_static/images/genotype-based.png) +

+ +

## **Quick start** ```bash -cd hadge -nextflow run main.nf -profile test --mode genetic +nextflow run ${hadge_project_dir}/main.nf -profile test,conda_singularity --mode genetic +``` + +## **Example case** + +Case 1: Run the entire genotype-based mode without known donor genotype: + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda_singularity --outputdir ${output_dir} --mode genetic --bam ${bam_dir} --bai ${bai_dir} --barcodes ${barcodes_dir} --nsamples_genetic ${nsamples} --fasta ${fasta_dir} --fasta_index ${fasta_index_dir} --common_variants_scSplit ${common_variant_scsplit} --common_variants_souporcell ${common_variant_souporcell} --common_variants_freemuxlet ${common_variant_freemuxlet} --common_variants_cellsnp ${common_variant_cellsnp} --demuxlet False +``` + +Case 2: Skip cellSNP and run Vireo with available cell genotype file in VCF format: + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda --mode genetic --vireo_variant False --celldata ${cell_data_dir} +``` + +Case 3: Run Demuxlet with donor genotype: + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda --mode genetic --outputdir ${output_dir} --bam ${bam_dir} --bai ${bai_dir} --barcodes ${barcodes_dir} --vcf_donor ${donor_genotype_dir} +``` + +Case 4: Run scSplit without data pre-processing: + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda --mode genetic --scSplit_preprocess False //additional paramters as in case 1 +``` + +Case 5: Run the pipeline with different combinations of parameter. This is only available in the single sample mode. The values should be separated by semicolumn and double quoted. + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda_singularity --mode genetic --alpha "0.1;0.3;0.5" //additional paramters as in case 1 ``` ## **Input data preparation** @@ -106,7 +139,7 @@ output directory: `$pipeline_output_folder/cellsnp/cellsnp_[task_ID/sampleId]` - bam - bam_index - barcodes -- nsample +- nsamples_genetic - celldata - vcf_mixed - vcf_donor @@ -242,7 +275,7 @@ output directory: `$pipeline_output_folder/souporcell/souporcell_[task_ID/sample | bam | Input SAM/BAM/CRAM file. Must be sorted by coordinates and indexed. | | bai | Index of Input SAM/BAM/CRAM file. | | barcodes | List of cell barcodes to consider. | -| nsample | Number of samples multiplexed together | +| nsamples_genetic | Number of samples multiplexed together | | tag_group | Tag representing readgroup or cell barcodes, in the case to partition the BAM file into multiple groups. For 10x genomics, use CB Default: CB | | tag_UMI | Tag representing UMIs. For 10x genomiucs, use UB. Default: UB | | common_variants_freemuxlet | Input VCF/BCF file for dsc-pileup, containing the AC and AN field. | @@ -278,7 +311,7 @@ output directory: `$pipeline_output_folder/souporcell/souporcell_[task_ID/sample | vireo_preprocess | Whether to perform pre-processing on the input params.bam for cellSNP-lite. True: Perform pre-processing. Otherwise pre-processing is not called. Default: False | | vireo_variant | Whether to perform cellSNP-lite before running Vireo. True: Run cellSNP-lite. Otherwise cellSNP-lite is not called and params.celldata is used as input. Default: True | | celldata | The cell genotype file in VCF format or cellSNP folder with sparse matrices. | -| nsample | Number of donors to demultiplex; can be larger than provided in vcf_donor | +| nsamples_genetic | Number of donors to demultiplex; can be larger than provided in vcf_donor | | vartrixData | The cell genotype files in vartrix outputs (three/four files, comma separated): alt.mtx,ref.mtx,barcodes.tsv,SNPs.vcf.gz. This will suppress cellData argument. Default: None | | vcf_donor | The donor genotype file in VCF format. Default: None | | genoTag | The tag for donor genotype: GT, GP, PL. Default: GT | @@ -308,7 +341,7 @@ output directory: `$pipeline_output_folder/souporcell/souporcell_[task_ID/sample | barcodes | Barcodes whitelist. | | tag_group | Tag for barcode. Default: CB | | common_variants_scSplit | Common SNVs for scSplit. | -| nsample | Expected number of mixed samples. | +| nsamples_genetic | Expected number of mixed samples. | | refscSplit | Output Ref count matrix. Default: ref_filtered.csv | | altscSplit | Output Alt count matrix. Default: alt_filtered.csv | | subscSplit | The maximum number of subpopulations in autodetect mode. Default: 10 | @@ -329,7 +362,7 @@ output directory: `$pipeline_output_folder/souporcell/souporcell_[task_ID/sample | barcodes | Barcodes.tsv from cellranger | | fasta | Reference fasta file. | | fasta_index | Index of reference fasta file. | -| nsample | Number of clusters in the BAM file. | +| nsamples_genetic | Number of clusters in the BAM file. | | threads | Max threads to use. Default: 5 | | ploidy | Ploidy, must be 1 or 2. Default: 2 | | min_alt | Min alt to use locus. Default: 10 | diff --git a/docs/source/hashing.md b/docs/source/hashing.md index c849bfb..97da55c 100644 --- a/docs/source/hashing.md +++ b/docs/source/hashing.md @@ -1,31 +1,52 @@ -# Hashing demultiplexing +# Hashing-based deconvolution workflow Cell hashing is a sample processing technique that requires processing individual samples to “tag” the membrane of the cell or the nuclei with unique oligonucleotide barcodes. The cells are then washed or the reaction is quenched, and the samples can be safely mixed and processed following the standard library preparation procedure. Two libraries are generated after this process, one for the scRNA and one for the hashing oligos (HTO), which are independently sequenced to produce each a single cell count matrix, one for the RNA library and one for the HTO library. The hashtag counts are then bioinformatically processed to deconvolve the cell’s source sample. -## **Hashing-based deconvolution (hash_demulti) in hadge** +## **hash_demulti in hadge** -![Caption](_static/images/hashing-based.png) +

+ +

## **Quick start** ```bash -cd hadge -nextflow run main.nf -profile test --mode hashing +nextflow run ${hadge_project_dir}/main.nf -profile test,conda --mode hashing +``` + +## **Example case** + +Case 1: Run the entire hashing-based mode: + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda --outputdir ${output_dir} --mode hashing --hto_matrix_raw ${hto_raw_dir} --hto_matrix_filtered ${hto_filtered_dir} --rna_matrix_raw ${rna_raw_dir} --rna_matrix_filtered ${rna_filtered_dir} +``` + +Case 2: Run Multiseq with raw counts : + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda --outputdir ${output_dir} --mode hashing --rna_matrix_multiseq raw --hto_matrix_multiseq raw // additional parameters as in case 1 +``` + +Case 3: Run the pipeline with different combinations of parameter. This is only available in the single sample mode. The values should be separated by semicolumn and double quoted. + +```bash +nextflow run ${hadge_project_dir}/main.nf -profile conda --mode hashing --quantile_multi "0.5;0.7" //additional paramters as in case 1 ``` ## **Input data preparation** The input data depends heavily on the deconvolution tools. In the following table, you will find the minimal input data required by different tools. -| Deconvolution method | Input data | Parameter | -| -------------------- | ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------- | -| HTODemux | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_htodemux`
`params.hto_matrix_htodemux` | -| Multiseq | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_multiseq`
`params.hto_matrix_multiseq` | -| HashSolo | - 10x mtx directory with hashing count matrix (H5) | `params.hto_matrix_hashsolo`
`params.rna_matrix_hashsolo` | -| HashedDrops | - 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_hashedDrops` | -| Demuxem | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxem`
`params.rna_matrix_demuxem` | +| Deconvolution method | Input data | +| -------------------- | ------------------------------------------------------------- | +| HTODemux | UMI and hashing count matrix | +| Multiseq | UMI and hashing count matrix | +| HashSolo | - Required: Hashing count matrix - Optional: UMI count matrix | +| HashedDrops | Hashing count matrix | +| Demuxem | Both UMI and hashing count matrix | -Similary as genotype-based deconvlution methods, hashing methods also have some input in common. So we also try to utilize common input parameters `params.[rna/hto]_matrix_[raw/filtered]` to store count matrices for better control and `params.[rna/hto]_matrix_[method]` is used to specify whether to use raw or filtered counts for each method. +Similary as genotype-based deconvlution methods, hashing methods also have some input in common. So we also try to utilize common input parameters `params.[rna/hto]_matrix_[raw/filtered]` to store count matrices for better control and `params.[rna/hto]_matrix_[method]` is used to specify whether to use raw or filtered counts for each method, e.g. `hto_matrix_hashedDrops = "raw"` means that raw HTO count matrix is used as input for HTODemux. | Input data | Parameter | | ------------------------------ | ---------------------------- | @@ -36,7 +57,7 @@ Similary as genotype-based deconvlution methods, hashing methods also have some #### Pre-processing -Similar as in the genetic demultiplexing workflow, we provide a pre-processing step required before running HTODemux and Multiseq to load count matrices into a Seurat object. The input will be automatically loaded from the parameters set above. +Similar as in the genetic demultiplexing workflow, we provide a pre-processing step required before running HTODemux and Multiseq where the count matrices are loaded from the parameters set above into a Seurat object. ## **Output** @@ -114,13 +135,6 @@ output directory: `$pipeline_output_folder/hashedDrops/hashedDrops_[task_ID/samp - `${params.objectOutHashedDrops}_LogFC.png`: a diagnostic plot comparing the log-fold change between the second HTO's abundance and the ambient contamination - `params.csv`: specified parameters in the HashedDrops task -### Demuxmix - -output directory: `$pipeline_output_folder/demuxmix/demuxmix_[task_ID/sampleId]` - -- `${params.assignmentOutDemuxmix}_assignment_demuxmix.csv`: the assignment and classification results produced by Demuxmix -- `params.csv`: specified parameters in the Demuxmix task - ### GMM-Demux output directory: `$pipeline_output_folder/gmm_demux/gmm_demux_[task_ID/sampleId]` @@ -165,7 +179,7 @@ output directory: `$pipeline_output_folder/bff/bff_[task_ID/sampleId]` | quantile_htodemux | The quantile of inferred 'negative' distribution for each hashtag, over which the cell is considered 'positive'. Default: 0.99 | | kfunc | Clustering function for initial hashtag grouping. Default: clara. | | nstarts | nstarts value for k-means clustering when kfunc=kmeans. Default: 100 | -| nsamples | Number of samples to be drawn from the dataset used for clustering when kfunc= clara. Default: 100 | +| nsamples_clustering | Number of samples to be drawn from the dataset used for clustering when kfunc= clara. Default: 100 | | seed | Sets the random seed. Default: 42 | | init | Initial number of clusters for hashtags. Default: None, which means the # of hashtag oligo names + 1 to account for negatives. | | objectOutHTO | Name of the output Seurat object. Default: htodemux | @@ -206,36 +220,18 @@ output directory: `$pipeline_output_folder/bff/bff_[task_ID/sampleId]` | assignmentOutMulti | Prefix of the output CSV files. Default: multiseq | | objectOutMulti | Name of the output Seurat object. Default: multiseq | -### Solo - -| | | -| -------------------------- | ------------------------------------------------------------------------------------------------ | -| solo | Whether to perform Solo. Default: True | -| rna_matrix_solo | Input folder to RNA expression matrix in 10x format. | -| max_epochs | Number of epochs to train for. Default: 400 | -| lr | Learning rate for optimization. Default: 0.001 | -| train_size | Size of training set in the range between 0 and 1. Default: 0.9 | -| validation_size | Size of the test set. Default: 0.1 | -| batch_size | Minibatch size to use during training. Default: 128 | -| early_stopping | Adds callback for early stopping on validation_loss. Default: True | -| early_stopping_patience | Number of times early stopping metric can not improve over early_stopping_min_delta. Default: 30 | -| early_stopping_min_delta | Threshold for counting an epoch towards patience train(). Default: 10 | -| soft | Return probabilities instead of class label. Default: False | -| include_simulated_doublets | Return probabilities for simulated doublets as well. | -| assignmentOutSolo | Prefix of the output CSV files. Default: solo_predict | - ### HashSolo | | | | ------------------------ | -------------------------------------------------------------------------------------------- | | hashsolo | Whether to perform HashSolo. Default: True | +| use_rna_data | Whether to use RNA counts for deconvolution. Default: False | | rna_matrix_hashsolo | Whether to use raw or filtered scRNA-seq count matrix. Default: raw | | hto_matrix_hashsolo | Whether to use raw or filtered HTO count matrix if use_rna_data is set to True. Default: raw | | priors_negative | Prior for the negative hypothesis. Default: 1/3 | | priors_singlet | Prior for the singlet hypothesis. Default: 1/3 | | priors_doublet | Prior for the doublet hypothesis. Default: 1/3 | | pre_existing_clusters | Column in the input data for how to break up demultiplexing. Default: None | -| use_rna_data | Whether to use RNA counts for deconvolution. Default: False | | number_of_noise_barcodes | Number of barcodes to use to create noise distribution. Default: None | | assignmentOutHashSolo | Prefix of the output CSV files. Default: hashsolo | | plotOutHashSolo | Prefix of the output figures. Default: hashsolo | diff --git a/docs/source/index.md b/docs/source/index.md index fb0656e..871e75b 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -10,53 +10,61 @@ hadge is a one-stop pipeline for demultiplexing single cell mixtures. It consist The genetics-based deconvolution workflow includes 5 methods: -- Freemuxlet - Demuxlet -- Vireo -- Souporcell +- Freemuxlet - scSplit +- Souporcell +- Vireo -The hashing-based deconvolution includes 8 methods: +The hashing-based deconvolution includes 7 methods: -- hashedDrops -- Multiseq -- HTODemux -- Demuxem -- HashSolo -- Demuxmix - BFF +- Demuxem - GMM_Demux +- hashedDrops +- HashSolo +- HTODemux +- Multiseq ## **Installation** The hadge pipeline is implemented in Nextflow. To get started, you need to install Nextflow. Please refer to [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html#installation) for more details. Alternatively, you can also install Nextflow via [conda](https://anaconda.org/bioconda/nextflow). -As next, please run the pipeline +## **Quick start** + +To execute the pipeline locally, start by cloning the repository into a directory, for example, named ${hadge_project_dir}. ```bash -nextflow run http://github.com/theislab/hadge -r main +cd ${hadge_project_dir} && git clone https://github.com/theislab/hadge.git +nextflow run ${hadge_project_dir}/hadge/main.nf -profile conda_singularity ``` -You can also: +It is also allowed to run the pipeline from a directory outside the hadge project folder. + +Alternatively, you can also run the pipeline on the cloud: + +```bash +nextflow run http://github.com/theislab/hadge -r main -profile conda_singularity +``` + +Please note: - Choose the mode: `--mode=` - Specify the folder name `--outdir` to save the output files. This will create a folder automatically in the project directory. -- Specify the input data for each process. +- To run the pipeline with your own dataset, specify the input data and additional parrameters if needed. - The pipeline can be run either locally or on a HPC with different resource specifications. As default, the pipeline will run locally. You can also set the SLURM executor by running the pipeline with `-profile cluster`. - Please also check [](general) for more details. -## **Quick start** +To get familiar with hadge, we provide the test profile for a quick start. To access the test sample data, you can use the provided bash script to download the test data to the project directory of hadge and run the pipeline locally. ```bash -git clone https://github.com/theislab/hadge.git -cd hadge -sh test_data/download_data.sh -nextflow run main.nf -profile test +cd ${hadge_project_dir}/hadge && sh test_data/download_data.sh +nextflow run main.nf -profile test,conda_singularity ``` ## Notebook -Check[](../../notebook.ipynb) to get familiar with the output of hadge. +Check the [notebook](../../notebook.ipynb) to get familiar with the output of hadge. ## **Pipeline output** @@ -98,6 +106,7 @@ general genetic hashing rescue +notebook ``` # Indices and tables diff --git a/docs/source/rescue.md b/docs/source/rescue.md index 76bd6d2..e07d3e4 100644 --- a/docs/source/rescue.md +++ b/docs/source/rescue.md @@ -4,7 +4,9 @@ The joint call of hashing and genetic deconvolution methods has been shown to be ## Overview -![Caption](_static/images/rescue.png) +

+ +

## **Quick start** diff --git a/main.nf b/main.nf index 547e332..1768451 100644 --- a/main.nf +++ b/main.nf @@ -1,21 +1,21 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 include { run_multi } from './modules/multi_demultiplexing' include { gene_demultiplexing } from './modules/single/gene_demultiplexing' include { hash_demultiplexing } from './modules/single/hash_demultiplexing' include { donor_match } from './modules/single/donor_match' -process summary_all{ +process summary_all { publishDir "$projectDir/$params.outdir/$params.mode", mode: 'copy' label 'small_mem' - conda "pandas scanpy mudata" + conda 'pandas scanpy mudata' input: path gene_demulti_result path hash_demulti_result output: - path "summary" + path 'summary' script: """ @@ -23,89 +23,87 @@ process summary_all{ """ } -process generate_data{ +process generate_data { publishDir "$projectDir/$params.outdir/$params.mode/data_output", mode: 'copy' - conda "pandas scanpy mudata" + conda 'pandas scanpy mudata' input: path assignment val generate_anndata val generate_mudata val rna_matrix - val hto_matrix + val hto_matrix output: - path "adata_with_donor_matching.h5ad", optional: true - path "mudata_with_donor_matching.h5mu", optional: true - + path 'adata_with_donor_matching.h5ad', optional: true + path 'mudata_with_donor_matching.h5mu', optional: true script: - def generate_adata = "" - def generate_mdata = "" + def generate_adata = '' + def generate_mdata = '' - if (generate_anndata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." + if (generate_anndata == 'True') { + if (rna_matrix == 'None') { + error 'Error: RNA count matrix is not given.' } generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" } - if (generate_mudata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." + if (generate_mudata == 'True') { + if (rna_matrix == 'None') { + error 'Error: RNA count matrix is not given.' } - if(hto_matrix == "None"){ - error "Error: HTO count matrix is not given." + if (hto_matrix == 'None') { + error 'Error: HTO count matrix is not given.' } generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix" } - + """ generate_data.py --assignment $assignment $generate_adata $generate_mdata """ } -workflow run_single{ - if (params.mode == "genetic"){ +workflow run_single { + if (params.mode == 'genetic') { gene_demultiplexing() - if (params.match_donor == "True"){ + if (params.match_donor == 'True') { donor_match(gene_demultiplexing.out) } } - else if (params.mode == "hashing"){ - + else if (params.mode == 'hashing') { hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) - if (params.match_donor == "True"){ + if (params.match_donor == 'True') { donor_match(hash_demultiplexing.out) } } - else if (params.mode == "rescue"){ + else if (params.mode == 'rescue') { hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) gene_demultiplexing() gene_summary = gene_demultiplexing.out hash_summary = hash_demultiplexing.out summary_all(gene_summary, hash_summary) - if (params.match_donor == "True"){ + if (params.match_donor == 'True') { donor_match(summary_all.out) - if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ - generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, + if (params.generate_anndata == 'True' || params.generate_mudata == 'True') { + generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, params.rna_matrix_filtered, params.hto_matrix_filtered) } } } - else if (params.mode == "donor_match"){ + else if (params.mode == 'donor_match') { donor_match(params.demultiplexing_result) - if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ - generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, + if (params.generate_anndata == 'True' || params.generate_mudata == 'True') { + generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, params.rna_matrix_filtered, params.hto_matrix_filtered) } } } workflow { - if (params.multi_input == null){ + if (params.multi_input == null) { run_single() } - else{ + else { run_multi() } -} \ No newline at end of file +} diff --git a/modules/donor_match.nf b/modules/donor_match.nf index 070e2ef..723932f 100644 --- a/modules/donor_match.nf +++ b/modules/donor_match.nf @@ -1,35 +1,35 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process matchDonor{ +process matchDonor { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode", mode: 'copy' label 'big_mem' conda "$projectDir/conda/donor_match.yml" - + input: - tuple val (sampleId), val(ndonor), path(barcode_whitelist), val(cell_genotype), val(vireo_parent_dir), path(demultiplexing_result) + tuple val(sampleId), val(ndonor), path(barcode_whitelist), val(cell_genotype), val(vireo_parent_dir), path(demultiplexing_result) val method1_name val method2_name val findVariants val variant_count val variant_pct - + output: tuple val(sampleId), path("donor_match_${sampleId}") script: - def two_method = (method1_name != "None" & method2_name != "None") ? "--method1 $method1_name --method2 $method2_name" : "" - - def cell_genotype_path = "" - if (findVariants == "True" | findVariants == "default"){ - cell_genotype_path = cell_genotype != "None" ? "--cell_genotype $cell_genotype" : \ + def two_method = (method1_name != 'None' & method2_name != 'None') ? "--method1 $method1_name --method2 $method2_name" : '' + + def cell_genotype_path = '' + if (findVariants == 'True' | findVariants == 'default') { + cell_genotype_path = cell_genotype != 'None' ? "--cell_genotype $cell_genotype" : \ "--cell_genotype $projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/cellSNP/cellsnp_1/*/cellSNP.cells.vcf.gz" } - def vireo_parent_path = "" - if ( findVariants == 'vireo' | findVariants == 'True' ){ - vireo_parent_path = (params.mode == "donor_match" & vireo_parent_dir != "None") ? "--vireo_parent_dir $vireo_parent_dir" : "--vireo_parent_dir $projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/vireo/" + def vireo_parent_path = '' + if (findVariants == 'vireo' | findVariants == 'True') { + vireo_parent_path = (params.mode == 'donor_match' & vireo_parent_dir != 'None') ? "--vireo_parent_dir $vireo_parent_dir" : "--vireo_parent_dir $projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/vireo/" } def barcode_whitelist_path = "--barcode $barcode_whitelist" """ @@ -39,7 +39,7 @@ process matchDonor{ donor_match.R --result_csv $demultiplexing_result $barcode_whitelist_path --findVariants $findVariants \ $cell_genotype_path --variant_pct $variant_pct --variant_count $variant_count --ndonor $ndonor \ $two_method --outputdir \$outputdir $vireo_parent_path - + if ([ "$findVariants" != "False" ]); then best_method_vireo="\$(head -n 1 \$outputdir/best_method_vireo.txt)" if ([ "$params.mode" != "donor_match" ]); then @@ -47,7 +47,7 @@ process matchDonor{ else donor_genotype="\$(find $vireo_parent_dir/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" fi - + if ([ "$findVariants" = "True" ] || [ "$findVariants" = "default" ]); then gunzip -c \$donor_genotype > \$outputdir/GT_donors.vireo.vcf if ([ \$(grep "^##config" \$outputdir/GT_donors.vireo.vcf | wc -l) == 0 ]); then @@ -69,7 +69,7 @@ process matchDonor{ bcftools filter \$outputdir/compressed_sorted_donor_genotype.vcf.gz -R \$outputdir/representative_variants_vireo.csv > \$outputdir/donor_genotype_subset_by_vireo.vcf bcftools reheader --samples \$outputdir/donor_match.csv -o \$outputdir/donor_genotype_subset_by_vireo_matched.vcf \$outputdir/donor_genotype_subset_by_vireo.vcf fi - + if ([ "$findVariants" = "vireo" ]); then bcftools sort \$donor_genotype -Oz -o \$outputdir/compressed_sorted_donor_genotype.vcf.gz bcftools index \$outputdir/compressed_sorted_donor_genotype.vcf.gz @@ -81,16 +81,15 @@ process matchDonor{ rm \$outputdir/compressed_sorted_donor_genotype.vcf.gz rm \$outputdir/compressed_sorted_donor_genotype.vcf.gz.csi fi - + """ } - -workflow donor_match{ +workflow donor_match { take: demultiplexing_result main: - matchDonor(demultiplexing_result, params.match_donor_method1, params.match_donor_method2, + matchDonor(demultiplexing_result, params.match_donor_method1, params.match_donor_method2, params.findVariants, params.variant_count, params.variant_pct) emit: matchDonor.out diff --git a/modules/gene_demultiplexing.nf b/modules/gene_demultiplexing.nf index 938eba7..d6d4bfb 100644 --- a/modules/gene_demultiplexing.nf +++ b/modules/gene_demultiplexing.nf @@ -1,5 +1,5 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 include { data_preprocess } from './gene_demulti/samtools' include { filter_variant } from './gene_demulti/bcftools' include { variant_cellSNP } from './gene_demulti/cellsnp' @@ -10,11 +10,11 @@ include { demultiplex_scSplit } from './gene_demulti/scsplit' include { demultiplex_souporcell } from './gene_demulti/souporcell' include { demultiplex_vireo } from './gene_demulti/vireo' -process summary{ +process summary { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti", mode: 'copy' label 'small_mem' - conda "pandas scanpy mudata" + conda 'pandas scanpy mudata' input: tuple val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(rna_matrix, stageAs: 'rna_data') @@ -25,238 +25,234 @@ process summary{ val scsplit_result val generate_anndata val generate_mudata - + output: - tuple val(sampleId), path("genetic_summary") + tuple val(sampleId), path('genetic_summary') script: - def demuxlet_files = "" - def freemuxlet_files = "" - def vireo_files = "" - def souporcell_files = "" - def scsplit_files = "" - def generate_adata = "" - def generate_mdata = "" - - if (demuxlet_result != "no_result"){ - demuxlet_res = demuxlet_result.find{it.name.contains(sampleId)} + def demuxlet_files = '' + def freemuxlet_files = '' + def vireo_files = '' + def souporcell_files = '' + def scsplit_files = '' + def generate_adata = '' + def generate_mdata = '' + + if (demuxlet_result != 'no_result') { + demuxlet_res = demuxlet_result.find { it.name.contains(sampleId) } demuxlet_files = "--demuxlet ${demuxlet_res}" } - if (freemuxlet_result != "no_result"){ - freemuxlet_res = freemuxlet_result.find{it.name.contains(sampleId)} + if (freemuxlet_result != 'no_result') { + freemuxlet_res = freemuxlet_result.find { it.name.contains(sampleId) } freemuxlet_files = "--freemuxlet ${freemuxlet_res}" } - if (vireo_result != "no_result"){ - vireo_res = vireo_result.find{it.name.contains(sampleId)} + if (vireo_result != 'no_result') { + vireo_res = vireo_result.find { it.name.contains(sampleId) } vireo_files = "--vireo ${vireo_res}" } - if (souporcell_result != "no_result"){ - souporcell_res = souporcell_result.find{it.name.contains(sampleId)} + if (souporcell_result != 'no_result') { + souporcell_res = souporcell_result.find { it.name.contains(sampleId) } souporcell_files = "--souporcell ${souporcell_res}" } - if (scsplit_result != "no_result"){ - scsplit_res = scsplit_result.find{it.name.contains(sampleId)} + if (scsplit_result != 'no_result') { + scsplit_res = scsplit_result.find { it.name.contains(sampleId) } scsplit_files = "--scsplit ${scsplit_res}" } - if (generate_anndata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." + if (generate_anndata == 'True') { + if (rna_matrix.name == 'None') { + error 'Error: RNA count matrix is not given.' } - generate_adata = "--generate_anndata --read_rna_mtx rna_data" + generate_adata = '--generate_anndata --read_rna_mtx rna_data' } - if (generate_mudata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." + if (generate_mudata == 'True') { + if (rna_matrix.name == 'None') { + error 'Error: RNA count matrix is not given.' } - if(hto_matrix.name == "None"){ - error "Error: HTO count matrix is not given." + if (hto_matrix.name == 'None') { + error 'Error: HTO count matrix is not given.' } - generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" + generate_mdata = '--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data' } """ summary_gene.py $demuxlet_files $vireo_files $souporcell_files $scsplit_files $freemuxlet_files $generate_adata $generate_mdata """ } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } - workflow gene_demultiplexing { - if ((params.demuxlet == "True" & params.demuxlet_preprocess == "True")| \ - (params.freemuxlet == "True" & params.freemuxlet_preprocess == "True")| \ - (params.scSplit == "True" & params.scSplit_preprocess == "True") | \ - (params.vireo == "True" & params.vireo_preprocess == "True") | \ - (params.souporcell == "True" & params.souporcell_preprocess == "True")){ + if ((params.demuxlet == 'True' & params.demuxlet_preprocess == 'True') | \ + (params.freemuxlet == 'True' & params.freemuxlet_preprocess == 'True')| \ + (params.scSplit == 'True' & params.scSplit_preprocess == 'True') | \ + (params.vireo == 'True' & params.vireo_preprocess == 'True') | \ + (params.souporcell == 'True' & params.souporcell_preprocess == 'True')) { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam)} + | map { row-> tuple(row.sampleId, row.bam) } | data_preprocess - qc_bam = data_preprocess.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/sorted.bam", it + "/sorted.bam.bai") } - } + qc_bam = data_preprocess.out.map { it -> tuple(it.name.tokenize('_').last(), it + '/sorted.bam', it + '/sorted.bam.bai') } + } - if (params.scSplit == "True" & params.scSplit_variant == 'True'){ - freebayes_region = Channel.from(1..22, "X","Y").flatten() - if (params.region != "None"){ + if (params.scSplit == 'True' & params.scSplit_variant == 'True') { + freebayes_region = Channel.from(1..22, 'X', 'Y').flatten() + if (params.region != 'None') { freebayes_region = split_input(params.region) } - if(params.scSplit_preprocess == "True"){ + if (params.scSplit_preprocess == 'True') { variant_freebayes(qc_bam, freebayes_region) } - else{ + else { input_list = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + | map { row-> tuple(row.sampleId, row.bam, row.bam_index) } variant_freebayes(input_list, freebayes_region) } filter_variant(variant_freebayes.out) - freebayes_vcf = filter_variant.out.map{ it -> tuple(it[0], it[1] + "/filtered_sorted_total_chroms.vcf")} - + freebayes_vcf = filter_variant.out.map { it -> tuple(it[0], it[1] + '/filtered_sorted_total_chroms.vcf') } } - if (params.vireo == "True" & params.vireo_variant == 'True'){ - if(params.vireo_preprocess == 'True'){ + if (params.vireo == 'True' & params.vireo_variant == 'True') { + if (params.vireo_preprocess == 'True') { input_param_cellsnp = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes) } qc_bam_new = qc_bam.join(input_param_cellsnp) variant_cellSNP(qc_bam_new) } - else{ + else { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index, row.barcodes)} + | map { row-> tuple(row.sampleId, row.bam, row.bam_index, row.barcodes) } | variant_cellSNP } - cellsnp_vcf = variant_cellSNP.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } + cellsnp_vcf = variant_cellSNP.out.map { it -> tuple(it.name.tokenize('_').last(), it + '/*/cellSNP.cells.vcf') } } - if (params.scSplit == "True"){ - if (params.scSplit_preprocess == 'False'){ + if (params.scSplit == 'True') { + if (params.scSplit_preprocess == 'False') { input_bam_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + | map { row-> tuple(row.sampleId, row.bam, row.bam_index) } } - else{ + else { input_bam_scsplit = qc_bam } - if (params.scSplit_variant == 'True'){ + if (params.scSplit_variant == 'True') { input_vcf_scsplit = freebayes_vcf } - else{ + else { input_vcf_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.vcf_mixed)} + | map { row-> tuple(row.sampleId, row.vcf_mixed) } } input_param_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} - + | map { row-> tuple(row.sampleId, row.barcodes, row.nsamples_genetic, row.vcf_donor) } + input_list_scsplit = input_bam_scsplit.join(input_vcf_scsplit) input_list_scsplit = input_list_scsplit.join(input_param_scsplit) demultiplex_scSplit(input_list_scsplit) scSplit_out = demultiplex_scSplit.out } - else{ - scSplit_out = channel.value("no_result") + else { + scSplit_out = channel.value('no_result') } - if (params.vireo == "True"){ - if (params.vireo_variant == 'True'){ + if (params.vireo == 'True') { + if (params.vireo_variant == 'True') { input_vcf_vireo = cellsnp_vcf } - else{ + else { input_vcf_vireo = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.celldata)} + | map { row-> tuple(row.sampleId, row.celldata) } } input_param_vireo = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.nsample, row.vcf_donor)} - + | map { row-> tuple(row.sampleId, row.nsamples_genetic, row.vcf_donor) } + input_list_vireo = input_vcf_vireo.join(input_param_vireo) demultiplex_vireo(input_list_vireo) vireo_out = demultiplex_vireo.out } - else{ - vireo_out = channel.value("no_result") + else { + vireo_out = channel.value('no_result') } - if (params.demuxlet == "True"){ - if (params.demuxlet_preprocess == 'False'){ + if (params.demuxlet == 'True') { + if (params.demuxlet_preprocess == 'False') { input_bam_demuxlet = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + | map { row-> tuple(row.sampleId, row.bam, row.bam_index) } } - else{ + else { input_bam_demuxlet = qc_bam } input_param_demuxlet = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.vcf_donor)} + | map { row-> tuple(row.sampleId, row.barcodes, row.vcf_donor) } input_list_demuxlet = input_bam_demuxlet.join(input_param_demuxlet) demultiplex_demuxlet(input_list_demuxlet) demuxlet_out = demultiplex_demuxlet.out } - else{ - demuxlet_out = channel.value("no_result") + else { + demuxlet_out = channel.value('no_result') } - if (params.freemuxlet == "True"){ - if (params.freemuxlet_preprocess == 'False'){ + if (params.freemuxlet == 'True') { + if (params.freemuxlet_preprocess == 'False') { input_bam_freemuxlet = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + | map { row-> tuple(row.sampleId, row.bam, row.bam_index) } } - else{ + else { input_bam_freemuxlet = qc_bam } input_param_freemuxlet = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.nsample)} + | map { row-> tuple(row.sampleId, row.barcodes, row.nsamples_genetic) } input_list_freemuxlet = input_bam_freemuxlet.join(input_param_freemuxlet) demultiplex_freemuxlet(input_list_freemuxlet) freemuxlet_out = demultiplex_freemuxlet.out } - else{ - freemuxlet_out = channel.value("no_result") + else { + freemuxlet_out = channel.value('no_result') } - if (params.souporcell == "True"){ - if (params.souporcell_preprocess == 'False'){ + if (params.souporcell == 'True') { + if (params.souporcell_preprocess == 'False') { input_bam_souporcell = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + | map { row-> tuple(row.sampleId, row.bam, row.bam_index) } } - else{ + else { input_bam_souporcell = qc_bam } input_param_souporcell = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} + | map { row-> tuple(row.sampleId, row.barcodes, row.nsamples_genetic, row.vcf_donor) } input_list_souporcell = input_bam_souporcell.join(input_param_souporcell) demultiplex_souporcell(input_list_souporcell) souporcell_out = demultiplex_souporcell.out } - else{ - souporcell_out = channel.value("no_result") + else { + souporcell_out = channel.value('no_result') } Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} - | set {input_list_summary} - summary(input_list_summary, demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, + | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered)) } + | set { input_list_summary } + summary(input_list_summary, demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, params.generate_anndata, params.generate_mudata) - - + emit: - summary.out + summary.out } - diff --git a/modules/hash_demulti/bff.nf b/modules/hash_demulti/bff.nf index 3cc1e1b..c656133 100644 --- a/modules/hash_demulti/bff.nf +++ b/modules/hash_demulti/bff.nf @@ -1,14 +1,14 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process bff{ +process bff { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/bff", mode:'copy' label 'small_mem' - + conda "$projectDir/conda/bff.yml" - + input: - + tuple val(sampleId), path(hto_matrix, stageAs: 'hto_data') val methods val methodsForConsensus @@ -23,27 +23,24 @@ process bff{ val assignmentOutBff val preprocess_bff val barcodeWhitelist - + output: path "bff_${sampleId}" - - + script: - + """ mkdir bff_${sampleId} bff.R --fileHto hto_data --methods $methods --methodsForConsensus $methodsForConsensus \ - --cellbarcodeWhitelist $cellbarcodeWhitelist --metricsFile bff_${sampleId}_$metricsFile \ - --doTSNE $doTSNE --doHeatmap $doHeatmap --perCellSaturation $perCellSaturation --majorityConsensusThreshold $majorityConsensusThreshold \ - --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${sampleId} \ - --assignmentOutBff $assignmentOutBff --preprocess $preprocess_bff --barcodeWhitelist $barcodeWhitelist + --cellbarcodeWhitelist $cellbarcodeWhitelist --metricsFile bff_${sampleId}_$metricsFile \ + --doTSNE $doTSNE --doHeatmap $doHeatmap --perCellSaturation $perCellSaturation --majorityConsensusThreshold $majorityConsensusThreshold \ + --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${sampleId} \ + --assignmentOutBff $assignmentOutBff --preprocess $preprocess_bff --barcodeWhitelist $barcodeWhitelist """ - } - -workflow bff_hashing{ - take: +workflow bff_hashing { + take: hto_matrix main: methods = params.methods @@ -63,13 +60,11 @@ workflow bff_hashing{ bff(hto_matrix, methods, methodsForConsensus, cellbarcodeWhitelist, metricsFile, doTSNE, doHeatmap, perCellSaturation, majorityConsensusThreshold, chemistry, callerDisagreementThreshold, assignmentOutBff, preprocess_bff, barcodeWhitelist) - + emit: bff.out.collect() } - -workflow{ +workflow { bff_hashing() - } diff --git a/modules/hash_demulti/demuxem.nf b/modules/hash_demulti/demuxem.nf index 71493e7..ae9a5e3 100644 --- a/modules/hash_demulti/demuxem.nf +++ b/modules/hash_demulti/demuxem.nf @@ -1,14 +1,14 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process demuxem{ +process demuxem { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/demuxem", mode:'copy' label 'small_mem' - conda "bioconda::pegasuspy bioconda::scanpy bioconda::demuxEM" - + conda 'bioconda::pegasuspy bioconda::scanpy bioconda::demuxEM' + input: - tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxem}"), + tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxem}"), path(raw_rna_matrix_dir, stageAs: "rna_data_${params.rna_matrix_demuxem}") val threads val alpha @@ -23,9 +23,9 @@ process demuxem{ val filter_demuxem output: path "demuxem_${sampleId}" - + script: - def generateGenderPlot = generate_gender_plot != "None" ? " --generateGenderPlot ${generate_gender_plot}" : '' + def generateGenderPlot = generate_gender_plot != 'None' ? " --generateGenderPlot ${generate_gender_plot}" : '' """ mkdir demuxem_${sampleId} demuxem.py --rna_matrix_dir rna_data_${params.rna_matrix_demuxem} --hto_matrix_dir hto_data_${params.hto_matrix_demuxem} \ @@ -33,12 +33,11 @@ process demuxem{ --min_num_genes $min_num_genes --min_num_umis $min_num_umis --alpha $alpha --alpha_noise $alpha_noise \ --n_threads $threads $generateGenderPlot --objectOutDemuxem $objectOutDemuxem --outputdir demuxem_${sampleId} \ --filter_demuxem $filter_demuxem - - """ + """ } -workflow demuxem_hashing{ +workflow demuxem_hashing { take: input_list main: @@ -54,9 +53,9 @@ workflow demuxem_hashing{ objectOutDemuxem = params.objectOutDemuxem filter_demuxem = params.filter_demuxem - demuxem(input_list, threads, alpha, alpha_noise, tol, min_num_genes, min_num_umis, + demuxem(input_list, threads, alpha, alpha_noise, tol, min_num_genes, min_num_umis, min_signal, random_state, generate_gender_plot, objectOutDemuxem, filter_demuxem) - + emit: demuxem.out.collect() -} \ No newline at end of file +} diff --git a/modules/hash_demulti/demuxmix.nf b/modules/hash_demulti/demuxmix.nf index 5103a02..6bfa109 100644 --- a/modules/hash_demulti/demuxmix.nf +++ b/modules/hash_demulti/demuxmix.nf @@ -1,14 +1,14 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process demuxmix{ +process demuxmix { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/demuxmix", mode:'copy' label 'small_mem' - + conda "$projectDir/conda/demuxmix.yml" - + input: - tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxmix}"), + tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxmix}"), path(raw_rna_matrix_dir, stageAs: "rna_data_${params.rna_matrix_demuxmix}") val hto_raw_or_filtered val rna_raw_or_filtered @@ -25,10 +25,10 @@ process demuxmix{ val correctTails val assignmentOutDemuxmix val gene_col - + output: path "demuxmix_${sampleId}" - + script: """ mkdir demuxmix_${sampleId} @@ -38,12 +38,11 @@ process demuxmix{ --maxIter_demuxmix $maxIter_demuxmix --correctTails $correctTails --k_hto $k_hto --k_rna $k_rna --outputdir demuxmix_${sampleId} \ --assignmentOutDemuxmix $assignmentOutDemuxmix --gene_col $gene_col """ - } -workflow demuxmix_hashing{ +workflow demuxmix_hashing { take: - input_list + input_list main: assay = params.assay ndelim = params.ndelim @@ -55,16 +54,15 @@ workflow demuxmix_hashing{ k_hto = params.k_hto k_rna = params.k_rna correctTails = params.correctTails - assignmentOutDemuxmix = params.assignmentOutDemuxmix + assignmentOutDemuxmix = params.assignmentOutDemuxmix gene_col = params.gene_col - - demuxmix(input_list, assay,ndelim,model,alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna,correctTails,assignmentOutDemuxmix, gene_col) - + demuxmix(input_list, assay, ndelim, model, alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna, correctTails, assignmentOutDemuxmix, gene_col) + emit: demuxmix.out.collect() } -workflow{ +workflow { demuxmix_hashing() } diff --git a/modules/hash_demulti/hashsolo.nf b/modules/hash_demulti/hashsolo.nf index 8b2328b..534a0ce 100755 --- a/modules/hash_demulti/hashsolo.nf +++ b/modules/hash_demulti/hashsolo.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 -process hash_solo{ +nextflow.enable.dsl = 2 +process hash_solo { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/hashsolo", mode:'copy' label 'small_mem' conda "$projectDir/conda/hashsolo_py.yml" - + input: tuple val(sampleId), path(hto_data, stageAs: "hto_data_${params.hto_matrix_hashsolo}"), path(rna_data, stageAs: "rna_data_${params.rna_matrix_hashsolo}") val priors_negative @@ -16,13 +16,13 @@ process hash_solo{ val number_of_noise_barcodes val assignmentOutHashSolo val plotOutHashSolo - + output: path "hashsolo_${sampleId}" script: - def noise_barcodes = number_of_noise_barcodes != "None" ? "--number_of_noise_barcodes $number_of_noise_barcodes" : '' - def existing_clusters = pre_existing_clusters != "None" ? "--pre_existing_clusters $pre_existing_clusters" : '' + def noise_barcodes = number_of_noise_barcodes != 'None' ? "--number_of_noise_barcodes $number_of_noise_barcodes" : '' + def existing_clusters = pre_existing_clusters != 'None' ? "--pre_existing_clusters $pre_existing_clusters" : '' def clustering_data = use_rna_data != 'False' ? "--clustering_data rna_data_$params.rna_matrix_hashsolo" : '' """ mkdir hashsolo_${sampleId} @@ -31,7 +31,6 @@ process hash_solo{ --assignmentOutHashSolo $assignmentOutHashSolo \ --plotOutHashSolo $plotOutHashSolo --outputdir hashsolo_${sampleId} """ - } workflow hash_solo_hashing { @@ -50,4 +49,4 @@ workflow hash_solo_hashing { hash_solo(input_list, priors_negative, priors_singlet, priors_doublet, pre_existing_clusters, use_rna_data, number_of_noise_barcodes, assignmentOutHashSolo, plotOutHashSolo) emit: hash_solo.out.collect() -} \ No newline at end of file +} diff --git a/modules/hash_demulti/htodemux.nf b/modules/hash_demulti/htodemux.nf index bf18a55..3170ccc 100644 --- a/modules/hash_demulti/htodemux.nf +++ b/modules/hash_demulti/htodemux.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process htodemux{ +process htodemux { publishDir "$projectDir/$params.outdir/${seurat_object.name.tokenize( '_' )[1]}/$params.mode/hash_demulti/htodemux", mode: 'copy' label 'small_mem' - conda "conda-forge::r-seurat conda-forge::r-argparse" - + conda 'conda-forge::r-seurat conda-forge::r-argparse' + input: each seurat_object val assay @@ -18,7 +18,7 @@ process htodemux{ val init val objectOutHTO val assignmentOutHTO - + //Ridge plot params val ridgePlot val ridgeNCol @@ -41,18 +41,18 @@ process htodemux{ //Heatmap val heatmap val heatmapNcells - + output: path "htodemux_${seurat_object.name.tokenize( '_' )[1]}" - + script: - def sampleId = seurat_object.name.tokenize( '_' )[1] + def sampleId = seurat_object.name.tokenize('_')[1] def init_val = init != 'None' ? " --init $init" : '' - def vln_log = vlnLog != 'False' ? "--vlnLog" : '' - def invert = tsneInvert != 'False' ? "--tSNEInvert" : '' - def verbose = tsneVerbose != 'False' ? "--tSNEVerbose" : '' - def approx = tsneApprox != 'False' ? "--tSNEApprox" : '' - + def vln_log = vlnLog != 'False' ? '--vlnLog' : '' + def invert = tsneInvert != 'False' ? '--tSNEInvert' : '' + def verbose = tsneVerbose != 'False' ? '--tSNEVerbose' : '' + def approx = tsneApprox != 'False' ? '--tSNEApprox' : '' + """ mkdir htodemux_${sampleId} HTODemux.R --seuratObject $seurat_object --assay $assay --quantile $quantile --kfunc $kfunc \ @@ -65,10 +65,9 @@ process htodemux{ --tSNEDimMax $tsneDimMax --tSNEPerplexity $tsnePerplexity --heatMap $heatmap --heatMapNcells $heatmapNcells \ --outputdir htodemux_${sampleId} """ - } -workflow htodemux_hashing{ +workflow htodemux_hashing { take: seurat_object main: @@ -76,12 +75,12 @@ workflow htodemux_hashing{ assay = params.assay kfunc = params.kfunc nstarts = params.nstarts - nsamples = params.nsamples + nsamples = params.nsamples_clustering seed = params.seed init = params.init objectOutHTO = params.objectOutHTO assignmentOutHTO = params.assignmentOutHTO - + ridgePlot = params.ridgePlot ridgeNCol = params.ridgeNCol featureScatter = params.featureScatter @@ -90,7 +89,7 @@ workflow htodemux_hashing{ vlnplot = params.vlnplot vlnFeatures = params.vlnFeatures vlnLog = params.vlnLog - + tsne = params.tsne tsneIdents = params.tsneIdents tsneInvert = params.tsneInvert diff --git a/modules/hash_demulti/multiseq.nf b/modules/hash_demulti/multiseq.nf index db9c58b..63de4ed 100644 --- a/modules/hash_demulti/multiseq.nf +++ b/modules/hash_demulti/multiseq.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process multi_seq{ +process multi_seq { publishDir "$projectDir/$params.outdir/${seurat_object.name.tokenize( '_' )[1]}/$params.mode/hash_demulti/multiseq", mode: 'copy' label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" + + conda 'conda-forge::r-seurat conda-forge::r-argparse' input: each seurat_object @@ -24,10 +24,10 @@ process multi_seq{ path "multiseq_${seurat_object.name.tokenize( '_' )[1]}" script: - def sampleId = seurat_object.name.tokenize( '_' )[1] - def autoThr = autoThresh != 'False' ? " --autoThresh" : '' - def verb = verbose != 'False' ? " --verbose" : '' - + def sampleId = seurat_object.name.tokenize('_')[1] + def autoThr = autoThresh != 'False' ? ' --autoThresh' : '' + def verb = verbose != 'False' ? ' --verbose' : '' + """ mkdir multiseq_${sampleId} MultiSeq.R --seuratObjectPath $seurat_object --assay $assay --quantile $quantile $autoThr \ @@ -36,7 +36,7 @@ process multi_seq{ """ } -workflow multiseq_hashing{ +workflow multiseq_hashing { take: seurat_object main: diff --git a/modules/hash_demulti/preprocess.nf b/modules/hash_demulti/preprocess.nf index 752d327..d866d6a 100644 --- a/modules/hash_demulti/preprocess.nf +++ b/modules/hash_demulti/preprocess.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process preprocess{ +process preprocess { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/preprocess", mode:'copy' label 'small_mem' - conda "conda-forge::r-seurat conda-forge::r-argparse" - + conda 'conda-forge::r-seurat conda-forge::r-argparse' + input: tuple val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(umi_matrix, stageAs: 'rna_data') val hto_raw_or_filtered @@ -32,7 +32,7 @@ process preprocess{ """ } -workflow preprocessing_hashing{ +workflow preprocessing_hashing { take: input_list hto_raw_or_filtered @@ -47,8 +47,8 @@ workflow preprocessing_hashing{ out_file = params.preprocessOut gene_col = params.gene_col - preprocess(input_list, hto_raw_or_filtered, rna_raw_or_filtered,ndelim,sel_method, n_features, assay, margin, norm_method, out_file, gene_col) + preprocess(input_list, hto_raw_or_filtered, rna_raw_or_filtered, ndelim, sel_method, n_features, assay, margin, norm_method, out_file, gene_col) emit: preprocess.out.collect() } - \ No newline at end of file + diff --git a/modules/hash_demultiplexing.nf b/modules/hash_demultiplexing.nf index ba215d4..6a1c229 100644 --- a/modules/hash_demultiplexing.nf +++ b/modules/hash_demultiplexing.nf @@ -1,5 +1,5 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 include { preprocessing_hashing as preprocessing_hashing_htodemux } from './hash_demulti/preprocess' include { preprocessing_hashing as preprocessing_hashing_multiseq } from './hash_demulti/preprocess' include { multiseq_hashing } from './hash_demulti/multiseq' @@ -11,11 +11,11 @@ include { demuxmix_hashing } from './hash_demulti/demuxmix' include { gmm_demux_hashing } from './hash_demulti/gmm_demux' include { bff_hashing } from './hash_demulti/bff' -process summary{ +process summary { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti", mode: 'copy' label 'small_mem' - - conda "pandas scanpy mudata" + + conda 'pandas scanpy mudata' input: tuple val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(rna_matrix, stageAs: 'rna_data') @@ -29,193 +29,191 @@ process summary{ val gmmDemux_result val generate_anndata val generate_mudata - + output: - tuple val(sampleId), path("hash_summary") + tuple val(sampleId), path('hash_summary') script: - def demuxem_files = "" - def htodemux_files = "" - def hashsolo_files = "" - def multiseq_files = "" - def hashedDrops_files = "" - def gmmDemux_files = "" - def demuxmix_files = "" - def bff_files = "" - def generate_adata = "" - def generate_mdata = "" - - if (demuxem_result != "no_result"){ - demuxem_res = demuxem_result.find{it.name.contains(sampleId)} + def demuxem_files = '' + def htodemux_files = '' + def hashsolo_files = '' + def multiseq_files = '' + def hashedDrops_files = '' + def gmmDemux_files = '' + def demuxmix_files = '' + def bff_files = '' + def generate_adata = '' + def generate_mdata = '' + + if (demuxem_result != 'no_result') { + demuxem_res = demuxem_result.find { it.name.contains(sampleId) } demuxem_files = "--demuxem ${demuxem_res}" } - if (hashsolo_result != "no_result"){ - hashsolo_res = hashsolo_result.find{it.name.contains(sampleId)} + if (hashsolo_result != 'no_result') { + hashsolo_res = hashsolo_result.find { it.name.contains(sampleId) } hashsolo_files = "--hashsolo ${hashsolo_res}" } - if (htodemux_result != "no_result"){ - htodemux_res = htodemux_result.find{it.name.contains(sampleId)} + if (htodemux_result != 'no_result') { + htodemux_res = htodemux_result.find { it.name.contains(sampleId) } htodemux_files = "--htodemux ${htodemux_res}" } - if (multiseq_result != "no_result"){ - multiseq_res = multiseq_result.find{it.name.contains(sampleId)} + if (multiseq_result != 'no_result') { + multiseq_res = multiseq_result.find { it.name.contains(sampleId) } multiseq_files = "--multiseq ${multiseq_res}" } - if (hashedDrops_result != "no_result"){ - hashedDrops_res = hashedDrops_result.find{it.name.contains(sampleId)} + if (hashedDrops_result != 'no_result') { + hashedDrops_res = hashedDrops_result.find { it.name.contains(sampleId) } hashedDrops_files = "--hashedDrops ${hashedDrops_res}" } - if (gmmDemux_result != "no_result"){ - gmmDemux_res = gmmDemux_result.find{it.name.contains(sampleId)} + if (gmmDemux_result != 'no_result') { + gmmDemux_res = gmmDemux_result.find { it.name.contains(sampleId) } gmmDemux_files = "--gmm_demux ${gmmDemux_res}" } - if (demuxmix_result != "no_result"){ - demuxmix_res = demuxmix_result.find{it.name.contains(sampleId)} + if (demuxmix_result != 'no_result') { + demuxmix_res = demuxmix_result.find { it.name.contains(sampleId) } demuxmix_files = "--demuxmix ${demuxmix_res}" } - if (bff_result != "no_result"){ - bff_res = bff_result.find{it.name.contains(sampleId)} + if (bff_result != 'no_result') { + bff_res = bff_result.find { it.name.contains(sampleId) } bff_files = "--bff ${bff_res}" } - if (generate_anndata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." + if (generate_anndata == 'True') { + if (rna_matrix.name == 'None') { + error 'Error: RNA count matrix is not given.' } - generate_adata = "--generate_anndata --read_rna_mtx rna_data" + generate_adata = '--generate_anndata --read_rna_mtx rna_data' } - if (generate_mudata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." + if (generate_mudata == 'True') { + if (rna_matrix.name == 'None') { + error 'Error: RNA count matrix is not given.' } - if(hto_matrix.name == "None"){ - error "Error: HTO count matrix is not given." + if (hto_matrix.name == 'None') { + error 'Error: HTO count matrix is not given.' } - generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" + generate_mdata = '--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data' } - + """ summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata """ } - -workflow hash_demultiplexing{ - if (params.htodemux == "True"){ +workflow hash_demultiplexing { + if (params.htodemux == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_htodemux == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_htodemux == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)} - | set {input_list_preprocess_htodemux} - preprocessing_hashing_htodemux(input_list_preprocess_htodemux, params.hto_matrix_htodemux, params.rna_matrix_htodemux) + | map { row-> tuple(row.sampleId, params.hto_matrix_htodemux == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_htodemux == 'raw' ? row.rna_matrix_raw : row.rna_matrix_filtered)} + | set { input_list_preprocess_htodemux } + preprocessing_hashing_htodemux(input_list_preprocess_htodemux, params.hto_matrix_htodemux, params.rna_matrix_htodemux) htodemux_preprocess_out = preprocessing_hashing_htodemux.out htodemux_hashing(htodemux_preprocess_out) htodemux_out = htodemux_hashing.out } - else{ - htodemux_out = channel.value("no_result") - } - - if (params.multiseq == "True"){ - if (params.htodemux == "True" & params.hto_matrix_htodemux == params.hto_matrix_multiseq & - params.rna_matrix_htodemux == params.rna_matrix_multiseq){ - multiseq_preprocess_out = htodemux_preprocess_out + else { + htodemux_out = channel.value('no_result') } - else{ + + if (params.multiseq == 'True') { + if (params.htodemux == 'True' & params.hto_matrix_htodemux == params.hto_matrix_multiseq & + params.rna_matrix_htodemux == params.rna_matrix_multiseq) { + multiseq_preprocess_out = htodemux_preprocess_out + } + else { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_multiseq == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_multiseq == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)} - | set {input_list_preprocess_multiseq} - preprocessing_hashing_multiseq(input_list_preprocess_multiseq, params.hto_matrix_multiseq, params.rna_matrix_multiseq) + | map { row-> tuple(row.sampleId, params.hto_matrix_multiseq == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_multiseq == 'raw' ? row.rna_matrix_raw : row.rna_matrix_filtered)} + | set { input_list_preprocess_multiseq } + preprocessing_hashing_multiseq(input_list_preprocess_multiseq, params.hto_matrix_multiseq, params.rna_matrix_multiseq) multiseq_preprocess_out = preprocessing_hashing_multiseq.out } multiseq_hashing(multiseq_preprocess_out) multiseq_out = multiseq_hashing.out } - else{ - multiseq_out = channel.value("no_result") + else { + multiseq_out = channel.value('no_result') } - - if (params.hashsolo == "True"){ + + if (params.hashsolo == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_hashsolo == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_hashsolo == "False" ? channel.value("None") : - (params.rna_matrix_hashsolo == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered) + | map { row-> tuple(row.sampleId, params.hto_matrix_hashsolo == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_hashsolo == 'False' ? channel.value('None') : + (params.rna_matrix_hashsolo == 'raw' ? row.rna_matrix_raw : row.rna_matrix_filtered) )} | hash_solo_hashing hashsolo_out = hash_solo_hashing.out } - else{ - hashsolo_out = channel.value("no_result") + else { + hashsolo_out = channel.value('no_result') } - - if (params.demuxem == "True"){ + + if (params.demuxem == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_demuxem == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_demuxem == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)} + | map { row-> tuple(row.sampleId, params.hto_matrix_demuxem == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_demuxem == 'raw' ? row.rna_matrix_raw : row.rna_matrix_filtered)} | demuxem_hashing demuxem_out = demuxem_hashing.out } - else{ - demuxem_out = channel.value("no_result") + else { + demuxem_out = channel.value('no_result') } - - if (params.hashedDrops == "True"){ + + if (params.hashedDrops == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_hashedDrops == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered )} + | map { row-> tuple(row.sampleId, params.hto_matrix_hashedDrops == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered) } | hashedDrops_hashing hashedDrops_out = hashedDrops_hashing.out } - else{ - hashedDrops_out = channel.value("no_result") + else { + hashedDrops_out = channel.value('no_result') } - - if (params.demuxmix == "True"){ + if (params.demuxmix == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_demuxmix == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_demuxmix == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered, + | map { row-> tuple(row.sampleId, params.hto_matrix_demuxmix == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_demuxmix == 'raw' ? row.rna_matrix_raw : row.rna_matrix_filtered, params.hto_matrix_demuxmix,params.rna_matrix_demuxmix)} - |set {input_list_demuxmix} - demuxmix_hashing (input_list_demuxmix,params.rna_available) + |set { input_list_demuxmix } + demuxmix_hashing(input_list_demuxmix, params.rna_available) demuxmix_out = demuxmix_hashing.out } - else{ - demuxmix_out = channel.value("no_result") + else { + demuxmix_out = channel.value('no_result') } - if (params.bff == "True"){ + if (params.bff == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_bff == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered )} + | map { row-> tuple(row.sampleId, params.hto_matrix_bff == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered) } | bff_hashing bff_out = bff_hashing.out - print("BFF path to output") + print('BFF path to output') } - else{ - bff_out = channel.value("no_result") + else { + bff_out = channel.value('no_result') } - if (params.gmmDemux == "True"){ + if (params.gmmDemux == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_gmm_demux == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, row.hto_name_gmm )} + | map { row-> tuple(row.sampleId, params.hto_matrix_gmm_demux == 'raw' ? row.hto_matrix_raw : row.hto_matrix_filtered, row.hto_name_gmm) } | gmm_demux_hashing gmmDemux_out = gmm_demux_hashing.out } - else{ - gmmDemux_out = channel.value("no_result") + else { + gmmDemux_out = channel.value('no_result') } Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} - | set {input_list_summary} - summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,demuxmix_out,bff_out,gmmDemux_out, params.generate_anndata, params.generate_mudata) - + | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered)) } + | set { input_list_summary } + summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out, demuxmix_out, bff_out, gmmDemux_out, params.generate_anndata, params.generate_mudata) + emit: summary.out } diff --git a/modules/multi_demultiplexing.nf b/modules/multi_demultiplexing.nf index cfbeafb..24e1cf0 100644 --- a/modules/multi_demultiplexing.nf +++ b/modules/multi_demultiplexing.nf @@ -1,59 +1,59 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 include { hash_demultiplexing } from './hash_demultiplexing' include { gene_demultiplexing } from './gene_demultiplexing' include { donor_match } from './donor_match' -process generate_data{ +process generate_data { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/data_output", mode: 'copy' label 'small_mem' - conda "pandas scanpy mudata" + conda 'pandas scanpy mudata' input: tuple val(sampleId), val(hto_matrix), val(rna_matrix), path(assignment) val generate_anndata val generate_mudata - + output: - path "adata_with_donor_matching.h5ad", optional: true - path "mudata_with_donor_matching.h5mu", optional: true + path 'adata_with_donor_matching.h5ad', optional: true + path 'mudata_with_donor_matching.h5mu', optional: true script: - def generate_adata = "" - def generate_mdata = "" + def generate_adata = '' + def generate_mdata = '' - if (generate_anndata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." + if (generate_anndata == 'True') { + if (rna_matrix == 'None') { + error 'Error: RNA count matrix is not given.' } generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" } - if (generate_mudata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." + if (generate_mudata == 'True') { + if (rna_matrix == 'None') { + error 'Error: RNA count matrix is not given.' } - if(hto_matrix == "None"){ - error "Error: HTO count matrix is not given." + if (hto_matrix == 'None') { + error 'Error: HTO count matrix is not given.' } generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix" } - + """ generate_data.py --assignment $assignment $generate_adata $generate_mdata """ } -process summary_all{ +process summary_all { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode", mode: 'copy' label 'small_mem' - - conda "pandas scanpy mudata" + + conda 'pandas scanpy mudata' input: tuple val(sampleId), path(gene_demulti_result), path(hash_demulti_result) output: - tuple val(sampleId), path("summary") + tuple val(sampleId), path('summary') script: """ @@ -61,64 +61,63 @@ process summary_all{ """ } -workflow run_multi{ - if (params.mode == "genetic"){ +workflow run_multi { + if (params.mode == 'genetic') { gene_demultiplexing() - if (params.match_donor == "True"){ + if (params.match_donor == 'True') { gene_demultiplexing.out.view() Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")} + | map { row-> tuple(row.sampleId, row.nsamples_genetic, row.barcodes, 'None', 'None') } | join(gene_demultiplexing.out) | donor_match } } - else if (params.mode == "hashing"){ + else if (params.mode == 'hashing') { hash_demultiplexing() - if (params.match_donor == "True"){ + if (params.match_donor == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")} + | map { row -> tuple(row.sampleId, row.nsamples_genetic, row.barcodes, 'None', 'None') } | join(hash_demultiplexing.out) | donor_match } } - else if (params.mode == "rescue"){ + else if (params.mode == 'rescue') { hash_demultiplexing() gene_demultiplexing() gene_summary = gene_demultiplexing.out hash_summary = hash_demultiplexing.out input_summary_all = gene_summary.join(hash_summary) summary_all(input_summary_all) - if (params.match_donor == "True"){ + if (params.match_donor == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")} + | map { row -> tuple(row.sampleId, row.nsamples_genetic, row.barcodes, 'None', 'None') } | join(summary_all.out) | donor_match } - if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ - Channel.fromPath(params.multi_input) \ + if (params.generate_anndata == 'True' || params.generate_mudata == 'True') { + Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered)} + | map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered) } | join(donor_match.out) - | set {input_generate_data} - generate_data(input_generate_data, params.generate_anndata, params.generate_mudata) + | set { input_generate_data } + generate_data(input_generate_data, params.generate_anndata, params.generate_mudata) } } - else if (params.mode == "donor_match"){ + else if (params.mode == 'donor_match') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, row.celldata, row.vireo_parent_dir, row.demultiplexing_result)} \ + | map { row -> tuple(row.sampleId, row.nsamples_genetic, row.barcodes, row.celldata, row.vireo_parent_dir, row.demultiplexing_result) } \ | donor_match - if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ + if (params.generate_anndata == 'True' || params.generate_mudata == 'True') { Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ - | map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered)} + | map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered) } | join(donor_match.out) - | set {input_generate_data} + | set { input_generate_data } generate_data(input_generate_data, params.generate_anndata, params.generate_mudata) } } - } diff --git a/modules/single/donor_match.nf b/modules/single/donor_match.nf index 39f0a0b..c1e72c1 100644 --- a/modules/single/donor_match.nf +++ b/modules/single/donor_match.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process matchDonor{ +process matchDonor { publishDir "$projectDir/$params.outdir/$params.mode", mode: 'copy' label 'big_mem' conda "$projectDir/conda/donor_match.yml" - + input: path demultiplexing_result val ndonor @@ -18,27 +18,27 @@ process matchDonor{ val variant_count val variant_pct val vireo_parent_dir - + output: - path "donor_match" + path 'donor_match' script: - def two_method = (method1_name != "None" & method2_name != "None") ? "--method1 $method1_name --method2 $method2_name" : "" + def two_method = (method1_name != 'None' & method2_name != 'None') ? "--method1 $method1_name --method2 $method2_name" : '' - def cell_genotype_path = "" - if (findVariants == "True" | findVariants == "default"){ - cell_genotype_path = cell_genotype != "None" ? "--cell_genotype $cell_genotype" : \ + def cell_genotype_path = '' + if (findVariants == 'True' | findVariants == 'default') { + cell_genotype_path = cell_genotype != 'None' ? "--cell_genotype $cell_genotype" : \ "--cell_genotype $projectDir/$params.outdir/$params.mode/gene_demulti/cellSNP/cellsnp_1/*/cellSNP.cells.vcf.gz" } - def vireo_parent_path = "" - if ( findVariants == 'vireo' | findVariants == 'True' ){ - vireo_parent_path = (params.mode == "donor_match" & vireo_parent_dir != "None") ? "--vireo_parent_dir $vireo_parent_dir" : \ + def vireo_parent_path = '' + if (findVariants == 'vireo' | findVariants == 'True') { + vireo_parent_path = (params.mode == 'donor_match' & vireo_parent_dir != 'None') ? "--vireo_parent_dir $vireo_parent_dir" : \ "--vireo_parent_dir $projectDir/$params.outdir/$params.mode/gene_demulti/vireo/" } - def barcode_whitelist_path = (barcode_whitelist != "None") ? "--barcode $barcode_whitelist" : "" - + def barcode_whitelist_path = (barcode_whitelist != 'None') ? "--barcode $barcode_whitelist" : '' + """ export R_MAX_VSIZE=100Gb outputdir=donor_match @@ -46,7 +46,7 @@ process matchDonor{ donor_match.R --result_csv $demultiplexing_result $barcode_whitelist_path --findVariants $findVariants \ $cell_genotype_path --variant_pct $variant_pct --variant_count $variant_count --ndonor $ndonor \ $two_method --outputdir \$outputdir $vireo_parent_path - + if ([ "$findVariants" != "False" ]); then best_method_vireo="\$(head -n 1 \$outputdir/best_method_vireo.txt)" if ([ "$params.mode" != "donor_match" ]); then @@ -54,7 +54,7 @@ process matchDonor{ else donor_genotype="\$(find $vireo_parent_dir/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" fi - + if ([ "$findVariants" = "True" ] || [ "$findVariants" = "default" ]); then gunzip -c \$donor_genotype > \$outputdir/GT_donors.vireo.vcf if ([ \$(grep "^##config" \$outputdir/GT_donors.vireo.vcf | wc -l) == 0 ]); then @@ -89,16 +89,15 @@ process matchDonor{ rm \$outputdir/compressed_sorted_donor_genotype.vcf.gz.csi fi - + """ } - -workflow donor_match{ +workflow donor_match { take: demultiplexing_result main: - matchDonor(demultiplexing_result, params.nsample, params.barcodes, params.match_donor_method1, params.match_donor_method2, + matchDonor(demultiplexing_result, params.nsamples_genetic, params.barcodes, params.match_donor_method1, params.match_donor_method2, params.findVariants, params.celldata, params.variant_count, params.variant_pct, params.vireo_parent_dir) emit: matchDonor.out diff --git a/modules/single/gene_demulti/bcftools.nf b/modules/single/gene_demulti/bcftools.nf index d53efe7..e499c2e 100644 --- a/modules/single/gene_demulti/bcftools.nf +++ b/modules/single/gene_demulti/bcftools.nf @@ -1,28 +1,27 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 -process bcftools{ +nextflow.enable.dsl = 2 +process bcftools { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/bcftools", mode: 'copy' label 'big_mem' - conda "bioconda::bcftools=1.9" - + conda 'bioconda::bcftools=1.9' + input: val vcf output: path "bcftools_${task.index}" script: - vcf_files = vcf.join(" ") + vcf_files = vcf.join(' ') """ mkdir bcftools_${task.index} bcftools concat -o bcftools_${task.index}/total_chroms.vcf ${vcf_files} cd bcftools_${task.index} bcftools sort total_chroms.vcf -o sorted_total_chroms.vcf - bcftools filter -i '%QUAL > 30' sorted_total_chroms.vcf -o filtered_sorted_total_chroms.vcf + bcftools filter -i '%QUAL > 30' sorted_total_chroms.vcf -o filtered_sorted_total_chroms.vcf """ - } -workflow filter_variant{ +workflow filter_variant { take: vcf main: diff --git a/modules/single/gene_demulti/cellsnp.nf b/modules/single/gene_demulti/cellsnp.nf index 7768740..082afb9 100644 --- a/modules/single/gene_demulti/cellsnp.nf +++ b/modules/single/gene_demulti/cellsnp.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process cellSNP{ +process cellSNP { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/cellSNP", mode: 'copy' label 'big_mem' - conda "bioconda::cellsnp-lite" + conda 'bioconda::cellsnp-lite' input: path samFile_cellSNP @@ -33,7 +33,6 @@ process cellSNP{ val countORPHAN_cellSNP val cellsnp_out - output: path "cellsnp_${task.index}" @@ -44,9 +43,9 @@ process cellSNP{ def barcodeFile = barcodeFile_cellSNP.name != 'None' ? "--barcodeFile ${barcodeFile_cellSNP}" : '' def sampleList = sampleList_cellSNP != 'None' ? "--sampleList ${sampleList_cellSNP}" : '' def sampleIDs = sampleIDs_cellSNP != 'None' ? "--sampleIDs ${sampleIDs_cellSNP}" : '' - def genotype = genotype_cellSNP != 'False' ? "--genotype" : '' - def gzip = gzip_cellSNP != 'False' ? "--gzip" : '' - def printSkipSNPs = printSkipSNPs_cellSNP != 'False' ? "--printSkipSNPs" : '' + def genotype = genotype_cellSNP != 'False' ? '--genotype' : '' + def gzip = gzip_cellSNP != 'False' ? '--gzip' : '' + def printSkipSNPs = printSkipSNPs_cellSNP != 'False' ? '--printSkipSNPs' : '' def nproc = nproc_cellSNP != 'None' ? "--nproc ${nproc_cellSNP}" : '' def refseq = refseq_cellSNP != 'None' ? "--refseq ${refseq_cellSNP}" : '' def chrom = chrom_cellSNP != 'None' ? "--chrom ${chrom_cellSNP}" : '' @@ -54,12 +53,12 @@ process cellSNP{ def UMItag = "--UMItag ${UMItag_cellSNP}" def minCOUNT = "--minCOUNT ${minCOUNT_cellSNP}" def minMAF = "--minMAF ${minMAF_cellSNP}" - def doubletGL = doubletGL_cellSNP != 'False' ? "--doubletGL" : '' + def doubletGL = doubletGL_cellSNP != 'False' ? '--doubletGL' : '' def inclFLAG = inclFLAG_cellSNP != 'None' ? "--inclFLAG ${inclFLAG_cellSNP}" : '' def exclFLAG = exclFLAG_cellSNP != 'None' ? "--exclFLAG ${exclFLAG_cellSNP}" : '' def minLEN = "--minLEN ${minLEN_cellSNP}" def minMAPQ = "--minMAPQ ${minMAPQ_cellSNP}" - def countORPHAN = countORPHAN_cellSNP != 'False' ? "--countORPHAN" : '' + def countORPHAN = countORPHAN_cellSNP != 'False' ? '--countORPHAN' : '' def out = "cellsnp_${task.index}/${cellsnp_out}" """ @@ -75,17 +74,16 @@ process cellSNP{ """ } - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow variant_cellSNP{ +workflow variant_cellSNP { take: samFile indexFile @@ -112,8 +110,8 @@ workflow variant_cellSNP{ minMAPQ = channel.value(params.minMAPQ) countORPHAN = channel.value(params.countORPHAN) cellsnp_out = channel.value(params.cellsnp_out) - cellSNP(samFile, indexFile, regionsVCF, targetsVCF, barcodeFile, sampleList, - sampleIDs, genotype_cellSNP, gzip_cellSNP, printSkipSNPs, nproc_cellSNP, refseq_cellSNP, + cellSNP(samFile, indexFile, regionsVCF, targetsVCF, barcodeFile, sampleList, + sampleIDs, genotype_cellSNP, gzip_cellSNP, printSkipSNPs, nproc_cellSNP, refseq_cellSNP, chrom, cellTAG, UMItag, minCOUNT, minMAF, doubletGL, inclFLAG, exclFLAG, minLEN, minMAPQ, countORPHAN, cellsnp_out) emit: cellSNP.out diff --git a/modules/single/gene_demulti/demuxlet.nf b/modules/single/gene_demulti/demuxlet.nf index 208ee23..2be6ea9 100755 --- a/modules/single/gene_demulti/demuxlet.nf +++ b/modules/single/gene_demulti/demuxlet.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 process demuxlet { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/demuxlet", mode: 'copy' label 'small_mem' - conda "bioconda::popscle" + conda 'bioconda::popscle' input: each sam @@ -38,11 +38,10 @@ process demuxlet { each r2_info each min_mac each min_callrate - + each alpha each doublet_prior each demuxlet_out - output: path "demuxlet_${task.index}" @@ -51,14 +50,14 @@ process demuxlet { def samfile = "--sam $sam" def taggroup = tag_group != 'None' ? "--tag-group ${tag_group}" : '' def tagUMI = tag_UMI != 'None' ? "--tag-UMI ${tag_UMI}" : '' - def vcfref = plp == 'True' ? "--vcf ${vcf_donor}" : "" - def vcfref_name = plp == 'True' ? vcf_donor : "No VCF Ref is used because plp is not performed." + def vcfref = plp == 'True' ? "--vcf ${vcf_donor}" : '' + def vcfref_name = plp == 'True' ? vcf_donor : 'No VCF Ref is used because plp is not performed.' def smlist = sm != 'None' ? "--sm $sm" : '' def sm_list_file = sm_list != 'None' ? "--sm-list ${sm_list}" : '' - def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : "No sm list file is given" + def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : 'No sm list file is given' def samverbose = "--sam-verbose ${sam_verbose}" def vcfverbose = "--vcf-verbose ${vcf_verbose}" - def skipumi = skip_umi != "False" ? "--skip-umi" : "" + def skipumi = skip_umi != 'False' ? '--skip-umi' : '' def capBQ = "--cap-BQ ${cap_BQ}" def minBQ = "--min-BQ ${min_BQ}" def minMQ = "--min-MQ ${min_MQ}" @@ -69,7 +68,7 @@ process demuxlet { def minumi = "--min-umi ${min_umi}" def minuniq = "--min-uniq ${min_uniq}" def minsnp = "--min-snp ${min_snp}" - def plp_name = plp == 'True' ? "plp performed" : "plp not performed" + def plp_name = plp == 'True' ? 'plp performed' : 'plp not performed' def vcfdonor = "--vcf ${vcf_donor}" def fieldinfo = "--field $field" def genoerror_off = "--geno-error-offset ${geno_error_offset}" @@ -77,9 +76,9 @@ process demuxlet { def r2info = "--r2-info ${r2_info}" def minmac = "--min-mac ${min_mac}" def mincallrate = "--min-callrate ${min_callrate}" - def alpha_value = alpha.replaceAll(/,/, " --alpha ") + def alpha_value = alpha.replaceAll(/,/, ' --alpha ') def doubletprior = "--doublet-prior ${doublet_prior}" - + """ mkdir demuxlet_${task.index} touch demuxlet_${task.index}/params.csv @@ -99,22 +98,21 @@ process demuxlet { ${genoerror_off} ${genoerror_cof} $r2info $minmac $mincallrate $smlist ${sm_list_file} --alpha ${alpha_value} \ $doubletprior $samverbose $vcfverbose $capBQ $minBQ $minMQ $minTD $exclflag $grouplist $mintotal $minumi $minsnp \ $minuniq --out demuxlet_${task.index}/${demuxlet_out} - + fi """ - } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow demultiplex_demuxlet{ +workflow demultiplex_demuxlet { take: sam main: @@ -140,18 +138,18 @@ workflow demultiplex_demuxlet{ field = split_input(params.field) geno_error_offset = split_input(params.geno_error_offset) geno_error_coeff = split_input(params.geno_error_coeff) - r2_info= split_input(params.r2_info) + r2_info = split_input(params.r2_info) min_mac = split_input(params.min_mac) min_callrate = split_input(params.min_callrate) alpha = split_input(params.alpha) doublet_prior = split_input(params.doublet_prior) demuxlet_out = params.demuxlet_out - demuxlet(sam, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, - cap_BQ, min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, - min_snp, plp, vcfdonor, field, geno_error_offset, geno_error_coeff, r2_info, min_mac, + demuxlet(sam, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, + cap_BQ, min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, + min_snp, plp, vcfdonor, field, geno_error_offset, geno_error_coeff, r2_info, min_mac, min_callrate, alpha, doublet_prior, demuxlet_out) - + emit: demuxlet.out.collect() } diff --git a/modules/single/gene_demulti/freebayes.nf b/modules/single/gene_demulti/freebayes.nf index 930448c..73ad0b8 100644 --- a/modules/single/gene_demulti/freebayes.nf +++ b/modules/single/gene_demulti/freebayes.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process freebayes{ +process freebayes { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/freebayes", mode: 'copy' label 'big_mem' - conda "bioconda::freebayes=1.2" + conda 'bioconda::freebayes=1.2' input: path bam_freebayes @@ -83,91 +83,90 @@ process freebayes{ val genotype_qualities_freebayes val debug_freebayes val dd_freebayes - output: - path "*.vcf" + path '*.vcf' script: - def stdin = stdin_freebayes != 'False' ? "--stdin" : '' + def stdin = stdin_freebayes != 'False' ? '--stdin' : '' def targets = targets_freebayes != 'None' ? "--targets ${targets_freebayes}" : '' def region = region_freebayes != 'None' ? "--region ${region_freebayes}" : '' def samples = samples_freebayes != 'None' ? "--samples ${samples_freebayes}" : '' - def populations = populations_freebayes != 'None' ? "--populations ${populations}":'' + def populations = populations_freebayes != 'None' ? "--populations ${populations}" : '' def cnv_map = cnv_map_freebayes != 'None' ? "--cnv-map ${cnv_map_freebayes}" : '' - - def gvcf = gvcf_freebayes !='False' ? "--gvcf":'' - def gvcf_chunk = gvcf_chunk_freebayes !='None' ? "--gvcf-chunk ${gvcf_chunk_freebayes}":'' - def gvcf_dont_use_chunk = gvcf_dont_use_chunk_freebayes != 'None' ?"--gvcf-dont-use-chunk ${gvcf_dont_use_chunk_freebayes}":'' + + def gvcf = gvcf_freebayes != 'False' ? '--gvcf' : '' + def gvcf_chunk = gvcf_chunk_freebayes != 'None' ? "--gvcf-chunk ${gvcf_chunk_freebayes}" : '' + def gvcf_dont_use_chunk = gvcf_dont_use_chunk_freebayes != 'None' ? "--gvcf-dont-use-chunk ${gvcf_dont_use_chunk_freebayes}" : '' def variant_input = variant_input_freebayes != 'None' ? "--variant-input ${variant_input_freebayes}" : '' - def only_use_input_alleles = only_use_input_alleles_freebayes != 'False' ? "--only-use-input-alleles" : '' - def haplotype_basis_alleles = haplotype_basis_alleles_freebayes != 'None' ? "--haplotype-basis-alleles ${haplotype_basis_alleles_freebayes}":'' - def report_all_haplotype_alleles = report_all_haplotype_alleles_freebayes !='False' ? "--report-all-haplotype-alleles":'' - def report_monomorphic = report_monomorphic_freebayes !='False' ? "--report-monomorphic":'' + def only_use_input_alleles = only_use_input_alleles_freebayes != 'False' ? '--only-use-input-alleles' : '' + def haplotype_basis_alleles = haplotype_basis_alleles_freebayes != 'None' ? "--haplotype-basis-alleles ${haplotype_basis_alleles_freebayes}" : '' + def report_all_haplotype_alleles = report_all_haplotype_alleles_freebayes != 'False' ? '--report-all-haplotype-alleles' : '' + def report_monomorphic = report_monomorphic_freebayes != 'False' ? '--report-monomorphic' : '' def pvar = "--pvar ${pvar_freebayes}" - def strict_vcf = strict_vcf_freebayes!='False' ? "--strict-vcf":'' + def strict_vcf = strict_vcf_freebayes != 'False' ? '--strict-vcf' : '' def theta = "--theta ${theta_freebayes}" def ploidy = "--ploidy ${ploidy_freebayes}" - def pooled_discrete = pooled_discrete_freebayes!='False' ? "--pooled-discrete":'' - def pooled_continuous = pooled_continuous_freebayes !='False' ? "--pooled-continuous":'' + def pooled_discrete = pooled_discrete_freebayes != 'False' ? '--pooled-discrete' : '' + def pooled_continuous = pooled_continuous_freebayes != 'False' ? '--pooled-continuous' : '' - def use_reference_allele = use_reference_allele_freebayes != 'False' ? "--use-reference-allele" : '' + def use_reference_allele = use_reference_allele_freebayes != 'False' ? '--use-reference-allele' : '' def reference_quality = "--reference-quality ${reference_quality_freebayes}" - def no_snps = no_snps_freebayes !='False' ? "--no-snps" :'' - def no_indels = no_indels_freebayes !='False' ? "--no-indels" :'' - def no_mnps = no_mnps_freebayes !='False' ? "--no-mnps" :'' - def no_complex = no_complex_freebayes !='False' ? "--no-complex" :'' + def no_snps = no_snps_freebayes != 'False' ? '--no-snps' : '' + def no_indels = no_indels_freebayes != 'False' ? '--no-indels' : '' + def no_mnps = no_mnps_freebayes != 'False' ? '--no-mnps' : '' + def no_complex = no_complex_freebayes != 'False' ? '--no-complex' : '' def use_best_n_alleles = "--use-best-n-alleles ${use_best_n_alleles_freebayes}" def haplotype_length = haplotype_length_freebayes = "--haplotype-length ${haplotype_length_freebayes}" def min_repeat_size = "--min-repeat-size ${min_repeat_size_freebayes}" def min_repeat_entropy = "--min-repeat-entropy ${min_repeat_entropy_freebayes}" - def no_partial_observations = no_partial_observations_freebayes !='False' ? "--no-partial-observations":'' + def no_partial_observations = no_partial_observations_freebayes != 'False' ? '--no-partial-observations' : '' - def dont_left_align_indels = dont_left_align_indels_freebayes != 'False' ? "--dont-left-align-indels" : '' + def dont_left_align_indels = dont_left_align_indels_freebayes != 'False' ? '--dont-left-align-indels' : '' - def use_duplicate_reads = use_duplicate_reads_freebayes != 'False' ? "--use-duplicate-reads" : '' + def use_duplicate_reads = use_duplicate_reads_freebayes != 'False' ? '--use-duplicate-reads' : '' def min_mapping_quality = "--min-mapping-quality ${min_mapping_quality_freebayes}" def min_base_quality = "--min-base-quality ${min_base_quality_freebayes}" def min_supporting_allele_qsum = "--min-supporting-allele-qsum ${min_supporting_allele_qsum_freebayes}" def min_supporting_mapping_qsum = "--min-supporting-mapping-qsum ${min_supporting_mapping_qsum_freebayes}" def mismatch_base_quality_threshold = "--mismatch-base-quality-threshold ${mismatch_base_quality_threshold_freebayes}" - def read_mismatch_limit = read_mismatch_limit_freebayes != 'None' ? "-read-mismatch-limit ${read_mismatch_limit_freebayes}":'' + def read_mismatch_limit = read_mismatch_limit_freebayes != 'None' ? "-read-mismatch-limit ${read_mismatch_limit_freebayes}" : '' def read_max_mismatch_fraction = "--read-max-mismatch-fraction ${read_max_mismatch_fraction_freebayes}" - def read_snp_limit = read_snp_limit_freebayes != 'None' ? "--read-snp-limit ${read_snp_limit_freebayes}":'' - def read_indel_limit = read_indel_limit_freebayes != 'None' ? "--read-indel-limit ${read_indel_limit_freebayes}" :'' - def standard_filters = standard_filters_freebayes!='False' ? "--standard-filters":'' + def read_snp_limit = read_snp_limit_freebayes != 'None' ? "--read-snp-limit ${read_snp_limit_freebayes}" : '' + def read_indel_limit = read_indel_limit_freebayes != 'None' ? "--read-indel-limit ${read_indel_limit_freebayes}" : '' + def standard_filters = standard_filters_freebayes != 'False' ? '--standard-filters' : '' def min_alternate_fraction = "--min-alternate-fraction ${min_alternate_fraction_freebayes}" def min_alternate_count = "--min-alternate-count ${min_alternate_count_freebayes}" def min_alternate_qsum = "--min-alternate-qsum ${min_alternate_qsum_freebayes}" def min_alternate_total = "--min-alternate-total ${min_alternate_total_freebayes}" def min_coverage = "--min-coverage ${min_coverage_freebayes}" - def max_coverage = max_coverage_freebayes !='None' ? "--max-coverage ${max_coverage_freebayes}":'' + def max_coverage = max_coverage_freebayes != 'None' ? "--max-coverage ${max_coverage_freebayes}" : '' - def no_population_priors = no_population_priors_freebayes != 'False' ? "--no-population-priors" : '' - def hwe_priors_off = hwe_priors_off_freebayes != 'False' ? "--hwe-priors-off" : '' - def binomial_obs_priors_off = binomial_obs_priors_off_freebayes != 'False' ? "--binomial-obs-priors-off" : '' - def allele_balance_priors_off = allele_balance_priors_off_freebayes != 'False' ? "--allele-balance-priors-off" : '' - def observation_bias = observation_bias_freebayes!= 'None' ? "--observation-bias ${observation_bias_freebayes}":'' - def base_quality_cap = base_quality_cap_freebayes !='None' ? "--base-quality-cap ${base_quality_cap_freebayes}":'' + def no_population_priors = no_population_priors_freebayes != 'False' ? '--no-population-priors' : '' + def hwe_priors_off = hwe_priors_off_freebayes != 'False' ? '--hwe-priors-off' : '' + def binomial_obs_priors_off = binomial_obs_priors_off_freebayes != 'False' ? '--binomial-obs-priors-off' : '' + def allele_balance_priors_off = allele_balance_priors_off_freebayes != 'False' ? '--allele-balance-priors-off' : '' + def observation_bias = observation_bias_freebayes != 'None' ? "--observation-bias ${observation_bias_freebayes}" : '' + def base_quality_cap = base_quality_cap_freebayes != 'None' ? "--base-quality-cap ${base_quality_cap_freebayes}" : '' def prob_contamination = "--prob-contamination ${prob_contamination_freebayes}" - def legacy_gls = legacy_gls_freebayes !='False' ? "--legacy-gls" :'' - def contamination_estimates = contamination_estimates_freebayes != 'None' ? "--contamination-estimates ${contamination_estimates_freebayes}":'' - - def report_genotype_likelihood_max = report_genotype_likelihood_max_freebayes !='False' ? "--report-genotype-likelihood-max":'' + def legacy_gls = legacy_gls_freebayes != 'False' ? '--legacy-gls' : '' + def contamination_estimates = contamination_estimates_freebayes != 'None' ? "--contamination-estimates ${contamination_estimates_freebayes}" : '' + + def report_genotype_likelihood_max = report_genotype_likelihood_max_freebayes != 'False' ? '--report-genotype-likelihood-max' : '' def genotyping_max_iterations = "--genotyping-max-iterations ${genotyping_max_iterations_freebayes}" def genotyping_max_banddepth = "--genotyping-max-banddepth ${genotyping_max_banddepth_freebayes}" def posterior_integration_limits = "--posterior-integration-limits ${posterior_integration_limits_freebayes}" - def exclude_unobserved_genotypes = exclude_unobserved_genotypes_freebayes != 'False' ? "--exclude-unobserved-genotypes" : '' - def genotype_variant_threshold = genotype_variant_threshold_freebayes != 'None' ? "--genotype-variant-threshold ${genotype_variant_threshold_freebayes}":'' - def use_mapping_quality = use_mapping_quality_freebayes != 'False' ? "--use-mapping-quality" : '' - def harmonic_indel_quality = harmonic_indel_quality_freebayes !='False' ? "--harmonic-indel-quality" :'' + def exclude_unobserved_genotypes = exclude_unobserved_genotypes_freebayes != 'False' ? '--exclude-unobserved-genotypes' : '' + def genotype_variant_threshold = genotype_variant_threshold_freebayes != 'None' ? "--genotype-variant-threshold ${genotype_variant_threshold_freebayes}" : '' + def use_mapping_quality = use_mapping_quality_freebayes != 'False' ? '--use-mapping-quality' : '' + def harmonic_indel_quality = harmonic_indel_quality_freebayes != 'False' ? '--harmonic-indel-quality' : '' def read_dependence_factor = "--read-dependence-factor ${read_dependence_factor_freebayes}" - def genotype_qualities = genotype_qualities_freebayes !='False' ? "--genotype-qualities" :'' + def genotype_qualities = genotype_qualities_freebayes != 'False' ? '--genotype-qualities' : '' - def debug = debug_freebayes != 'False' ? "--debug" : '' - def dd = dd_freebayes != 'False' ? "-dd" : '' + def debug = debug_freebayes != 'False' ? '--debug' : '' + def dd = dd_freebayes != 'False' ? '-dd' : '' """ freebayes ${bam_freebayes} $stdin -f ${ref_freebayes} $targets $region $samples $populations ${cnv_map} \ @@ -180,12 +179,11 @@ process freebayes{ ${observation_bias} ${base_quality_cap} ${prob_contamination} ${legacy_gls} ${contamination_estimates} \ ${report_genotype_likelihood_max} ${genotyping_max_iterations} ${genotyping_max_banddepth} ${posterior_integration_limits} ${exclude_unobserved_genotypes} ${genotype_variant_threshold} \ ${use_mapping_quality} ${harmonic_indel_quality} ${read_dependence_factor} ${genotype_qualities} $debug $dd - + """ } - -workflow variant_freebayes{ +workflow variant_freebayes { take: bam_freebayes bai_freebayes @@ -209,7 +207,7 @@ workflow variant_freebayes{ report_monomorphic_freebayes = channel.value(params.report_monomorphic) pvar_freebayes = channel.value(params.pvar) strict_vcf_freebayes = channel.value(params.strict_vcf) - + theta_freebayes = channel.value(params.theta) ploidy_freebayes = channel.value(params.ploidy) pooled_discrete_freebayes = channel.value(params.pooled_discrete) @@ -225,7 +223,7 @@ workflow variant_freebayes{ min_repeat_size_freebayes = channel.value(params.min_repeat_size) min_repeat_entropy_freebayes = channel.value(params.min_repeat_entropy) no_partial_observations_freebayes = channel.value(params.no_partial_observations) - + dont_left_align_indels_freebayes = channel.value(params.dont_left_align_indels) use_duplicate_reads_freebayes = channel.value(params.use_duplicate_reads) min_mapping_quality_freebayes = channel.value(params.min_mapping_quality) @@ -245,7 +243,7 @@ workflow variant_freebayes{ min_coverage_freebayes = channel.value(params.min_coverage) max_coverage_freebayes = channel.value(params.max_coverage) no_population_priors_freebayes = channel.value(params.no_population_priors) - + hwe_priors_off_freebayes = channel.value(params.hwe_priors_off) binomial_obs_priors_off_freebayes = channel.value(params.binomial_obs_priors_off) allele_balance_priors_off_freebayes = channel.value(params.allele_balance_priors_off) @@ -254,7 +252,7 @@ workflow variant_freebayes{ prob_contamination_freebayes = channel.value(params.prob_contamination) legacy_gls_freebayes = channel.value(params.legacy_gls) contamination_estimates_freebayes = channel.value(params.contamination_estimates) - + report_genotype_likelihood_max_freebayes = channel.value(params.report_genotype_likelihood_max) genotyping_max_iterations_freebayes = channel.value(params.genotyping_max_iterations) genotyping_max_banddepth_freebayes = channel.value(params.genotyping_max_banddepth) @@ -267,7 +265,6 @@ workflow variant_freebayes{ genotype_qualities_freebayes = channel.value(params.genotype_qualities) debug_freebayes = channel.value(params.debug) dd_freebayes = channel.value(params.dd) - freebayes(bam_freebayes, bai_freebayes, stdin_freebayes, fasta_reference, fasta_reference_index, targets_freebayes, region_freebayes, samples_freebayes, populations_freebayes, cnv_map_freebayes, \ vcf_freebayes, gvcf_freebayes, gvcf_chunk_freebayes, gvcf_dont_use_chunk_freebayes, variant_input_freebayes, only_use_input_alleles_freebayes, haplotype_basis_alleles_freebayes, report_all_haplotype_alleles_freebayes, report_monomorphic_freebayes, pvar_freebayes, strict_vcf_freebayes, \ @@ -282,5 +279,4 @@ workflow variant_freebayes{ emit: freebayes.out.collect() - } diff --git a/modules/single/gene_demulti/freemuxlet.nf b/modules/single/gene_demulti/freemuxlet.nf index e006d2a..567b7dc 100755 --- a/modules/single/gene_demulti/freemuxlet.nf +++ b/modules/single/gene_demulti/freemuxlet.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 process freemuxlet { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/freemuxlet", mode: 'copy' label 'small_mem' - conda "bioconda::popscle" + conda 'bioconda::popscle' input: each sam @@ -48,10 +48,10 @@ process freemuxlet { def vcffile = "--vcf $vcf" def smlist = sm != 'None' ? "--sm $sm" : '' def sm_list_file = sm_list != 'None' ? "--sm-list ${sm_list}" : '' - def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : "No sm list file is given" + def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : 'No sm list file is given' def samverbose = "--sam-verbose ${sam_verbose}" def vcfverbose = "--vcf-verbose ${vcf_verbose}" - def skipumi = skip_umi != "False" ? "--skip-umi" : "" + def skipumi = skip_umi != 'False' ? '--skip-umi' : '' def capBQ = "--cap-BQ ${cap_BQ}" def minBQ = "--min-BQ ${min_BQ}" def minMQ = "--min-MQ ${min_MQ}" @@ -64,43 +64,40 @@ process freemuxlet { def minsnp = "--min-snp ${min_snp}" def initcluster = init_cluster != 'None' ? "--init-cluster ${init_cluster}" : '' def n_sample = "--nsample $nsample" - def auxfiles = aux_files != 'False' ? "--aux-files" : '' + def auxfiles = aux_files != 'False' ? '--aux-files' : '' def verbose_info = "--verbose $verbose" def doubletprior = "--doublet-prior ${doublet_prior}" def bfthres = "--bf-thres ${bf_thres}" def frac_init_cluster = "--frac-init-clust ${frac_init_clust}" def iterinit = "--iter-init ${iter_init}" - def keepinit_missing = keep_init_missing != "False" ? "--keep-init-missing" : '' - + def keepinit_missing = keep_init_missing != 'False' ? '--keep-init-missing' : '' + """ mkdir freemuxlet_${task.index} mkdir freemuxlet_${task.index}/plp touch freemuxlet_${task.index}/params.csv echo -e "Argument,Value \n samfile,${sam} \n tag_group,${tag_group} \n tag_UMI,${tag_UMI} \n vcf_file,${vcf} \n sm,${sm} \n sm_list_file,${sm_list_file_name} \n sam_verbose,${sam_verbose} \n vcf_verbose,${vcf_verbose} \n skip_umi,${skip_umi} \n cap_BQ,${cap_BQ} \n min_BQ,${min_BQ} \n min_MQ,${min_MQ} \n min_TD,${min_TD} \n excl_flag,${excl_flag} \n grouplist,${group_list} \n min_total,${min_total} \n min_uniq,${min_uniq} \n min_umi,${min_umi} \n min_snp,${min_snp} \n init_cluster,${init_cluster} \n nsample,${nsample} \n aux_files,${aux_files} \n verbose,${verbose} \n doublet_prior,${doublet_prior} \n bf_thres,${bf_thres} \n frac_init_clust,${frac_init_clust} \n iter_init,${iter_init} \n keep_init_missing,${keep_init_missing}" >> freemuxlet_${task.index}/params.csv - + popscle dsc-pileup $samfile ${taggroup} ${tagUMI} $vcffile ${smlist} ${sm_list_file} ${samverbose} \ ${vcfverbose} ${skipumi} ${capBQ} ${minBQ} ${minMQ} ${minTD} ${exclflag} ${grouplist} ${mintotal} ${minuniq} \ ${minsnp} --out freemuxlet_${task.index}/plp/${freemuxlet_out} popscle freemuxlet --plp freemuxlet_${task.index}/plp/${freemuxlet_out} --out freemuxlet_${task.index}/${freemuxlet_out} \ ${initcluster} ${n_sample} ${auxfiles} ${verbose_info} ${doubletprior} ${bfthres} ${frac_init_cluster} ${iterinit} \ ${keepinit_missing} ${capBQ} ${minBQ} ${grouplist} ${mintotal} ${minumi} ${minsnp} - + """ - } - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } - -workflow demultiplex_freemuxlet{ +workflow demultiplex_freemuxlet { take: sam main: @@ -113,7 +110,7 @@ workflow demultiplex_freemuxlet{ sam_verbose = split_input(params.sam_verbose) vcf_verbose = split_input(params.vcf_verbose) skip_umi = split_input(params.skip_umi) - + cap_BQ = split_input(params.cap_BQ) min_BQ = split_input(params.min_BQ) min_MQ = split_input(params.min_MQ) @@ -124,7 +121,7 @@ workflow demultiplex_freemuxlet{ min_umi = split_input(params.min_umi) min_snp = split_input(params.min_snp) init_cluster = split_input(params.init_cluster) - nsample = split_input(params.nsample) + nsample = split_input(params.nsamples_genetic) aux_files = split_input(params.aux_files) verbose = split_input(params.verbose) doublet_prior = split_input(params.doublet_prior) @@ -135,9 +132,9 @@ workflow demultiplex_freemuxlet{ freemuxlet_out = params.freemuxlet_out freemuxlet(sam, vcf, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, - min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, min_snp, init_cluster, nsample, + min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, min_snp, init_cluster, nsample, aux_files, verbose, doublet_prior, bf_thres, frac_init_clust, iter_init, keep_init_missing, freemuxlet_out) - + emit: freemuxlet.out.collect() } diff --git a/modules/single/gene_demulti/scsplit.nf b/modules/single/gene_demulti/scsplit.nf index e1e5df3..03f4342 100644 --- a/modules/single/gene_demulti/scsplit.nf +++ b/modules/single/gene_demulti/scsplit.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process scSplit{ +process scSplit { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/scSplit", mode: 'copy' label 'big_mem' conda "$projectDir/conda/scsplit.yml" - + input: each vcf each bam @@ -23,39 +23,38 @@ process scSplit{ each vcf_known each sample_geno each scsplit_out - - + output: path "scsplit_${task.index}" - + script: - + def common_data = com != 'None' ? "--com $com" : '' def common_data_name = com != 'None' ? com : 'Common variants are not given.' def vcf_known_data = vcf_known != 'None' ? "--vcf ${vcf_known}" : '' def vcf_known_data_name = vcf_known != 'None' ? vcf_known : 'Known variants are not given.' - def sub_yesno = num == 0 ? "$sub": 'no_sub' - + def sub_yesno = num == 0 ? "$sub" : 'no_sub' + def vcf_data = "-v $vcf" def bam_data = "-i $bam" def barcode_data = "-b $barcode" def tag_data = "--tag $tag" def num_data = "-n $num" - def sub_data = num == 0 ? "--sub $sub": '' + def sub_data = num == 0 ? "--sub $sub" : '' def ems_data = "--ems $ems" def dbl_data = dbl != 'None' ? "--dbl $dbl" : '' def out = "scsplit_${task.index}/${scsplit_out}" - + """ git clone https://github.com/jon-xu/scSplit mkdir scsplit_${task.index} mkdir $out touch scsplit_${task.index}/params.csv echo -e "Argument,Value \n vcf,$vcf \n bam,$bam \n barcode,$barcode \n common_data,${common_data_name} \n num,${num} \n sub,${sub_yesno} \n ems,${ems} \n dbl,${dbl} \n vcf_known_data,${vcf_known_data_name}" >> scsplit_${task.index}/params.csv - + python scSplit/scSplit count ${vcf_data} ${bam_data} ${barcode_data} ${common_data} -r $ref -a $alt --out $out python scSplit/scSplit run -r $out/$ref -a $out/$alt ${num_data} ${sub_data} ${ems_data} ${dbl_data} ${vcf_known_data} --out $out - + if [[ "$sample_geno" != "False" ]] then python scSplit/scSplit genotype -r $out/$ref -a $out/$alt -p $out/scSplit_P_s_c.csv --out $out @@ -63,16 +62,16 @@ process scSplit{ """ } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow demultiplex_scSplit{ +workflow demultiplex_scSplit { take: bam_scsplit vcf_scsplit @@ -83,17 +82,15 @@ workflow demultiplex_scSplit{ com_scsplit = split_input(params.common_variants_scSplit) ref_scsplit = split_input(params.refscSplit) alt_scsplit = split_input(params.altscSplit) - num_scsplit = split_input(params.nsample) + num_scsplit = split_input(params.nsamples_genetic) sub_scsplit = split_input(params.subscSplit) ems_scsplit = split_input(params.emsscSplit) dbl_scsplit = split_input(params.dblscSplit) vcf_known_scsplit = split_input(params.vcf_donor) sample_geno = split_input(params.sample_geno) scsplit_out = params.scsplit_out - scSplit(vcf_scsplit, bam_scsplit, bai_scsplit, bar_scsplit, tag_scsplit, com_scsplit, ref_scsplit, + scSplit(vcf_scsplit, bam_scsplit, bai_scsplit, bar_scsplit, tag_scsplit, com_scsplit, ref_scsplit, alt_scsplit, num_scsplit, sub_scsplit, ems_scsplit, dbl_scsplit, vcf_known_scsplit,sample_geno, scsplit_out) emit: scSplit.out.collect() - } - diff --git a/modules/single/gene_demulti/souporcell.nf b/modules/single/gene_demulti/souporcell.nf index 72cd131..22cffcc 100755 --- a/modules/single/gene_demulti/souporcell.nf +++ b/modules/single/gene_demulti/souporcell.nf @@ -81,7 +81,7 @@ workflow demultiplex_souporcell{ barcodes = split_input(params.barcodes) fasta = split_input(params.fasta) threads = split_input(params.threads) - clusters = split_input(params.nsample) + clusters = split_input(params.nsamples_genetic) ploidy = split_input(params.ploidy) min_alt = split_input(params.min_alt) diff --git a/modules/single/gene_demulti/vireo.nf b/modules/single/gene_demulti/vireo.nf index 55c8cbd..1db4387 100755 --- a/modules/single/gene_demulti/vireo.nf +++ b/modules/single/gene_demulti/vireo.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process vireo{ +process vireo { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/vireo", mode: 'copy' label 'big_mem' - conda "aksarkar::vireosnp" + conda 'aksarkar::vireosnp' input: each celldata @@ -26,29 +26,27 @@ process vireo{ each findVariant each vireo_out - output: path "vireo_${task.index}" - script: def cell_data = "-c $celldata" - def n_donor = ndonor != 'None'? "-N $ndonor" : '' - def n_donor_yesno = ndonor != 'None'? "$ndonor" : "Number of donors are not given" + def n_donor = ndonor != 'None' ? "-N $ndonor" : '' + def n_donor_yesno = ndonor != 'None' ? "$ndonor" : 'Number of donors are not given' def donor = donorfile != 'None' ? "-d $donorfile" : '' def donor_data_name = donorfile != 'None' ? donorfile : 'Donor file is not given' def geno_tag = donorfile != 'None' ? "--genoTag $genoTag" : '' - def no_doublet = noDoublet != 'False' ? "--noDoublet" : '' + def no_doublet = noDoublet != 'False' ? '--noDoublet' : '' def n_init = "--nInit $nInit" def extra_donor = "--extraDonor $extraDonor" def extradonor_mode = extraDonorMode != 'distance' ? "--extraDonorMode $extraDonorMode" : '' - def learnGT = (forceLearnGT != 'False' && donorfile != 'None')? "--forceLearnGT" : '' - def learnGT_yesno = (forceLearnGT != 'False' && donorfile != 'None')? "$forceLearnGT" : 'False' - def ase_mode = ASEmode != 'False' ? "--ASEmode" : '' - def no_plot = noPlot != 'False' ? "--noPlot" : '' - def random_seed = randSeed != 'None'? "--randSeed $randSeed" : '' - def cell_range = cellRange != 'all'? "--cellRange $cellRange" : '' - def call_ambient_rna = callAmbientRNAs != 'False' ? "--callAmbientRNAs" : '' + def learnGT = (forceLearnGT != 'False' && donorfile != 'None') ? '--forceLearnGT' : '' + def learnGT_yesno = (forceLearnGT != 'False' && donorfile != 'None') ? "$forceLearnGT" : 'False' + def ase_mode = ASEmode != 'False' ? '--ASEmode' : '' + def no_plot = noPlot != 'False' ? '--noPlot' : '' + def random_seed = randSeed != 'None' ? "--randSeed $randSeed" : '' + def cell_range = cellRange != 'all' ? "--cellRange $cellRange" : '' + def call_ambient_rna = callAmbientRNAs != 'False' ? '--callAmbientRNAs' : '' def n_proc = "--nproc $nproc" """ @@ -64,27 +62,25 @@ process vireo{ GTbarcode -i vireo_${task.index}/${vireo_out}/GT_donors.vireo.vcf.gz -o vireo_${task.index}/${vireo_out}/filtered_variants.tsv ${randSeed} fi fi - - """ + """ } - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow demultiplex_vireo{ +workflow demultiplex_vireo { take: celldata - + main: - ndonor = split_input(params.nsample) + ndonor = split_input(params.nsamples_genetic) donorfile = split_input(params.vcf_donor) genoTag = split_input(params.genoTag) @@ -102,10 +98,9 @@ workflow demultiplex_vireo{ nproc = split_input(params.nproc) findVariant = split_input(params.findVariants) vireo_out = params.vireo_out - - vireo(celldata, ndonor, donorfile, genoTag, noDoublet, nInit, extraDonor, extraDonorMode, forceLearnGT, + + vireo(celldata, ndonor, donorfile, genoTag, noDoublet, nInit, extraDonor, extraDonorMode, forceLearnGT, ASEmode, noPlot, randSeed, cellRange, callAmbientRNAs, nproc, findVariant, vireo_out) - emit: vireo.out.collect() diff --git a/modules/single/gene_demultiplexing.nf b/modules/single/gene_demultiplexing.nf index 20d5bd4..0e6f317 100644 --- a/modules/single/gene_demultiplexing.nf +++ b/modules/single/gene_demultiplexing.nf @@ -1,5 +1,5 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 include { data_preprocess } from './gene_demulti/samtools' include { filter_variant } from './gene_demulti/bcftools' @@ -11,20 +11,20 @@ include { demultiplex_scSplit } from './gene_demulti/scsplit' include { demultiplex_souporcell } from './gene_demulti/souporcell' include { demultiplex_vireo } from './gene_demulti/vireo' -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -process summary{ +process summary { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti", mode: 'copy' label 'small_mem' - - conda "pandas scanpy mudata" + + conda 'pandas scanpy mudata' input: val demuxlet_result @@ -40,44 +40,44 @@ process summary{ path genetic_summary script: - def demuxlet_files = "" - def freemuxlet_files = "" - def vireo_files = "" - def souporcell_files = "" - def scsplit_files = "" - def generate_adata = "" - def generate_mdata = "" - - if (demuxlet_result != "no_result"){ - demuxlet_files = "--demuxlet ${demuxlet_result.join(":")}" - } - if (freemuxlet_result != "no_result"){ - freemuxlet_files = "--freemuxlet ${freemuxlet_result.join(":")}" - } - if (vireo_result != "no_result"){ - vireo_files = "--vireo ${vireo_result.join(":")}" - } - if (souporcell_result != "no_result"){ - souporcell_files = "--souporcell ${souporcell_result.join(":")}" - } - if (scsplit_result != "no_result"){ - scsplit_files = "--scsplit ${scsplit_result.join(":")}" - } - if (scsplit_result != "no_result"){ - scsplit_files = "--scsplit ${scsplit_result.join(":")}" - } - if (generate_anndata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." + def demuxlet_files = '' + def freemuxlet_files = '' + def vireo_files = '' + def souporcell_files = '' + def scsplit_files = '' + def generate_adata = '' + def generate_mdata = '' + + if (demuxlet_result != 'no_result') { + demuxlet_files = "--demuxlet ${demuxlet_result.join(':')}" + } + if (freemuxlet_result != 'no_result') { + freemuxlet_files = "--freemuxlet ${freemuxlet_result.join(':')}" + } + if (vireo_result != 'no_result') { + vireo_files = "--vireo ${vireo_result.join(':')}" + } + if (souporcell_result != 'no_result') { + souporcell_files = "--souporcell ${souporcell_result.join(':')}" + } + if (scsplit_result != 'no_result') { + scsplit_files = "--scsplit ${scsplit_result.join(':')}" + } + if (scsplit_result != 'no_result') { + scsplit_files = "--scsplit ${scsplit_result.join(':')}" + } + if (generate_anndata == 'True') { + if (rna_matrix == 'None') { + error 'Error: RNA count matrix is not given.' } generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" } - if (generate_mudata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." + if (generate_mudata == 'True') { + if (rna_matrix == 'None') { + error 'Error: RNA count matrix is not given.' } - if(hto_matrix == "None"){ - error "Error: HTO count matrix is not given." + if (hto_matrix == 'None') { + error 'Error: HTO count matrix is not given.' } generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix" } @@ -86,105 +86,100 @@ process summary{ """ } - workflow gene_demultiplexing { main: input_bam = Channel.fromPath(params.bam) input_bai = Channel.fromPath(params.bai) - if ((params.demuxlet == "True" & params.demuxlet_preprocess != 'False')| \ - (params.freemuxlet == "True" & params.freemuxlet_preprocess != 'False')| \ - (params.scSplit == "True" & params.scSplit_preprocess != 'False') | \ - (params.vireo == "True" & params.vireo_preprocess != 'False') | \ - (params.souporcell == "True" & params.souporcell_preprocess != 'False')){ + if ((params.demuxlet == 'True' & params.demuxlet_preprocess != 'False') | \ + (params.freemuxlet == 'True' & params.freemuxlet_preprocess != 'False')| \ + (params.scSplit == 'True' & params.scSplit_preprocess != 'False') | \ + (params.vireo == 'True' & params.vireo_preprocess != 'False') | \ + (params.souporcell == 'True' & params.souporcell_preprocess != 'False')) { data_preprocess(input_bam) - qc_bam = data_preprocess.out.map{ return it + "/sorted.bam"} - qc_bam_bai = data_preprocess.out.map{ return it + "/sorted.bam.bai"} + qc_bam = data_preprocess.out.map { return it + '/sorted.bam' } + qc_bam_bai = data_preprocess.out.map { return it + '/sorted.bam.bai' } } - if (params.vireo == "True" & params.vireo_variant == 'True'){ - if(params.vireo_preprocess != 'False'){ + if (params.vireo == 'True' & params.vireo_variant == 'True') { + if (params.vireo_preprocess != 'False') { variant_cellSNP(qc_bam, qc_bam_bai) } - else{ + else { variant_cellSNP(input_bam, input_bai) } - cellsnp_vcf = variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"} + cellsnp_vcf = variant_cellSNP.out.map { return it + '/*/cellSNP.cells.vcf' } } - - if (params.scSplit == "True" & params.scSplit_variant == 'True' ){ - freebayes_region = Channel.from(1..22, "X","Y").flatten() - if (params.region != "None"){ + + if (params.scSplit == 'True' & params.scSplit_variant == 'True') { + freebayes_region = Channel.from(1..22, 'X', 'Y').flatten() + if (params.region != 'None') { freebayes_region = split_input(params.region) } - if(params.scSplit_preprocess != 'False'){ + if (params.scSplit_preprocess != 'False') { variant_freebayes(qc_bam, qc_bam_bai, freebayes_region) } - else{ + else { variant_freebayes(input_bam, input_bai, freebayes_region) } filter_variant(variant_freebayes.out) - freebayes_vcf = filter_variant.out.map{ return it + "/filtered_sorted_total_chroms.vcf"} - + freebayes_vcf = filter_variant.out.map { return it + '/filtered_sorted_total_chroms.vcf' } } - - if (params.demuxlet == "True"){ - bam = params.demuxlet_preprocess == 'True'? qc_bam: input_bam //qc_bam.mix(input_bam)) + + if (params.demuxlet == 'True') { + bam = params.demuxlet_preprocess == 'True' ? qc_bam : input_bam //qc_bam.mix(input_bam)) demultiplex_demuxlet(bam) demuxlet_out = demultiplex_demuxlet.out } - else{ - demuxlet_out = channel.value("no_result") + else { + demuxlet_out = channel.value('no_result') } - - - if (params.freemuxlet == "True"){ - bam = params.freemuxlet_preprocess == 'True'? qc_bam: input_bam // qc_bam.mix(input_bam)) + + if (params.freemuxlet == 'True') { + bam = params.freemuxlet_preprocess == 'True' ? qc_bam : input_bam // qc_bam.mix(input_bam)) demultiplex_freemuxlet(bam) freemuxlet_out = demultiplex_freemuxlet.out } - else{ - freemuxlet_out = channel.value("no_result") + else { + freemuxlet_out = channel.value('no_result') } - - if (params.vireo == "True"){ - vcf = params.vireo_variant != 'True'? Channel.fromPath(params.celldata): - variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"} - //variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"}.mix(Channel.fromPath(params.celldata))) + if (params.vireo == 'True') { + vcf = params.vireo_variant != 'True' ? Channel.fromPath(params.celldata) : + variant_cellSNP.out.map { return it + '/*/cellSNP.cells.vcf' } + //variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"}.mix(Channel.fromPath(params.celldata))) demultiplex_vireo(vcf) vireo_out = demultiplex_vireo.out } - else{ - vireo_out = channel.value("no_result") + else { + vireo_out = channel.value('no_result') } - if (params.scSplit == "True"){ - bam = params.scSplit_preprocess == 'True'? qc_bam: input_bam // qc_bam.mix(input_bam)) - bai = params.scSplit_preprocess == 'True'? qc_bam_bai: input_bai // qc_bam_bai.mix(input_bai)) - vcf = params.scSplit_variant != 'True'? Channel.fromPath(params.vcf_mixed): - filter_variant.out.map{ return it + "/filtered_sorted_total_chroms.vcf"} - //freebayes_vcf.mix(Channel.fromPath(params.vcf_mixed))) + if (params.scSplit == 'True') { + bam = params.scSplit_preprocess == 'True' ? qc_bam : input_bam // qc_bam.mix(input_bam)) + bai = params.scSplit_preprocess == 'True' ? qc_bam_bai : input_bai // qc_bam_bai.mix(input_bai)) + vcf = params.scSplit_variant != 'True' ? Channel.fromPath(params.vcf_mixed) : + filter_variant.out.map { return it + '/filtered_sorted_total_chroms.vcf' } + //freebayes_vcf.mix(Channel.fromPath(params.vcf_mixed))) demultiplex_scSplit(bam, vcf, bai) scSplit_out = demultiplex_scSplit.out } - else{ - scSplit_out = channel.value("no_result") + else { + scSplit_out = channel.value('no_result') } - - if (params.souporcell == "True"){ - bam = params.souporcell_preprocess == 'True'? qc_bam: input_bam // qc_bam.mix(input_bam)) + + if (params.souporcell == 'True') { + bam = params.souporcell_preprocess == 'True' ? qc_bam : input_bam // qc_bam.mix(input_bam)) demultiplex_souporcell(bam) souporcell_out = demultiplex_souporcell.out } - else{ - souporcell_out = channel.value("no_result") + else { + souporcell_out = channel.value('no_result') } - summary(demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, + summary(demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, params.generate_anndata, params.generate_mudata, params.rna_matrix_filtered, params.hto_matrix_filtered) emit: summary.out } - diff --git a/modules/single/hash_demulti/bff.nf b/modules/single/hash_demulti/bff.nf index f60e098..0e4082f 100644 --- a/modules/single/hash_demulti/bff.nf +++ b/modules/single/hash_demulti/bff.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process bff{ +process bff { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/bff", mode:'copy' label 'small_mem' conda "$projectDir/conda/bff.yml" - + input: path hto_matrix, stageAs: 'hto_data' @@ -23,11 +23,10 @@ process bff{ each assignmentOutBff each preprocess_bff each barcodeWhitelist - + output: path "bff_${task.index}" - - + script: """ @@ -38,20 +37,19 @@ process bff{ --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${task.index} --assignmentOutBff $assignmentOutBff \ --preprocess $preprocess_bff --barcodeWhitelist $barcodeWhitelist """ - } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow bff_hashing{ - take: +workflow bff_hashing { + take: hto_matrix main: methods = split_input(params.methods) @@ -68,14 +66,12 @@ workflow bff_hashing{ preprocess_bff = split_input(params.preprocess_bff) barcodeWhitelist = split_input(params.barcodeWhitelist) - bff(hto_matrix, methods, methodsForConsensus,cellbarcodeWhitelist, metricsFile,doTSNE,doHeatmap,perCellSaturation,majorityConsensusThreshold,chemistry,callerDisagreementThreshold,assignmentOutBff,preprocess_bff,barcodeWhitelist) - + bff(hto_matrix, methods, methodsForConsensus, cellbarcodeWhitelist, metricsFile, doTSNE, doHeatmap, perCellSaturation, majorityConsensusThreshold, chemistry, callerDisagreementThreshold, assignmentOutBff, preprocess_bff, barcodeWhitelist) + emit: bff.out.collect() } - -workflow{ +workflow { bff_hashing() - } diff --git a/modules/single/hash_demulti/demuxem.nf b/modules/single/hash_demulti/demuxem.nf index 4d94480..ba81034 100644 --- a/modules/single/hash_demulti/demuxem.nf +++ b/modules/single/hash_demulti/demuxem.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process demuxem{ +process demuxem { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/demuxem", mode:'copy' label 'small_mem' - - conda "bioconda::pegasuspy demuxEM scanpy" + + conda 'bioconda::pegasuspy demuxEM scanpy' input: path raw_rna_matrix_dir, stageAs: "rna_data_${params.rna_matrix_demuxem}" @@ -23,9 +23,9 @@ process demuxem{ each filter_demuxem output: path "demuxem_${task.index}" - + script: - def generateGenderPlot = generate_gender_plot != "None" ? " --generateGenderPlot ${generate_gender_plot}" : '' + def generateGenderPlot = generate_gender_plot != 'None' ? " --generateGenderPlot ${generate_gender_plot}" : '' """ mkdir demuxem_${task.index} demuxem.py --rna_matrix_dir rna_data_${params.rna_matrix_demuxem} --hto_matrix_dir hto_data_${params.hto_matrix_demuxem} \ @@ -33,19 +33,18 @@ process demuxem{ --alpha $alpha --alpha_noise $alpha_noise --n_threads $threads $generateGenderPlot --objectOutDemuxem $objectOutDemuxem \ --outputdir demuxem_${task.index} --filter_demuxem $filter_demuxem """ - } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow demuxem_hashing{ +workflow demuxem_hashing { take: hto_matrix rna_matrix @@ -62,9 +61,9 @@ workflow demuxem_hashing{ objectOutDemuxem = params.objectOutDemuxem filter_demuxem = split_input(params.filter_demuxem) - demuxem(rna_matrix, hto_matrix, threads, alpha, alpha_noise, tol, min_num_genes, min_num_umis, + demuxem(rna_matrix, hto_matrix, threads, alpha, alpha_noise, tol, min_num_genes, min_num_umis, min_signal, random_state, generate_gender_plot, objectOutDemuxem, filter_demuxem) - + emit: demuxem.out.collect() -} \ No newline at end of file +} diff --git a/modules/single/hash_demulti/demuxmix.nf b/modules/single/hash_demulti/demuxmix.nf index b687e97..62c01f3 100644 --- a/modules/single/hash_demulti/demuxmix.nf +++ b/modules/single/hash_demulti/demuxmix.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process demuxmix{ +process demuxmix { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/demuxmix", mode:'copy' label 'small_mem' conda "$projectDir/conda/demuxmix.yml" - + input: path hto_matrix, stageAs: 'hto_data' path umi_matrix, stageAs: 'rna_data' @@ -25,31 +25,30 @@ process demuxmix{ each correctTails each assignmentOutDemuxmix each gene_col - + output: path "demuxmix_${task.index}" - + script: - + """ mkdir demuxmix_${task.index} demuxmix.R --fileUmi rna_data --fileHto hto_data --rna_available $rna_available --assay $assay --ndelim $ndelim --model $model --alpha_demuxmix $alpha_demuxmix \ --beta_demuxmix $beta_demuxmix --tol_demuxmix $tol_demuxmix --maxIter_demuxmix $maxIter_demuxmix --correctTails $correctTails \ --k_hto $k_hto --k_rna $k_rna --outputdir demuxmix_${task.index} --assignmentOutDemuxmix $assignmentOutDemuxmix --gene_col $gene_col """ - } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow demuxmix_hashing{ +workflow demuxmix_hashing { take: hto_matrix rna_matrix @@ -67,16 +66,15 @@ workflow demuxmix_hashing{ k_hto = split_input(params.k_hto) k_rna = split_input(params.k_rna) correctTails = split_input(params.correctTails) - assignmentOutDemuxmix = split_input(params.assignmentOutDemuxmix) + assignmentOutDemuxmix = split_input(params.assignmentOutDemuxmix) gene_col = split_input(params.gene_col) - - demuxmix(hto_matrix,rna_matrix,hto_raw_or_filtered,rna_raw_or_filtered,rna_available, assay,ndelim,model, alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna,correctTails,assignmentOutDemuxmix,gene_col ) - + demuxmix(hto_matrix, rna_matrix, hto_raw_or_filtered, rna_raw_or_filtered, rna_available, assay, ndelim, model, alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna, correctTails, assignmentOutDemuxmix, gene_col) + emit: demuxmix.out.collect() } -workflow{ +workflow { demuxmix_hashing() } diff --git a/modules/single/hash_demulti/gmm_demux.nf b/modules/single/hash_demulti/gmm_demux.nf index 88c70ba..c998280 100644 --- a/modules/single/hash_demulti/gmm_demux.nf +++ b/modules/single/hash_demulti/gmm_demux.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process gmm_demux{ +process gmm_demux { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/gmm_demux", mode:'copy' label 'small_mem' conda "$projectDir/conda/gmm_demux.yml" - + input: path path_hto //HTO names as string separated by commas @@ -16,59 +16,55 @@ process gmm_demux{ //obligatory each summary //need to be combined with summary to get a report as file - each report_gmm + each report_gmm //mode 4 // write csv or tsv - type of input each mode_GMM //case 5 - each extract + each extract //float between 0 and 1 each threshold_gmm each ambiguous - - - + output: path "gmm_demux_${task.index}" - + script: def extract_droplets = extract != 'None' ? " -x ${extract}" : '' def ambiguous_droplets = extract != 'None' ? " --ambiguous ${ambiguous}" : '' - if(mode_GMM=="csv"){ + if (mode_GMM == 'csv') { """ mkdir gmm_demux_${task.index} touch gmm_demux_${task.index}_$report_gmm - + GMM-demux -c $path_hto $hto_name_gmm -u $summary --report gmm_demux_${task.index}_$report_gmm --full gmm_demux_${task.index} $extract_droplets -t $threshold_gmm gmm_demux_params.py --path_hto $path_hto --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${task.index}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${task.index} - + """ }else { """ mkdir gmm_demux_${task.index} touch gmm_demux_${task.index}_$report_gmm - + GMM-demux $path_hto $hto_name_gmm -u $summary -r gmm_demux_${task.index}_$report_gmm --full gmm_demux_${task.index} -o gmm_demux_${task.index} $extract_droplets -t $threshold_gmm gmm_demux_params.py --path_hto $path_hto --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${task.index}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${task.index} - + """ } - - } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow gmm_demux_hashing{ - take: +workflow gmm_demux_hashing { + take: hto_matrix main: hto_name_gmm = split_input(params.hto_name_gmm) @@ -79,14 +75,12 @@ workflow gmm_demux_hashing{ threshold_gmm = split_input(params.threshold_gmm) ambiguous = split_input(params.ambiguous) - gmm_demux(hto_matrix,hto_name_gmm,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) - + gmm_demux(hto_matrix, hto_name_gmm, summary, report_gmm, mode, extract, threshold_gmm, ambiguous) + emit: gmm_demux.out.collect() } - -workflow{ +workflow { gmm_demux_hashing() - } diff --git a/modules/single/hash_demulti/hashsolo.nf b/modules/single/hash_demulti/hashsolo.nf index d2a03a8..0093b99 100755 --- a/modules/single/hash_demulti/hashsolo.nf +++ b/modules/single/hash_demulti/hashsolo.nf @@ -1,6 +1,6 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 -process hash_solo{ +nextflow.enable.dsl = 2 +process hash_solo { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/hashsolo", mode:'copy' label 'small_mem' @@ -17,13 +17,13 @@ process hash_solo{ each number_of_noise_barcodes val assignmentOutHashSolo val plotOutHashSolo - + output: path "hashsolo_${task.index}" script: - def noise_barcodes = number_of_noise_barcodes != "None" ? "--number_of_noise_barcodes $number_of_noise_barcodes" : '' - def existing_clusters = pre_existing_clusters != "None" ? "--pre_existing_clusters $pre_existing_clusters" : '' + def noise_barcodes = number_of_noise_barcodes != 'None' ? "--number_of_noise_barcodes $number_of_noise_barcodes" : '' + def existing_clusters = pre_existing_clusters != 'None' ? "--pre_existing_clusters $pre_existing_clusters" : '' def clustering_data = use_rna_data != 'False' ? "--clustering_data rna_data_${params.rna_matrix_hashsolo}" : '' """ mkdir hashsolo_${task.index} @@ -32,15 +32,13 @@ process hash_solo{ --assignmentOutHashSolo $assignmentOutHashSolo \ --plotOutHashSolo $plotOutHashSolo --outputdir hashsolo_${task.index} """ - } - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } @@ -62,4 +60,4 @@ workflow hash_solo_hashing { hash_solo(hto_matrix, priors_negative, priors_singlet, priors_doublet, pre_existing_clusters, rna_matrix, use_rna_data, number_of_noise_barcodes, assignmentOutHashSolo, plotOutHashSolo) emit: hash_solo.out.collect() -} \ No newline at end of file +} diff --git a/modules/single/hash_demulti/htodemux.nf b/modules/single/hash_demulti/htodemux.nf index 317594b..c211ff3 100644 --- a/modules/single/hash_demulti/htodemux.nf +++ b/modules/single/hash_demulti/htodemux.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process htodemux{ +process htodemux { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/htodemux", mode: 'copy' label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" - + + conda 'conda-forge::r-seurat conda-forge::r-argparse' + input: each seurat_object val assay @@ -18,7 +18,7 @@ process htodemux{ each init val objectOutHTO val assignmentOutHTO - + //Ridge plot params each ridgePlot each ridgeNCol @@ -41,36 +41,34 @@ process htodemux{ //Heatmap each heatmap each heatmapNcells - + output: path "htodemux_${task.index}" - + script: def init_val = init != 'None' ? " --init $init" : '' - def vln_log = vlnLog != 'False' ? "--vlnLog" : '' - def invert = tsneInvert != 'False' ? "--tSNEInvert" : '' - def verbose = tsneVerbose != 'False' ? "--tSNEVerbose" : '' - def approx = tsneApprox != 'False' ? "--tSNEApprox" : '' - + def vln_log = vlnLog != 'False' ? '--vlnLog' : '' + def invert = tsneInvert != 'False' ? '--tSNEInvert' : '' + def verbose = tsneVerbose != 'False' ? '--tSNEVerbose' : '' + def approx = tsneApprox != 'False' ? '--tSNEApprox' : '' + """ mkdir htodemux_${task.index} HTODemux.R --seuratObject $seurat_object --assay $assay --quantile $quantile --kfunc $kfunc --nstarts $nstarts --nsamples $nsamples --seed $seed $init_val --objectOutHTO $objectOutHTO --assignmentOutHTO $assignmentOutHTO --outputdir htodemux_${task.index} HTODemux-visualisation.R --hashtagPath htodemux_${task.index}/${objectOutHTO}.rds --assay $assay --ridgePlot $ridgePlot --ridgeNCol $ridgeNCol --featureScatter $featureScatter --scatterFeat1 $scatterFeat1 --scatterFeat2 $scatterFeat2 --vlnPlot $vlnplot --vlnFeatures $vlnFeatures $vln_log --tSNE $tsne --tSNEIdents $tsneIdents $invert $verbose $approx --tSNEDimMax $tsneDimMax --tSNEPerplexity $tsnePerplexity --heatMap $heatmap --heatMapNcells $heatmapNcells --outputdir htodemux_${task.index} """ - } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } - -workflow htodemux_hashing{ +workflow htodemux_hashing { take: seurat_object main: @@ -78,12 +76,12 @@ workflow htodemux_hashing{ assay = params.assay kfunc = split_input(params.kfunc) nstarts = split_input(params.nstarts) - nsamples = split_input(params.nsamples) + nsamples = split_input(params.nsamples_clustering) seed = split_input(params.seed) init = split_input(params.init) objectOutHTO = params.objectOutHTO assignmentOutHTO = params.assignmentOutHTO - + ridgePlot = split_input(params.ridgePlot) ridgeNCol = split_input(params.ridgeNCol) featureScatter = split_input(params.featureScatter) @@ -92,7 +90,7 @@ workflow htodemux_hashing{ vlnplot = split_input(params.vlnplot) vlnFeatures = split_input(params.vlnFeatures) vlnLog = split_input(params.vlnLog) - + tsne = split_input(params.tsne) tsneIdents = split_input(params.tsneIdents) tsneInvert = split_input(params.tsneInvert) @@ -109,6 +107,6 @@ workflow htodemux_hashing{ htodemux.out.collect() } -workflow{ +workflow { htodemux_hashing() } diff --git a/modules/single/hash_demulti/multiseq.nf b/modules/single/hash_demulti/multiseq.nf index f56442c..12795f0 100644 --- a/modules/single/hash_demulti/multiseq.nf +++ b/modules/single/hash_demulti/multiseq.nf @@ -1,12 +1,12 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 -process multi_seq{ +process multi_seq { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/multiseq", mode:'copy' label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" - + + conda 'conda-forge::r-seurat conda-forge::r-argparse' + input: each rdsObject each quantile @@ -24,25 +24,25 @@ process multi_seq{ path "multiseq_${task.index}" script: - def autoThr = autoThresh != 'False' ? " --autoThresh" : '' - def verb = verbose != 'False' ? " --verbose" : '' - + def autoThr = autoThresh != 'False' ? ' --autoThresh' : '' + def verb = verbose != 'False' ? ' --verbose' : '' + """ mkdir multiseq_${task.index} MultiSeq.R --seuratObjectPath $rdsObject --assay $assay --quantile $quantile $autoThr --maxiter $maxiter --qrangeFrom $qrangeFrom --qrangeTo $qrangeTo --qrangeBy $qrangeBy $verb --objectOutMulti $objectOutMulti --assignmentOutMulti $assignmentOutMulti --outputdir multiseq_${task.index} """ } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow multiseq_hashing{ +workflow multiseq_hashing { take: rdsObject main: diff --git a/modules/single/hash_demulti/preprocess.nf b/modules/single/hash_demulti/preprocess.nf index 422931e..ecaeed7 100644 --- a/modules/single/hash_demulti/preprocess.nf +++ b/modules/single/hash_demulti/preprocess.nf @@ -1,9 +1,9 @@ -process preprocess{ +process preprocess { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/preprocess", mode:'copy' label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" - + + conda 'conda-forge::r-seurat conda-forge::r-argparse' + input: path hto_matrix, stageAs: 'hto_data' path umi_matrix, stageAs: 'rna_data' @@ -22,7 +22,7 @@ process preprocess{ path "preprocess_${task.index}_hto_${hto_raw_or_filtered}_rna_${rna_raw_or_filtered}" script: - + """ mkdir preprocess_${task.index}_hto_${hto_raw_or_filtered}_rna_${rna_raw_or_filtered} pre_processing.R --fileUmi rna_data --fileHto hto_data --ndelim $ndelim \ @@ -31,20 +31,18 @@ process preprocess{ --outputdir preprocess_${task.index}_hto_${hto_raw_or_filtered}_rna_${rna_raw_or_filtered} --gene_col $gene_col """ - } - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() +def split_input(input) { + if (input =~ /;/) { + Channel.from(input).map { return it.tokenize(';') }.flatten() } - else{ + else { Channel.from(input) } } -workflow preprocessing_hashing{ +workflow preprocessing_hashing { take: hto_matrix rna_matrix @@ -59,7 +57,7 @@ workflow preprocessing_hashing{ norm_method = split_input(params.norm_method) out_file = params.preprocessOut gene_col = split_input(params.gene_col) - preprocess(hto_matrix, rna_matrix, hto_raw_or_filtered, rna_raw_or_filtered, ndelim, sel_method, n_features, assay, margin, norm_method,out_file,gene_col) + preprocess(hto_matrix, rna_matrix, hto_raw_or_filtered, rna_raw_or_filtered, ndelim, sel_method, n_features, assay, margin, norm_method, out_file, gene_col) emit: preprocess.out.collect() -} \ No newline at end of file +} diff --git a/modules/single/hash_demultiplexing.nf b/modules/single/hash_demultiplexing.nf index bde9466..799c7b5 100644 --- a/modules/single/hash_demultiplexing.nf +++ b/modules/single/hash_demultiplexing.nf @@ -1,5 +1,5 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +nextflow.enable.dsl = 2 include { preprocessing_hashing as preprocessing_hashing_htodemux } from './hash_demulti/preprocess' include { preprocessing_hashing as preprocessing_hashing_multiseq } from './hash_demulti/preprocess' include { multiseq_hashing } from './hash_demulti/multiseq' @@ -11,11 +11,11 @@ include { demuxmix_hashing } from './hash_demulti/demuxmix' include { gmm_demux_hashing } from './hash_demulti/gmm_demux' include { bff_hashing } from './hash_demulti/bff' -process summary{ +process summary { publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti", mode: 'copy' label 'small_mem' - conda "pandas scanpy mudata" + conda 'pandas scanpy mudata' input: val demuxem_result @@ -30,164 +30,162 @@ process summary{ val generate_mudata path rna_matrix, stageAs: 'rna_data' path hto_matrix, stageAs: 'hto_data' - + output: path hash_summary script: - def demuxem_files = "" - def htodemux_files = "" - def hashsolo_files = "" - def multiseq_files = "" - def hashedDrops_files = "" - def gmmDemux_files = "" - def demuxmix_files = "" - def bff_files = "" - def generate_adata = "" - def generate_mdata = "" - - if (demuxem_result != "no_result"){ - demuxem_files = "--demuxem ${demuxem_result.join(":")}" + def demuxem_files = '' + def htodemux_files = '' + def hashsolo_files = '' + def multiseq_files = '' + def hashedDrops_files = '' + def gmmDemux_files = '' + def demuxmix_files = '' + def bff_files = '' + def generate_adata = '' + def generate_mdata = '' + + if (demuxem_result != 'no_result') { + demuxem_files = "--demuxem ${demuxem_result.join(':')}" } - if (hashsolo_result != "no_result"){ - hashsolo_files = "--hashsolo ${hashsolo_result.join(":")}" + if (hashsolo_result != 'no_result') { + hashsolo_files = "--hashsolo ${hashsolo_result.join(':')}" } - if (htodemux_result != "no_result"){ - htodemux_files = "--htodemux ${htodemux_result.join(":")}" + if (htodemux_result != 'no_result') { + htodemux_files = "--htodemux ${htodemux_result.join(':')}" } - if (multiseq_result != "no_result"){ - multiseq_files = "--multiseq ${multiseq_result.join(":")}" + if (multiseq_result != 'no_result') { + multiseq_files = "--multiseq ${multiseq_result.join(':')}" } - if (hashedDrops_result != "no_result"){ - hashedDrops_files = "--hashedDrops ${hashedDrops_result.join(":")}" + if (hashedDrops_result != 'no_result') { + hashedDrops_files = "--hashedDrops ${hashedDrops_result.join(':')}" } - if (gmmDemux_result != "no_result"){ - gmmDemux_files = "--gmm_demux ${gmmDemux_result.join(":")}" + if (gmmDemux_result != 'no_result') { + gmmDemux_files = "--gmm_demux ${gmmDemux_result.join(':')}" } - if (demuxmix_result != "no_result"){ - demuxmix_files = "--demuxmix ${demuxmix_result.join(":")}" + if (demuxmix_result != 'no_result') { + demuxmix_files = "--demuxmix ${demuxmix_result.join(':')}" } - if (bff_result != "no_result"){ - bff_files = "--bff ${bff_result.join(":")}" + if (bff_result != 'no_result') { + bff_files = "--bff ${bff_result.join(':')}" } - if (generate_anndata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." + if (generate_anndata == 'True') { + if (rna_matrix.name == 'None') { + error 'Error: RNA count matrix is not given.' } - generate_adata = "--generate_anndata --read_rna_mtx rna_data" + generate_adata = '--generate_anndata --read_rna_mtx rna_data' } - if (generate_mudata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." + if (generate_mudata == 'True') { + if (rna_matrix.name == 'None') { + error 'Error: RNA count matrix is not given.' } - if(hto_matrix.name == "None"){ - error "Error: HTO count matrix is not given." + if (hto_matrix.name == 'None') { + error 'Error: HTO count matrix is not given.' } - generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" + generate_mdata = '--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data' } - + """ summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata """ } - -workflow hash_demultiplexing{ +workflow hash_demultiplexing { take: rna_matrix_raw rna_matrix_filtered hto_matrix_raw hto_matrix_filtered main: - - if (params.htodemux == "True"){ - rna_matrix = params.rna_matrix_htodemux == "raw" ? rna_matrix_raw : rna_matrix_filtered - hto_matrix = params.hto_matrix_htodemux == "raw" ? hto_matrix_raw : hto_matrix_filtered + + if (params.htodemux == 'True') { + rna_matrix = params.rna_matrix_htodemux == 'raw' ? rna_matrix_raw : rna_matrix_filtered + hto_matrix = params.hto_matrix_htodemux == 'raw' ? hto_matrix_raw : hto_matrix_filtered preprocessing_hashing_htodemux(hto_matrix, rna_matrix, params.hto_matrix_htodemux, params.rna_matrix_htodemux) htodemux_preprocess_out = preprocessing_hashing_htodemux.out htodemux_hashing(htodemux_preprocess_out) htodemux_out = htodemux_hashing.out } - else{ - htodemux_out = channel.value("no_result") + else { + htodemux_out = channel.value('no_result') } - - if (params.multiseq == "True"){ - if (params.htodemux == "True" & params.hto_matrix_htodemux == params.hto_matrix_multiseq & - params.rna_matrix_htodemux == params.rna_matrix_multiseq){ + + if (params.multiseq == 'True') { + if (params.htodemux == 'True' & params.hto_matrix_htodemux == params.hto_matrix_multiseq & + params.rna_matrix_htodemux == params.rna_matrix_multiseq) { multiseq_preprocess_out = htodemux_preprocess_out - } - else{ - rna_matrix = params.rna_matrix_multiseq == "raw" ? rna_matrix_raw : rna_matrix_filtered - hto_matrix = params.hto_matrix_multiseq == "raw" ? hto_matrix_raw : hto_matrix_filtered + } + else { + rna_matrix = params.rna_matrix_multiseq == 'raw' ? rna_matrix_raw : rna_matrix_filtered + hto_matrix = params.hto_matrix_multiseq == 'raw' ? hto_matrix_raw : hto_matrix_filtered preprocessing_hashing_multiseq(hto_matrix, rna_matrix, params.hto_matrix_multiseq, params.rna_matrix_multiseq) multiseq_preprocess_out = preprocessing_hashing_multiseq.out } multiseq_hashing(multiseq_preprocess_out) multiseq_out = multiseq_hashing.out } - else{ - multiseq_out = channel.value("no_result") + else { + multiseq_out = channel.value('no_result') } - - if (params.hashsolo == "True"){ - hashsolo_hto_input = params.hto_matrix_hashsolo == "raw" ? hto_matrix_raw : hto_matrix_filtered - hashsolo_rna_input = params.rna_matrix_hashsolo == "False" ? channel.value("None") : - (params.rna_matrix_hashsolo == "raw" ? rna_matrix_raw : rna_matrix_filtered) + + if (params.hashsolo == 'True') { + hashsolo_hto_input = params.hto_matrix_hashsolo == 'raw' ? hto_matrix_raw : hto_matrix_filtered + hashsolo_rna_input = params.rna_matrix_hashsolo == 'False' ? channel.value('None') : + (params.rna_matrix_hashsolo == 'raw' ? rna_matrix_raw : rna_matrix_filtered) hash_solo_hashing(hashsolo_hto_input, hashsolo_rna_input) hashsolo_out = hash_solo_hashing.out } - else{ - hashsolo_out = channel.value("no_result") + else { + hashsolo_out = channel.value('no_result') } - - if (params.demuxem == "True"){ - demuxem_hto_input = params.hto_matrix_demuxem == "raw" ? hto_matrix_raw : hto_matrix_filtered - demuxem_rna_input = params.rna_matrix_demuxem == "raw" ? rna_matrix_raw : rna_matrix_filtered + + if (params.demuxem == 'True') { + demuxem_hto_input = params.hto_matrix_demuxem == 'raw' ? hto_matrix_raw : hto_matrix_filtered + demuxem_rna_input = params.rna_matrix_demuxem == 'raw' ? rna_matrix_raw : rna_matrix_filtered demuxem_hashing(demuxem_hto_input, demuxem_rna_input) demuxem_out = demuxem_hashing.out } - else{ - demuxem_out = channel.value("no_result") + else { + demuxem_out = channel.value('no_result') } - - if (params.hashedDrops == "True"){ - hashedDrops_hto_input = params.hto_matrix_hashedDrops == "raw" ? hto_matrix_raw : hto_matrix_filtered + + if (params.hashedDrops == 'True') { + hashedDrops_hto_input = params.hto_matrix_hashedDrops == 'raw' ? hto_matrix_raw : hto_matrix_filtered hashedDrops_hashing(hashedDrops_hto_input) hashedDrops_out = hashedDrops_hashing.out } - else{ - hashedDrops_out = channel.value("no_result") + else { + hashedDrops_out = channel.value('no_result') } - if (params.demuxmix == "True"){ - demuxmix_rna_input = params.rna_available == "False" ? channel.value("None") : - (params.rna_matrix_demuxmix == "raw" ? hto_matrix_raw : hto_matrix_filtered) - demuxmix_hto_input = params.hto_matrix_demuxmix == "raw" ? rna_matrix_raw : rna_matrix_filtered - demuxmix_hashing(demuxmix_hto_input,demuxmix_rna_input,params.hto_matrix_demuxmix, params.rna_matrix_demuxmix,params.rna_available) + if (params.demuxmix == 'True') { + demuxmix_rna_input = params.rna_available == 'False' ? channel.value('None') : + (params.rna_matrix_demuxmix == 'raw' ? hto_matrix_raw : hto_matrix_filtered) + demuxmix_hto_input = params.hto_matrix_demuxmix == 'raw' ? rna_matrix_raw : rna_matrix_filtered + demuxmix_hashing(demuxmix_hto_input, demuxmix_rna_input, params.hto_matrix_demuxmix, params.rna_matrix_demuxmix, params.rna_available) demuxmix_out = demuxmix_hashing.out } - else{ - demuxmix_out = channel.value("no_result") + else { + demuxmix_out = channel.value('no_result') } - if (params.bff == "True"){ - bff_hto_input = params.hto_matrix_bff == "raw" ? hto_matrix_raw : hto_matrix_filtered + if (params.bff == 'True') { + bff_hto_input = params.hto_matrix_bff == 'raw' ? hto_matrix_raw : hto_matrix_filtered bff_hashing(bff_hto_input) bff_out = bff_hashing.out } - else{ - bff_out = channel.value("no_result") + else { + bff_out = channel.value('no_result') } - if (params.gmmDemux == "True"){ - gmmDemux_hto_input = params.hto_matrix_gmm_demux == "raw" ? hto_matrix_raw : hto_matrix_filtered + if (params.gmmDemux == 'True') { + gmmDemux_hto_input = params.hto_matrix_gmm_demux == 'raw' ? hto_matrix_raw : hto_matrix_filtered gmm_demux_hashing(gmmDemux_hto_input) gmmDemux_out = gmm_demux_hashing.out } - else{ - gmmDemux_out = channel.value("no_result") + else { + gmmDemux_out = channel.value('no_result') } - - summary(demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,gmmDemux_out, demuxmix_out,bff_out, + summary(demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out, gmmDemux_out, demuxmix_out, bff_out, params.generate_anndata, params.generate_mudata, rna_matrix_filtered, hto_matrix_filtered) emit: summary.out diff --git a/multi_sample_input.csv b/multi_sample_input.csv index f4b27f5..6dfc981 100644 --- a/multi_sample_input.csv +++ b/multi_sample_input.csv @@ -1,3 +1,3 @@ -sampleId,rna_matrix_raw,rna_matrix_filtered,hto_matrix_raw,hto_matrix_filtered,bam,bam_index,barcodes,nsample,cell_data,vcf_mixed,vcf_donor,vireo_parent_dir,demultiplexing_result +sampleId,rna_matrix_raw,rna_matrix_filtered,hto_matrix_raw,hto_matrix_filtered,bam,bam_index,barcodes,nsamples_genetic,cell_data,vcf_mixed,vcf_donor,vireo_parent_dir,demultiplexing_result sample1,sample1/rna/raw_feature_bc_matrix,sample1/rna/filtered_feature_bc_matrix,sample1/hto/raw_feature_bc_matrix,sample1/hto/filtered_feature_bc_matrix,sample1/pooled.sorted.bam,sample1/pooled.sorted.bam.bai,sample1/rna/filtered_feature_bc_matrix/barcodes.tsv,2,None,None,None,None,None sample2,sample2/rna/raw_feature_bc_matrix,sample2/rna/filtered_feature_bc_matrix,sample2/hto/raw_feature_bc_matrix,sample2/hto/filtered_feature_bc_matrix,sample2/pooled.sorted.bam,sample2/pooled.sorted.bam.bai,sample2/rna/filtered_feature_bc_matrix/barcodes.tsv,2,None,None,None,None,None \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index ccc10f2..9b947bb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -29,7 +29,7 @@ params { quantile_htodemux = "0.99" kfunc = "clara" nstarts = 100 - nsamples = 100 + nsamples_clustering = 100 seed = 42 init = "None" objectOutHTO = "htodemux" @@ -177,7 +177,7 @@ params { barcodes = "None" fasta = "None" fasta_index = "None" - nsample = 2 + nsamples_genetic = 2 common_variants_scSplit = "None" common_variants_souporcell = "None" common_variants_freemuxlet = "None" @@ -212,7 +212,7 @@ params { r2_info = "R2" min_mac = 1 min_callrate = 0.50 - alpha = "0.5" // must be string, multiple values in a single run should be comma separated + alpha = "0.5" // must be quoted doublet_prior = 0.5 demuxlet_out = "demuxlet_res" @@ -255,7 +255,7 @@ params { min_ref = 10 max_loci = 2048 restarts = "None" - use_known_genotype = "True" + use_known_genotype = "False" known_genotypes_sample_names = "None" skip_remap = "True" ignore = "False" @@ -290,7 +290,7 @@ params { report_all_haplotype_alleles = "False" report_monomorphic = "False" pvar = 0.0 - strict_vcf = "False" + strict_vcf = "False" theta = 0.001 pooled_discrete = "False" diff --git a/test.config b/test.config index 5a60ee6..5c74183 100644 --- a/test.config +++ b/test.config @@ -12,7 +12,7 @@ params { barcodes = "$projectDir/test_data/barcodes.tsv" fasta = "$projectDir/test_data/genome_chr1.fa" fasta_index = "$projectDir/test_data/genome_chr1.fa.fai" - nsample = 2 + nsamples_genetic = 2 common_variants_scSplit = "$projectDir/test_data/common_variants_hg19_list.vcf" common_variants_souporcell = "$projectDir/test_data/common_variants_hg19.vcf" common_variants_freemuxlet = "$projectDir/test_data/jurkat_293t_exons_only.vcf.withAF.vcf.gz" diff --git a/test_data/download_data.sh b/test_data/download_data.sh index ea3aa9b..8dee2a4 100644 --- a/test_data/download_data.sh +++ b/test_data/download_data.sh @@ -6,9 +6,11 @@ wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1xl9g wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1jlhEO1Z7YGYVnxv1YO9arjDwGFeZbfkr' -O jurkat_293t_downsampled_n500_full_bam.bam.bai FILEID="13CV6CjP9VzmwG5MVHbJiVDMVdiIhGdJB" FILENAME="jurkat_293t_downsampled_n500_full_bam.bam" -wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=$FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$FILEID" -O $FILENAME && rm -rf /tmp/cookies.txt +# wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=$FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$FILEID" -O $FILENAME && rm -rf /tmp/cookies.txt +pip install gdown +gdown --id $FILEID -O $FILENAME wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1MmwEiOsdzEfRdXS6oXXBwMJXUovKWcni' -O final_res.zip -unzip final_res.zip +unzip final_res.zip rm final_res.zip mv final_res/jurkat_293t_demuxlet.best . rm -rf final_res @@ -24,7 +26,12 @@ wget --no-check-certificate https://figshare.com/ndownloader/files/43102453 -O g wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1lw4T6d7uXsm9dt39ZtEwpuB2VTY3wK1y' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1lw4T6d7uXsm9dt39ZtEwpuB2VTY3wK1y" -O common_variants_hg19.vcf && rm -rf /tmp/cookies.txt wget https://master.dl.sourceforge.net/project/cellsnp/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg19.vcf.gz # Processed by bcftools query -f '%CHROM:%POS\n' common_variants_hg19.vcf > common_variants_hg19_list.vcf -wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/41773431/common_variants_hg19_list.vcf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20230731/eu-west-1/s3/aws4_request&X-Amz-Date=20230731T153655Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=283575c6ccb3104b8b95684e6d955abd28b47db71c18d1eeec99ae5dab65ff7b +wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/41773431/common_variants_hg19_list.vcf?X-Amz-Algorithm=AWS4-HMAC-SHA256 & +X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20230731/eu-west-1/s3/aws4_request & +X-Amz-Date=20230731T153655Z & +X-Amz-Expires=10 & +X-Amz-SignedHeaders=host & +X-Amz-Signature=283575c6ccb3104b8b95684e6d955abd28b47db71c18d1eeec99ae5dab65ff7b # Download simualated data # Can also run Rscript simulation.R @@ -34,9 +41,8 @@ wget --no-check-certificate https://figshare.com/ndownloader/files/41773428 -O b wget --no-check-certificate https://figshare.com/ndownloader/files/41773431 -O common_variants_hg19_list.vcf # simulated HTO and RNA counts wget --no-check-certificate https://figshare.com/ndownloader/files/41779407 -O hto.zip -unzip hto.zip +unzip hto.zip wget --no-check-certificate https://figshare.com/ndownloader/files/41779260 -O rna.zip unzip rna.zip rm hto.zip rm rna.zip -