From 749f8be933d35c95b177af9d52760362dff8439e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 6 Nov 2023 19:24:41 +0100 Subject: [PATCH 01/32] correction to merged files --- bin/bff.R | 13 ------------- bin/demuxem.py | 4 ++++ bin/summary_hash.py | 5 ++--- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/bin/bff.R b/bin/bff.R index e4a4606..6154dcf 100755 --- a/bin/bff.R +++ b/bin/bff.R @@ -89,19 +89,6 @@ if (!is.null(args$methodsForConsensus)) { perCell_args <- args$perCellSaturation perCell <- ifelse(perCell_args == "null" || perCell_args == "Null", NULL, perCell_args) -<<<<<<< HEAD -<<<<<<< HEAD -print("---------------------") -print(perCell) -print("---------------------") -======= -print(perCell) - ->>>>>>> c781241 (bff re-added problematic parameter) -======= -print("---------------------") -print(perCell) -print("---------------------") if(args$methodsForConsensus=="bff_raw" || args$methodsForConsensus=="bff_cluster" || args$methodsForConsensus=="bff_raw,bff_cluster" || is.null(args$methodsForConsensus) ) #Only Bff in its different variations is available diff --git a/bin/demuxem.py b/bin/demuxem.py index 02b57a8..e6e8702 100755 --- a/bin/demuxem.py +++ b/bin/demuxem.py @@ -41,6 +41,10 @@ # Extract rna and hashing data #rna_data = data.get_data(modality="rna") #hashing_data = data.get_data(modality="hashing") + rna = args.rna_matrix_dir + print("-------------------") + print(rna) + print("-------------------") filter = "" if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']: filter = True diff --git a/bin/summary_hash.py b/bin/summary_hash.py index f43603a..bdf8582 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -450,13 +450,13 @@ def bff_summary(bff_res,raw_adata, raw_mudata): dt_assign = dt_assign.drop([column], axis=1) dt_assign.loc[dt_assign["consensuscall"] == "Singlet", "consensuscall"] = "singlet" dt_assign.loc[dt_assign["consensuscall"] == "Doublet", "consensuscall"] = "doublet" + dt_assign.loc[dt_assign["consensuscall"] == "Negative", "consensuscall"] = "negative" + dt_assign['consensuscall'] = dt_assign['consensuscall'].astype('category') dt_assign = dt_assign.rename(columns={"cellbarcode": "Barcode", "consensuscall": os.path.basename(x)}) assign.append(dt_assign) if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(dt_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) @@ -465,7 +465,6 @@ def bff_summary(bff_res,raw_adata, raw_mudata): if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(dt_assign, left_index=True, right_index=True, how='left') mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") From e5810bc2f9537a48aa983873c8106542c171eb89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 6 Nov 2023 22:55:48 +0100 Subject: [PATCH 02/32] correction merged files --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index afecaa3..bd056af 100644 --- a/nextflow.config +++ b/nextflow.config @@ -448,7 +448,7 @@ process { conda = "$projectDir/conda/demuxmix.yml" } withName:bff { - conda = './conda/bff.yml' + conda = "$projectDir/conda/dropletutils.yml" } withName:matchDonor { conda = "$projectDir/conda/donor_match.yml" From 2393ad3cbc92a88156cca6abaf31f9fde807984d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 6 Nov 2023 23:00:24 +0100 Subject: [PATCH 03/32] correction merged filed --- modules/single/hash_demulti/gmm_demux.nf | 64 ++++++++++-------------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/modules/single/hash_demulti/gmm_demux.nf b/modules/single/hash_demulti/gmm_demux.nf index 0f0ec38..990b340 100644 --- a/modules/single/hash_demulti/gmm_demux.nf +++ b/modules/single/hash_demulti/gmm_demux.nf @@ -2,32 +2,31 @@ nextflow.enable.dsl=2 process gmm_demux{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/gmm_demux", mode:'copy' + publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/gmm_demux", mode:'copy' label 'small_mem' input: - path path_hto + tuple val(sampleId), path(filtered_hto_matrix_dir), val(hto_name_gmm) //HTO names as string separated by commas - each hto_name_gmm - //5 cases are available for the tool it all depends on the vars given + //val hto_name_gmm //mode 2 //need estimate number of cells in the single cell assay //obligatory - each summary + val summary //need to be combined with summary to get a report as file - each report_gmm + val report_gmm //mode 4 // write csv or tsv - type of input - each mode_GMM + val mode_GMM //case 5 - each extract + val extract //float between 0 and 1 - each threshold_gmm - each ambiguous + val threshold_gmm + val ambiguous output: - path "gmm_demux_${task.index}" + path "gmm_demux_${sampleId}" script: def extract_droplets = extract != 'None' ? " -x ${extract}" : '' @@ -35,20 +34,20 @@ process gmm_demux{ if(mode_GMM=="csv"){ """ - mkdir gmm_demux_${task.index} - touch gmm_demux_${task.index}_$report_gmm + mkdir gmm_demux_${sampleId} + touch gmm_demux_${sampleId}_$report_gmm - GMM-demux -c $path_hto $hto_name_gmm -u $summary --report gmm_demux_${task.index}_$report_gmm --full gmm_demux_${task.index} $extract_droplets -t $threshold_gmm - gmm_demux_params.py --path_hto $path_hto --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${task.index}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${task.index} + GMM-demux -c $filtered_hto_matrix_dir $hto_name_gmm -u $summary --report gmm_demux_${sampleId}_$report_gmm --full gmm_demux_${sampleId} $extract_droplets -t $threshold_gmm + gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${sampleId}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${sampleId} """ }else { """ - mkdir gmm_demux_${task.index} - touch gmm_demux_${task.index}_$report_gmm + mkdir gmm_demux_${sampleId} + touch gmm_demux_${sampleId}_$report_gmm - GMM-demux $path_hto $hto_name_gmm -u $summary -r gmm_demux_${task.index}_$report_gmm --full gmm_demux_${task.index} -o gmm_demux_${task.index} $extract_droplets -t $threshold_gmm - gmm_demux_params.py --path_hto $path_hto --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${task.index}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${task.index} + GMM-demux $filtered_hto_matrix_dir $hto_name_gmm -u $summary -r gmm_demux_${sampleId}_$report_gmm --full gmm_demux_${sampleId} -o gmm_demux_${sampleId} $extract_droplets -t $threshold_gmm + gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${sampleId}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${sampleId} """ } @@ -56,28 +55,19 @@ process gmm_demux{ } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} workflow gmm_demux_hashing{ - take: +take: hto_matrix - main: - hto_name_gmm = split_input(params.hto_name_gmm) - summary = split_input(params.summary) - report_gmm = split_input(params.report_gmm) - mode = split_input(params.mode_GMM) - extract = split_input(params.extract) - threshold_gmm = split_input(params.threshold_gmm) - ambiguous = split_input(params.ambiguous) + main: + summary = params.summary + report_gmm = params.report_gmm + mode = params.mode_GMM + extract = params.extract + threshold_gmm = params.threshold_gmm + ambiguous = params.ambiguous - gmm_demux(hto_matrix,hto_name_gmm,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) + gmm_demux(hto_matrix,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) emit: gmm_demux.out.collect() From c93582eb9a6f66f489ac2642d0ee12de28437f79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 6 Nov 2023 23:07:55 +0100 Subject: [PATCH 04/32] correction merged files --- modules/hash_demultiplexing.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/hash_demultiplexing.nf b/modules/hash_demultiplexing.nf index 4fb87a6..c2a61ae 100644 --- a/modules/hash_demultiplexing.nf +++ b/modules/hash_demultiplexing.nf @@ -91,7 +91,7 @@ process summary{ generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" } - """ + """ summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId """ } From 2dfd17f57fcfb7b5862d93820d29e3f7aa8bc7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 6 Nov 2023 23:12:00 +0100 Subject: [PATCH 05/32] correction merged --- modules/hash_demultiplexing.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/hash_demultiplexing.nf b/modules/hash_demultiplexing.nf index c2a61ae..4fb87a6 100644 --- a/modules/hash_demultiplexing.nf +++ b/modules/hash_demultiplexing.nf @@ -91,7 +91,7 @@ process summary{ generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" } - """ + """ summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId """ } From 142eca88029eda5828899924e8d9631dcb086c66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 6 Nov 2023 23:31:49 +0100 Subject: [PATCH 06/32] correction merge --- modules/hash_demulti/gmm_demux.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/hash_demulti/gmm_demux.nf b/modules/hash_demulti/gmm_demux.nf index 337ae31..990b340 100644 --- a/modules/hash_demulti/gmm_demux.nf +++ b/modules/hash_demulti/gmm_demux.nf @@ -59,7 +59,6 @@ process gmm_demux{ workflow gmm_demux_hashing{ take: hto_matrix - hto_name_gmm main: summary = params.summary report_gmm = params.report_gmm From a1e6078c3d6fc1e3095856f0ced10d13781d472b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Tue, 7 Nov 2023 21:34:07 +0100 Subject: [PATCH 07/32] general code for mudata added --- bin/generate_data.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/bin/generate_data.py b/bin/generate_data.py index 96fc494..b1d1043 100755 --- a/bin/generate_data.py +++ b/bin/generate_data.py @@ -27,5 +27,17 @@ adata.write("adata_with_donor_matching.h5ad") if args.generate_mudata: - # write mudata_with_donor_matching.h5mu data - pass + rna_data = sc.read_10x_mtx(args.read_rna_mtx) + hto_data = sc.read_10x_mtx(args.read_hto_mtx, gex_only=False) + assignment_dir = os.path.join(args.assignment, + [filename for filename in os.listdir(args.assignment) if filename == "all_assignment_after_match.csv"][0]) + + assignment = pd.read_csv(assignment_dir, index_col = 0) + mudata = MuData({"rna": rna_data, "hto": hto_data }) + + mudata['rna'].obs = mudata['rna'].obs.merge(args.assignment, left_index=True, right_index=True, how='left') + mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) + mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") + mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata.update() + mudata.write("mudata_with_donor_matching.h5mu") From 1588feb36af4ee44d25ff7781dce590337a53637 Mon Sep 17 00:00:00 2001 From: zethson Date: Fri, 17 Nov 2023 17:10:54 +0100 Subject: [PATCH 08/32] Add notebook to sphinx Signed-off-by: zethson --- docs/requirements.txt | 1 + docs/source/conf.py | 6 ++++-- notebook.ipynb => docs/source/hadge_output.ipynb | 9 ++++++++- 3 files changed, 13 insertions(+), 3 deletions(-) rename notebook.ipynb => docs/source/hadge_output.ipynb (99%) diff --git a/docs/requirements.txt b/docs/requirements.txt index db72b7b..92e893c 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ sphinx +nb-sphinx myst-parser furo diff --git a/docs/source/conf.py b/docs/source/conf.py index 779af44..e314080 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -3,7 +3,8 @@ author = 'Fabiola Curion, Xichen Wu, Lukas Heumos' release = '1.0.0' -extensions = ['myst_parser'] +extensions = ['myst_parser', + 'nbsphinx'] source_suffix = { '.rst': 'restructuredtext', @@ -16,5 +17,6 @@ exclude_patterns = [] html_theme = "furo" +html_static_path = ['_static'] -html_static_path = ['_static'] \ No newline at end of file +nbsphinx_execute = 'never' diff --git a/notebook.ipynb b/docs/source/hadge_output.ipynb similarity index 99% rename from notebook.ipynb rename to docs/source/hadge_output.ipynb index ecd6e92..ac881e7 100644 --- a/notebook.ipynb +++ b/docs/source/hadge_output.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working with hadge output" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -16,7 +23,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook serves to get familiar with the ouput of hadge and reproduce some figures from the manuscript." + "This notebook serves to get familiar with the output of hadge and reproduce some figures from the manuscript." ] }, { From 0f99fd89d789eafdcc334f856f2e3e6ee7dcbc34 Mon Sep 17 00:00:00 2001 From: zethson Date: Fri, 17 Nov 2023 17:11:52 +0100 Subject: [PATCH 09/32] Fix nbsphinx requirements Signed-off-by: zethson --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 92e893c..2b919fd 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ sphinx -nb-sphinx +nbsphinx myst-parser furo From 45c5db7a93148cf4e433657d96a58e1f509b7377 Mon Sep 17 00:00:00 2001 From: zethson Date: Fri, 17 Nov 2023 17:19:41 +0100 Subject: [PATCH 10/32] Pandoc pls Signed-off-by: zethson --- docs/source/conf.py | 2 +- docs/source/index.md | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e314080..5678883 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -14,7 +14,7 @@ templates_path = ['_templates'] -exclude_patterns = [] +exclude_patterns = ['_build', '**.ipynb_checkpoints'] html_theme = "furo" html_static_path = ['_static'] diff --git a/docs/source/index.md b/docs/source/index.md index fb0656e..6d74528 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -56,7 +56,7 @@ nextflow run main.nf -profile test ## Notebook -Check[](../../notebook.ipynb) to get familiar with the output of hadge. +Check[](hadge_output.ipynb) to get familiar with the output of hadge. ## **Pipeline output** @@ -98,6 +98,7 @@ general genetic hashing rescue +hadge_output ``` # Indices and tables From 9bcebac0931c1a76e529e23a56a19f3e0b49e1c9 Mon Sep 17 00:00:00 2001 From: zethson Date: Fri, 17 Nov 2023 17:21:31 +0100 Subject: [PATCH 11/32] Remove docs wf Signed-off-by: zethson --- .github/workflows/build_docs.yml | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 .github/workflows/build_docs.yml diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml deleted file mode 100644 index e0c9ef6..0000000 --- a/.github/workflows/build_docs.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: sphinx-docs - -on: - push: - pull_request: - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.11" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r docs/requirements.txt - - - name: Build documentation - run: cd docs && make html From ce91ab36582a5216006c8745ead4b91f41e21fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 20 Nov 2023 20:02:34 +0100 Subject: [PATCH 12/32] Docs for hashing --- docs/source/hashing.md | 74 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/docs/source/hashing.md b/docs/source/hashing.md index af91a13..d77253f 100644 --- a/docs/source/hashing.md +++ b/docs/source/hashing.md @@ -16,7 +16,11 @@ The input data depends heavily on the deconvolution tools. In the following tabl | Multiseq | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_multiseq`
`params.hto_matrix_multiseq` | | HashSolo | - 10x mtx directory with hashing count matrix (H5) | `params.hto_matrix_hashsolo`
`params.rna_matrix_hashsolo` | | HashedDrops | - 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_hashedDrops` | -| Demuxem | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxem`
`params.rna_matrix_demuxem` | +| Demuxem | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxem`
`params.rna_matrix_demuxem` | +| GMM - Demux | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_gmm_demux` | +| Demuxmix | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxmix`
`params.rna_matrix_demuxmix` | +| BFF | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_bff` | + Similary as genotype-based deconvlution methods, hashing methods also have some input in common. So we also try to utilize common input parameters `params.[rna/hto]_matrix_[raw/filtered]` to store count matrices for better control and `params.[rna/hto]_matrix_[method]` is used to specify whether to use raw or filtered counts for each method. @@ -279,3 +283,71 @@ output directory: `$pipeline_output_folder/bff/bff_[task_ID/sampleId]` | combinations | An integer matrix specifying valid combinations of HTOs. Each row corresponds to a single sample and specifies the indices of rows in x corresponding to the HTOs used to label that sample. Default: NULL | | objectOutHashedDrops | Prefix of the hashedDrops output RDS object. Default: hashedDrops | | assignmentOutHashedDrops | Prefix of the hashedDrops output CSV file. Default: hashedDrops | +### GMM-Demux +| | | +| ------------------------ | -------------------------------------------------------------------------------------------- | +| gmmDemux | Whether to perform GMMDemux. Default: True | +| hto_matrix_gmm_demux | Whether to use raw or filtered HTO count matrix. Default: filtered | +| assignmentOutGmmDemux | Name for the folder output. Default: gmm_demux| +| hto_name_gmm | list of sample tags (HTOs) separated by ',' without whitespace. Default: None | +| summary | the estimated total count of cells in the single cell assay. Default: 2000 | +| report_gmm | Name for the file generated by the summary. Default:report.txt | +| mode_GMM | Format of the input, either tsv or csv. Default: tsv | +| extract | extract names of the sample barcoding tag(s) to extract, separated by ','. Joint tags are linked with '+'. Default: None | +| threshold_gmm | Provide the confidence threshold value. Requires a float in (0,1). Default: 0.8 | +| ambiguous | The estimated chance of having a phony GEM getting included in a pure type GEM cluster by the clustering algorithm. Default: 0.5. | +| plotOutHashSolo | Prefix of the output figures. Default: hashsolo | + +### BFF +| | | +| ------------------------ | -------------------------------------------------------------------------------------------- | +| BFF | Whether to perform BFF. Default: False | +| hto_matrix_bff | Whether to use raw or filtered HTO count matrix. Default: raw | +| rna_matrix_bff | Whether to use raw or filtered scRNA-seq count matrix. Default: raw | +| assignmentOutBff | Name for the folder output. Default: bff| +| methods | method or list of methods to be used. Default: combined_bff | +| methodsForConsensus | a consensus call will be generated using all methods especified. Default: NULL | +| cellbarcodeWhitelist | A vector of expected cell barcodes. Default:NULL | +| metricsFile | summary metrics will be written to this file. Default: metrics_bff.cvs | +| doTSNE | tSNE will be run on the resulting hashing calls after each caller. Default: True | +| doHeatmap | if true, Seurat::HTOHeatmap will be run on the results of each calle Default: True | +| perCellSaturation | An optional dataframe with the columns cellbarcode and saturation. Default: NULL | +| majorityConsensusThreshold | This applies to calculating a consensus call when multiple algorithms are used. Default: NULL | +| chemistry | This string is passed to EstimateMultipletRate. Should be either 10xV2 or 10xV3. Default: 10xV3 | +| callerDisagreementThreshold | If provided, the agreement rate will be calculated between each caller and the simple majority call, ignoring discordant and no-call cells. Default: NULL | +| preprocess_bff | When True, the data is preprocess using the method ProcessCountMatrix from CellHashR. Default: False | +| barcodeWhitelist | A vector of barcode names to retain. This parameter is used only when the pre-processing step is executed. Default: NULL | + +### Demuxmix +| | | +| ------------------------ | -------------------------------------------------------------------------------------------- | +| demuxmix | Whether to perform Demuxmix. Default: False | +| hto_matrix_demuxmix | Whether to use raw or filtered HTO count matrix. Default: raw | +| rna_matrix_demuxmix | Whether to use raw or filtered scRNA-seq count matrix. Default: raw | +| assignmentOutDemuxmix | Name for the folder output. Default: demuxmix| +| correctTails | If TRUE, droplets meeting the threshold defined by alpha (beta) are classified as "negative" ("positive") even if the mixture model suggests a different classification. Default: True | +| rna_available | If TRUE, it indicates that RNA data is available and some modes are execuatble with Demuxmix. Default: TRUE | +| model | A character specifying the type of mixture model to be used. Either "naive", "regpos", "reg" or "auto". The last three options require parameter rna to be specified. Default:naive | +| alpha_demuxmix | Threshold defining the left tail of the mixture distribution where droplets should not be classified as "positive". Threshold must be between 0 and 1 Default: 0.9 | +| beta_demuxmix | Threshold for defining the right tail of the mixture distribution where droplets should not be classified as "negative" Default: 0.9 | +| tol_demuxmix | Convergence criterion for the EM algorithm used to fit the mixture models. Default: 0.00001 | +| maxIter_demuxmix | Maximum number of iterations for the EM algorithm and for the alternating iteration process fitting the NB regression models Default:1000 | +| k_hto |Factor to define outliers in the HTO count Default: 1.5 | +| k_rna |Factor to define outliers in the numbers of detected genes. Default: 1.5 | + +### General Use + +#### Single sample use +The use of the pipeline for a single samples require the definition of certain parameters in order to run the tools under default configuration. +The parameter `--mode hashing` must be included with the purpose of running the hashing tools only. +##### GMM-Demux +The names of the hashtags must be given as a list of string, separated by ','. This list is given under the parameter `--hto_name_gmm` +##### BFF +The demultiplexing method for the experiment must be given under the parameter `--methods`. Multiple methods can be given as a list, separated by ','. +Besides, the method or methods for consensus must be given under the parameter `--methodsForConsensus`. + + +```bash +nextflow run main.nf --mode hashing --match_donor False --hto_matrix_raw /data_folder/raw_hto_data +--hto_matrix_filtered /data_folder/filtered_hto_data --barcodes /data_folder/filtered_hto_data/barcodes.tsv.gz --rna_matrix_raw /data_folder/raw_rna_data --rna_matrix_filtered /data_folder/filtered_rna_data --hto_name_gmm "hto_name_1,hto_name_2,hto_name_3" --methods bff_cluster --methodsForConsensus bff_cluster --demuxmix False +``` \ No newline at end of file From 1d4f0bcdf041268fa1d6f9a6c55006c6b100b501 Mon Sep 17 00:00:00 2001 From: Lukas Heumos Date: Thu, 23 Nov 2023 18:59:48 +0100 Subject: [PATCH 13/32] Add quickstart to README --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index dbb0c56..6b3c23d 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,14 @@ Read the [documentation](https://hadge.readthedocs.io/en/latest/) and the associated [manuscript](https://www.biorxiv.org/content/10.1101/2023.07.23.550061v1) +## Quickstart + +```bash +git clone https://github.com/theislab/hadge.git +sh hadge/test_data/download_data.sh +nextflow run main.nf -profile test +``` + ## Citation hadge: a comprehensive pipeline for donor deconvolution in single cell From f4e5b27367c291f249b502e93afd37b358141f95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 27 Nov 2023 15:02:52 +0100 Subject: [PATCH 14/32] demuxmix out --- bin/demuxmix.R | 104 ------------------------ bin/summary_hash.py | 65 --------------- docs/source/hashing.md | 25 +----- docs/source/index.md | 3 +- modules/hash_demulti/demuxmix.nf | 68 ---------------- modules/hash_demultiplexing.nf | 25 +----- modules/single/hash_demulti/demuxmix.nf | 79 ------------------ modules/single/hash_demultiplexing.nf | 19 +---- nextflow.config | 18 ---- 9 files changed, 6 insertions(+), 400 deletions(-) delete mode 100755 bin/demuxmix.R delete mode 100644 modules/hash_demulti/demuxmix.nf delete mode 100644 modules/single/hash_demulti/demuxmix.nf diff --git a/bin/demuxmix.R b/bin/demuxmix.R deleted file mode 100755 index 6552507..0000000 --- a/bin/demuxmix.R +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env Rscript -library(Seurat) -library(demuxmix) -library(argparse) -library(data.table) - - -# Create a parser -parser <- ArgumentParser("Parameters for Demuxmix") -parser$add_argument("--fileUmi", help = "Path to file UMI count matrix.") -parser$add_argument("--fileHto", help = "Path to file HTO count matrix.") -parser$add_argument("--ndelim", help = "For the initial identity calss for each cell, delimiter for the cell's column name", default = "_") -parser$add_argument("--pAcpt", help='Acceptance probability that must be reached in order to assign a droplet to a hashtag. ') -parser$add_argument("--assay", help='Assay name') -parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or features.tsv to use for gene names; default is 2", type="integer", default = 2) -parser$add_argument("--rna_available", help='TRUE if RNA assay is available',default = FALSE) -parser$add_argument("--correctTails", help='If TRUE, droplets meeting the threshold defined by alpha (beta) are classified as "negative" ("positive") even if the mixture model suggests a different classification',default = TRUE) -parser$add_argument("--model", help='A character specifying the type of mixture model to be used. Either "naive", "regpos", "reg" or "auto".', default = 'naive') -parser$add_argument("--alpha_demuxmix", help='Threshold defining the left tail of the mixture distribution where droplets should not be classified as "positive".',type = "double",default=0.9) -parser$add_argument("--beta_demuxmix", help='Threshold for defining the right tail of the mixture distribution where droplets should not be classified as "negative".',type = "double",default=0.9) -parser$add_argument("--tol_demuxmix", help='Convergence criterion for the EM algorithm used to fit the mixture models.', type="double") -parser$add_argument("--maxIter_demuxmix", help='Maximum number of iterations for the EM algorithm',type = "integer",default=100) -parser$add_argument("--k_hto", help='Factor to define outliers in the HTO counts.',type = "double",default=1.5) -parser$add_argument("--k_rna", help='Factor to define outliers in the numbers of detected genes.',type = "double",default=1.5) -parser$add_argument("--assignmentOutDemuxmix", help="Prefix name for the file containing the output of Demuxmix assignment", type = "character", default = "demuxmix") -parser$add_argument("--outputdir", help='Output directory') -args <- parser$parse_args() - - -if(as.logical(args$rna_available)){ - - umi <- Read10X(data.dir = args$fileUmi, gene.column=args$gene_col, strip.suffix = TRUE) - -} -counts <- Read10X(data.dir = args$fileHto, gene.column=args$gene_col, strip.suffix = TRUE) - - -if(as.logical(args$rna_available)){ - Argument <- c("fileUmi", "fileHto") - Value <- c(args$fileUmi, args$fileHto) - params <- data.frame(Argument, Value) - #demuxmix requires a Seurat object created from raw data, - #without any type of pre-processing (Normalisation, ScaleData, etc) - # Setup Seurat object - hashtag <- CreateSeuratObject(counts = umi, names.delim = args$ndelim) - hashtag[[args$assay]] <- CreateAssayObject(counts = counts) - }else{ - #If RNA data is not available, a Seurat object containing the HTO data is created - hashtag <- CreateSeuratObject(counts = counts, names.delim = args$ndelim, assay = args$assay) - Argument <- c( "fileHto") - Value <- c(args$fileHto) - params <- data.frame(Argument, Value) - } - - -Argument <- c("fileUmi", "fileHto","assay", "model", "alpha_demuxmix", "beta_demuxmix", "tol_demuxmix", "maxIter_demuxmix", "k_hto","k_rna","correct_tails") -Value <- c(args$fileUmi, args$fileHto, args$assay, args$model, args$alpha_demuxmix, args$beta_demuxmix, args$tol_demuxmix, args$maxIter_demuxmix, args$k_hto, args$k_rna,args$correctTails) - -params <- data.frame(Argument, Value) - - -### Demuxmix -hto_counts <- as.matrix(GetAssayData(hashtag[[args$assay]], slot = "counts")) -hashtags_object <- GetAssayData(hashtag[[args$assay]], slot = "counts") -hashtag_list<-hashtags_object@Dimnames - -tryCatch({ -#Demultiplexing process - if(args$model != 'naive' && as.logical(args$rna_available)){ - rna_counts <- hashtag$nCount_RNA - demuxmix_demul <- demuxmix(hto_counts,rna= rna_counts, model = args$model, alpha= args$alpha_demuxmix,beta= args$beta_demuxmix,maxIter=args$maxIter_demuxmix,k.hto=args$k_hto, correctTails = as.logical(args$correctTails)) - - }else{ - print("Executing naive mode for Demuxmix") - demuxmix_demul <- demuxmix(hto_counts, model = args$model, alpha= args$alpha_demuxmix,beta= args$beta_demuxmix,maxIter=args$maxIter_demuxmix,k.hto=args$k_hto) - } - demuxmix_classify <- dmmClassify(demuxmix_demul) - sumary_demuxmix <- summary(demuxmix_demul) - demuxmix_classify$Barcode <- hashtag_list[[2]] - - res_dt <- as.data.table(demuxmix_classify) - res_dt[, Classification := Type] - res_dt[Classification == "multiplet", Classification := "doublet"] - res_dt[Classification == "uncertain", Classification := "negative"] - - res_dt$HTO <- gsub(',', '_', res_dt$HTO) - write.csv(res_dt, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_assignment_demuxmix.csv"), row.names=FALSE) - write.csv(sumary_demuxmix, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_summary_results.csv"), row.names=FALSE) - -}, error = function(e) { - print("Error for Demuxmix") - error_message <- conditionMessage(e) - writeLines(e, "error_log.txt") - df <- data.frame() - write.csv(df, paste0(args$outputdir, "/", args$assignmentOutDemuxmix, "_assignment_demuxmix.csv"), row.names=FALSE) -},finally = { - write.csv(params, paste0(args$outputdir, "/params.csv")) -}) - - - - - - diff --git a/bin/summary_hash.py b/bin/summary_hash.py index bdf8582..f68f05e 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -12,7 +12,6 @@ parser.add_argument("--multiseq", help="Folder containing output files of multiseq", default=None) parser.add_argument("--hashsolo", help="Folder containing output files of hashsolo", default=None) parser.add_argument("--hashedDrops", help="Folder containing output files of hashedDrops", default=None) -parser.add_argument("--demuxmix", help="Folder containing output files of Demuxmix", default=None) parser.add_argument("--bff", help="Folder containing output files of BFF", default=None) parser.add_argument("--gmm_demux", help="Folder containing output files of GMM-Demux", default=None) parser.add_argument("--generate_anndata", help="Generate anndata", action='store_true') @@ -291,66 +290,6 @@ def htodemux_summary(htodemux_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("hash_summary/htodemux_params.csv") - -def demuxmix_summary(demuxmix_res,raw_adata, raw_mudata): - classi = [] - assign = [] - params = [] - files_in_folder = [item for item in demuxmix_res if os.path.isfile(os.path.join(demuxmix_res, item))] - - if len(files_in_folder) > 0: - for x in demuxmix_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_assignment_demuxmix.csv")][0]) - demuxmix_asign = pd.read_csv(obs_res_dir) - if demuxmix_asign.empty: - #no results create empty dataframe for empty col - column_names = ['Barcode', os.path.basename(x)] - # Create an empty dataframe with only column names - df = pd.DataFrame(columns=column_names) - classi.append(df) - assign.append(df) - else: - dt_assign = demuxmix_asign[["Barcode", "HTO"]] - dt_assign.columns = ["Barcode", os.path.basename(x)] - assign.append(dt_assign) - - if raw_adata is not None: - adata = raw_adata.copy() - adata.obs = adata.obs.merge(demuxmix_asign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) - adata.obs.donor = adata.obs.donor.fillna("negative") - adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") - - if raw_mudata is not None: - mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(demuxmix_asign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) - mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") - - demuxmix_classi = pd.read_csv(obs_res_dir) - dt_classi = demuxmix_classi[["Barcode", "Classification"]] - dt_classi.columns = ["Barcode", os.path.basename(x)] - classi.append(dt_classi) - - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) - params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) - params_res.columns = [os.path.basename(x)] - params.append(params_res) - - classi_df = pd.concat(classi, axis=1, join="outer") - classi_df.to_csv("hash_summary" + "/demuxmix_classification.csv",index=False) - - assign_df = pd.concat(assign, axis=1, join="outer") - assign_df.to_csv("hash_summary" +"/demuxmix_assignment.csv",index=False) - - params = pd.concat(params, axis=1) - params.to_csv("hash_summary" +"/demuxmix_params.csv",index=False) - else: - print("No results found for Demuxmix") def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): classi = [] @@ -535,10 +474,6 @@ def bff_summary(bff_res,raw_adata, raw_mudata): if args.htodemux is not None: htodemux_res = args.htodemux.split(':') htodemux_summary(htodemux_res, adata, mudata) - - if args.demuxmix is not None: - demuxmix_res = args.demuxmix.split(':') - demuxmix_summary(demuxmix_res, adata, mudata) if args.gmm_demux is not None: gmmDemux_res = args.gmm_demux.split(':') diff --git a/docs/source/hashing.md b/docs/source/hashing.md index d77253f..59c1dd5 100644 --- a/docs/source/hashing.md +++ b/docs/source/hashing.md @@ -18,7 +18,6 @@ The input data depends heavily on the deconvolution tools. In the following tabl | HashedDrops | - 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_hashedDrops` | | Demuxem | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxem`
`params.rna_matrix_demuxem` | | GMM - Demux | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_gmm_demux` | -| Demuxmix | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxmix`
`params.rna_matrix_demuxmix` | | BFF | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_bff` | @@ -111,12 +110,6 @@ output directory: `$pipeline_output_folder/hashedDrops/hashedDrops_[task_ID/samp - `${params.objectOutHashedDrops}_LogFC.png`: a diagnostic plot comparing the log-fold change between the second HTO's abundance and the ambient contamination - `params.csv`: specified parameters in the HashedDrops task -### Demuxmix - -output directory: `$pipeline_output_folder/demuxmix/demuxmix_[task_ID/sampleId]` - -- `${params.assignmentOutDemuxmix}_assignment_demuxmix.csv`: the assignment and classification results produced by Demuxmix -- `params.csv`: specified parameters in the Demuxmix task ### GMM-Demux @@ -318,22 +311,6 @@ output directory: `$pipeline_output_folder/bff/bff_[task_ID/sampleId]` | preprocess_bff | When True, the data is preprocess using the method ProcessCountMatrix from CellHashR. Default: False | | barcodeWhitelist | A vector of barcode names to retain. This parameter is used only when the pre-processing step is executed. Default: NULL | -### Demuxmix -| | | -| ------------------------ | -------------------------------------------------------------------------------------------- | -| demuxmix | Whether to perform Demuxmix. Default: False | -| hto_matrix_demuxmix | Whether to use raw or filtered HTO count matrix. Default: raw | -| rna_matrix_demuxmix | Whether to use raw or filtered scRNA-seq count matrix. Default: raw | -| assignmentOutDemuxmix | Name for the folder output. Default: demuxmix| -| correctTails | If TRUE, droplets meeting the threshold defined by alpha (beta) are classified as "negative" ("positive") even if the mixture model suggests a different classification. Default: True | -| rna_available | If TRUE, it indicates that RNA data is available and some modes are execuatble with Demuxmix. Default: TRUE | -| model | A character specifying the type of mixture model to be used. Either "naive", "regpos", "reg" or "auto". The last three options require parameter rna to be specified. Default:naive | -| alpha_demuxmix | Threshold defining the left tail of the mixture distribution where droplets should not be classified as "positive". Threshold must be between 0 and 1 Default: 0.9 | -| beta_demuxmix | Threshold for defining the right tail of the mixture distribution where droplets should not be classified as "negative" Default: 0.9 | -| tol_demuxmix | Convergence criterion for the EM algorithm used to fit the mixture models. Default: 0.00001 | -| maxIter_demuxmix | Maximum number of iterations for the EM algorithm and for the alternating iteration process fitting the NB regression models Default:1000 | -| k_hto |Factor to define outliers in the HTO count Default: 1.5 | -| k_rna |Factor to define outliers in the numbers of detected genes. Default: 1.5 | ### General Use @@ -349,5 +326,5 @@ Besides, the method or methods for consensus must be given under the parameter ` ```bash nextflow run main.nf --mode hashing --match_donor False --hto_matrix_raw /data_folder/raw_hto_data ---hto_matrix_filtered /data_folder/filtered_hto_data --barcodes /data_folder/filtered_hto_data/barcodes.tsv.gz --rna_matrix_raw /data_folder/raw_rna_data --rna_matrix_filtered /data_folder/filtered_rna_data --hto_name_gmm "hto_name_1,hto_name_2,hto_name_3" --methods bff_cluster --methodsForConsensus bff_cluster --demuxmix False +--hto_matrix_filtered /data_folder/filtered_hto_data --barcodes /data_folder/filtered_hto_data/barcodes.tsv.gz --rna_matrix_raw /data_folder/raw_rna_data --rna_matrix_filtered /data_folder/filtered_rna_data --hto_name_gmm "hto_name_1,hto_name_2,hto_name_3" --methods bff_cluster --methodsForConsensus bff_cluster ``` \ No newline at end of file diff --git a/docs/source/index.md b/docs/source/index.md index cb4f111..dc70159 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -16,14 +16,13 @@ The genetics-based deconvolution workflow includes 5 methods: - Souporcell - scSplit -The hashing-based deconvolution includes 8 methods: +The hashing-based deconvolution includes 7 methods: - hashedDrops - Multiseq - HTODemux - Demuxem - HashSolo -- Demuxmix - BFF - GMM_Demux diff --git a/modules/hash_demulti/demuxmix.nf b/modules/hash_demulti/demuxmix.nf deleted file mode 100644 index 6f93482..0000000 --- a/modules/hash_demulti/demuxmix.nf +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process demuxmix{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/demuxmix", mode:'copy' - label 'small_mem' - - input: - tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxmix}"), - path(raw_rna_matrix_dir, stageAs: "rna_data_${params.rna_matrix_demuxmix}") - val hto_raw_or_filtered - val rna_raw_or_filtered - val rna_available - val assay - val ndelim - val model - val alpha_demuxmix - val beta_demuxmix - val tol_demuxmix - val maxIter_demuxmix - val k_hto - val k_rna - val correctTails - val assignmentOutDemuxmix - val gene_col - - output: - path "demuxmix_${sampleId}" - - script: - """ - mkdir demuxmix_${sampleId} - - demuxmix.R --fileUmi rna_data_${params.rna_matrix_demuxmix} --fileHto hto_data_${params.hto_matrix_demuxmix} --rna_available $rna_available --assay $assay \ - --ndelim $ndelim --model $model --alpha_demuxmix $alpha_demuxmix --beta_demuxmix $beta_demuxmix --tol_demuxmix $tol_demuxmix \ - --maxIter_demuxmix $maxIter_demuxmix --correctTails $correctTails --k_hto $k_hto --k_rna $k_rna --outputdir demuxmix_${sampleId} \ - --assignmentOutDemuxmix $assignmentOutDemuxmix --gene_col $gene_col - """ - -} - -workflow demuxmix_hashing{ - take: - input_list - main: - assay = params.assay - ndelim = params.ndelim - model = params.model - alpha_demuxmix = params.alpha_demuxmix - beta_demuxmix = params.beta_demuxmix - tol_demuxmix = params.tol_demuxmix - maxIter_demuxmix = params.maxIter_demuxmix - k_hto = params.k_hto - k_rna = params.k_rna - correctTails = params.correctTails - assignmentOutDemuxmix = params.assignmentOutDemuxmix - gene_col = params.gene_col - - - demuxmix(input_list, assay,ndelim,model,alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna,correctTails,assignmentOutDemuxmix, gene_col) - - emit: - demuxmix.out.collect() -} - -workflow{ - demuxmix_hashing() -} diff --git a/modules/hash_demultiplexing.nf b/modules/hash_demultiplexing.nf index 4fb87a6..eb95f5e 100644 --- a/modules/hash_demultiplexing.nf +++ b/modules/hash_demultiplexing.nf @@ -7,7 +7,6 @@ include { htodemux_hashing } from './hash_demulti/htodemux' include { hash_solo_hashing } from './hash_demulti/hashsolo' include { hashedDrops_hashing } from './hash_demulti/hashedDrops' include { demuxem_hashing } from './hash_demulti/demuxem' -include { demuxmix_hashing } from './hash_demulti/demuxmix' include { gmm_demux_hashing } from './hash_demulti/gmm_demux' include { bff_hashing } from './hash_demulti/bff' @@ -22,7 +21,6 @@ process summary{ val htodemux_result val multiseq_result val hashedDrops_result - val demuxmix_result val bff_result val gmmDemux_result val generate_anndata @@ -32,7 +30,6 @@ process summary{ tuple val(sampleId), path("hash_summary") script: - def demuxem_files = "" def htodemux_files = "" def hashsolo_files = "" def multiseq_files = "" @@ -43,10 +40,6 @@ process summary{ def generate_adata = "" def generate_mdata = "" - if (demuxem_result != "no_result"){ - demuxem_res = demuxem_result.find{it.name.contains(sampleId)} - demuxem_files = "--demuxem ${demuxem_res}" - } if (hashsolo_result != "no_result"){ hashsolo_res = hashsolo_result.find{it.name.contains(sampleId)} hashsolo_files = "--hashsolo ${hashsolo_res}" @@ -92,7 +85,7 @@ process summary{ } """ - summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId + summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId """ } @@ -172,20 +165,6 @@ workflow hash_demultiplexing{ } - if (params.demuxmix == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_demuxmix == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_demuxmix == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered, - params.hto_matrix_demuxmix,params.rna_matrix_demuxmix)} - |set {input_list_demuxmix} - demuxmix_hashing (input_list_demuxmix,params.rna_available) - demuxmix_out = demuxmix_hashing.out - } - else{ - demuxmix_out = channel.value("no_result") - } - if (params.bff == "True"){ Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ @@ -215,7 +194,7 @@ workflow hash_demultiplexing{ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered)} | set {input_list_summary} - summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,demuxmix_out,bff_out,gmmDemux_out, params.generate_anndata, params.generate_mudata) + summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,bff_out,gmmDemux_out, params.generate_anndata, params.generate_mudata) emit: summary.out diff --git a/modules/single/hash_demulti/demuxmix.nf b/modules/single/hash_demulti/demuxmix.nf deleted file mode 100644 index 021b887..0000000 --- a/modules/single/hash_demulti/demuxmix.nf +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process demuxmix{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/demuxmix", mode:'copy' - label 'small_mem' - input: - path hto_matrix, stageAs: 'hto_data' - path umi_matrix, stageAs: 'rna_data' - val hto_raw_or_filtered - val rna_raw_or_filtered - each rna_available - each assay - val ndelim - each model - each alpha_demuxmix - each beta_demuxmix - each tol_demuxmix - each maxIter_demuxmix - each k_hto - each k_rna - each correctTails - each assignmentOutDemuxmix - each gene_col - - output: - path "demuxmix_${task.index}" - - script: - - """ - mkdir demuxmix_${task.index} - demuxmix.R --fileUmi rna_data --fileHto hto_data --rna_available $rna_available --assay $assay --ndelim $ndelim --model $model --alpha_demuxmix $alpha_demuxmix \ - --beta_demuxmix $beta_demuxmix --tol_demuxmix $tol_demuxmix --maxIter_demuxmix $maxIter_demuxmix --correctTails $correctTails \ - --k_hto $k_hto --k_rna $k_rna --outputdir demuxmix_${task.index} --assignmentOutDemuxmix $assignmentOutDemuxmix --gene_col $gene_col - """ - -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow demuxmix_hashing{ - take: - hto_matrix - rna_matrix - hto_raw_or_filtered - rna_raw_or_filtered - rna_available - main: - assay = split_input(params.assay) - ndelim = params.ndelim - model = split_input(params.model) - alpha_demuxmix = split_input(params.alpha_demuxmix) - beta_demuxmix = split_input(params.beta_demuxmix) - tol_demuxmix = split_input(params.tol_demuxmix) - maxIter_demuxmix = split_input(params.maxIter_demuxmix) - k_hto = split_input(params.k_hto) - k_rna = split_input(params.k_rna) - correctTails = split_input(params.correctTails) - assignmentOutDemuxmix = split_input(params.assignmentOutDemuxmix) - gene_col = split_input(params.gene_col) - - - demuxmix(hto_matrix,rna_matrix,hto_raw_or_filtered,rna_raw_or_filtered,rna_available, assay,ndelim,model, alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna,correctTails,assignmentOutDemuxmix,gene_col ) - - emit: - demuxmix.out.collect() -} - -workflow{ - demuxmix_hashing() -} diff --git a/modules/single/hash_demultiplexing.nf b/modules/single/hash_demultiplexing.nf index 0a011a9..3bea0fa 100644 --- a/modules/single/hash_demultiplexing.nf +++ b/modules/single/hash_demultiplexing.nf @@ -7,7 +7,6 @@ include { htodemux_hashing } from './hash_demulti/htodemux' include { hash_solo_hashing } from './hash_demulti/hashsolo' include { hashedDrops_hashing } from './hash_demulti/hashedDrops' include { demuxem_hashing } from './hash_demulti/demuxem' -include { demuxmix_hashing } from './hash_demulti/demuxmix' include { gmm_demux_hashing } from './hash_demulti/gmm_demux' include { bff_hashing } from './hash_demulti/bff' @@ -22,7 +21,6 @@ process summary{ val multiseq_result val hashedDrops_result val gmmDemux_result - val demuxmix_result val bff_result val generate_anndata val generate_mudata @@ -62,9 +60,6 @@ process summary{ if (gmmDemux_result != "no_result"){ gmmDemux_files = "--gmm_demux ${gmmDemux_result.join(":")}" } - if (demuxmix_result != "no_result"){ - demuxmix_files = "--demuxmix ${demuxmix_result.join(":")}" - } if (bff_result != "no_result"){ bff_files = "--bff ${bff_result.join(":")}" } @@ -85,7 +80,7 @@ process summary{ } """ - summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata + summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata """ } @@ -157,16 +152,6 @@ workflow hash_demultiplexing{ else{ hashedDrops_out = channel.value("no_result") } - if (params.demuxmix == "True"){ - demuxmix_rna_input = params.rna_available == "False" ? channel.value("None") : - (params.rna_matrix_demuxmix == "raw" ? hto_matrix_raw : hto_matrix_filtered) - demuxmix_hto_input = params.hto_matrix_demuxmix == "raw" ? rna_matrix_raw : rna_matrix_filtered - demuxmix_hashing(demuxmix_hto_input,demuxmix_rna_input,params.hto_matrix_demuxmix, params.rna_matrix_demuxmix,params.rna_available) - demuxmix_out = demuxmix_hashing.out - } - else{ - demuxmix_out = channel.value("no_result") - } if (params.bff == "True"){ bff_hto_input = params.hto_matrix_bff == "raw" ? hto_matrix_raw : hto_matrix_filtered bff_hashing(bff_hto_input) @@ -185,7 +170,7 @@ workflow hash_demultiplexing{ } - summary(demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,gmmDemux_out, demuxmix_out,bff_out, + summary(demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,gmmDemux_out,bff_out, params.generate_anndata, params.generate_mudata, rna_matrix_filtered, hto_matrix_filtered) emit: summary.out diff --git a/nextflow.config b/nextflow.config index bd056af..9ac0bbf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -137,21 +137,6 @@ params { threshold_gmm = 0.8 ambiguous = 0.05 - //demuxmix - demuxmix = "False" - rna_matrix_demuxmix = "raw" - hto_matrix_demuxmix = "raw" - assignmentOutDemuxmix = "demuxmix" - correctTails = "TRUE" - rna_available = 'TRUE' - model = 'naive' - alpha_demuxmix = 0.9 - beta_demuxmix = 0.9 - tol_demuxmix = 0.00001 - maxIter_demuxmix = 1000 - k_hto = 1.5 - k_rna = 1.5 - //bff bff = "False" rna_matrix_bff = "raw" @@ -444,9 +429,6 @@ process { withName:gmm_demux { conda = "$projectDir/conda/gmm_demux.yml" } - withName:demuxmix { - conda = "$projectDir/conda/demuxmix.yml" - } withName:bff { conda = "$projectDir/conda/dropletutils.yml" } From e67a52f60e82ffb0800bfd493c1680eb7bf81538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 27 Nov 2023 15:08:53 +0100 Subject: [PATCH 15/32] demuxmix env deleted --- conda/demuxmix.yml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 conda/demuxmix.yml diff --git a/conda/demuxmix.yml b/conda/demuxmix.yml deleted file mode 100644 index 8556ed1..0000000 --- a/conda/demuxmix.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: demuxmix -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::r-base>=4.2 - - bioconductor-demuxmix - - r-seurat - - r-argparse - - r-data.table From c33b1c74fe6247d50fdb329d9a2101a82a9503d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 27 Nov 2023 22:27:59 +0100 Subject: [PATCH 16/32] deleting demuxmix parts --- modules/hash_demulti/demuxmix.nf | 70 --------------------------- modules/hash_demultiplexing.nf | 11 ++--- modules/single/hash_demultiplexing.nf | 1 - 3 files changed, 5 insertions(+), 77 deletions(-) delete mode 100644 modules/hash_demulti/demuxmix.nf diff --git a/modules/hash_demulti/demuxmix.nf b/modules/hash_demulti/demuxmix.nf deleted file mode 100644 index 5103a02..0000000 --- a/modules/hash_demulti/demuxmix.nf +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process demuxmix{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/demuxmix", mode:'copy' - label 'small_mem' - - conda "$projectDir/conda/demuxmix.yml" - - input: - tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxmix}"), - path(raw_rna_matrix_dir, stageAs: "rna_data_${params.rna_matrix_demuxmix}") - val hto_raw_or_filtered - val rna_raw_or_filtered - val rna_available - val assay - val ndelim - val model - val alpha_demuxmix - val beta_demuxmix - val tol_demuxmix - val maxIter_demuxmix - val k_hto - val k_rna - val correctTails - val assignmentOutDemuxmix - val gene_col - - output: - path "demuxmix_${sampleId}" - - script: - """ - mkdir demuxmix_${sampleId} - - demuxmix.R --fileUmi rna_data_${params.rna_matrix_demuxmix} --fileHto hto_data_${params.hto_matrix_demuxmix} --rna_available $rna_available --assay $assay \ - --ndelim $ndelim --model $model --alpha_demuxmix $alpha_demuxmix --beta_demuxmix $beta_demuxmix --tol_demuxmix $tol_demuxmix \ - --maxIter_demuxmix $maxIter_demuxmix --correctTails $correctTails --k_hto $k_hto --k_rna $k_rna --outputdir demuxmix_${sampleId} \ - --assignmentOutDemuxmix $assignmentOutDemuxmix --gene_col $gene_col - """ - -} - -workflow demuxmix_hashing{ - take: - input_list - main: - assay = params.assay - ndelim = params.ndelim - model = params.model - alpha_demuxmix = params.alpha_demuxmix - beta_demuxmix = params.beta_demuxmix - tol_demuxmix = params.tol_demuxmix - maxIter_demuxmix = params.maxIter_demuxmix - k_hto = params.k_hto - k_rna = params.k_rna - correctTails = params.correctTails - assignmentOutDemuxmix = params.assignmentOutDemuxmix - gene_col = params.gene_col - - - demuxmix(input_list, assay,ndelim,model,alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna,correctTails,assignmentOutDemuxmix, gene_col) - - emit: - demuxmix.out.collect() -} - -workflow{ - demuxmix_hashing() -} diff --git a/modules/hash_demultiplexing.nf b/modules/hash_demultiplexing.nf index 7c29cd7..ee51c75 100644 --- a/modules/hash_demultiplexing.nf +++ b/modules/hash_demultiplexing.nf @@ -37,11 +37,15 @@ process summary{ def multiseq_files = "" def hashedDrops_files = "" def gmmDemux_files = "" - def demuxmix_files = "" + def demuxem_files = "" def bff_files = "" def generate_adata = "" def generate_mdata = "" + if (demuxem_result != "no_result"){ + demuxem_res = demuxem_result.find{it.name.contains(sampleId)} + demuxem_files = "--demuxem ${demuxem_res}" + } if (hashsolo_result != "no_result"){ hashsolo_res = hashsolo_result.find{it.name.contains(sampleId)} hashsolo_files = "--hashsolo ${hashsolo_res}" @@ -62,10 +66,6 @@ process summary{ gmmDemux_res = gmmDemux_result.find{it.name.contains(sampleId)} gmmDemux_files = "--gmm_demux ${gmmDemux_res}" } - if (demuxmix_result != "no_result"){ - demuxmix_res = demuxmix_result.find{it.name.contains(sampleId)} - demuxmix_files = "--demuxmix ${demuxmix_res}" - } if (bff_result != "no_result"){ bff_res = bff_result.find{it.name.contains(sampleId)} bff_files = "--bff ${bff_res}" @@ -87,7 +87,6 @@ process summary{ } """ - summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $demuxmix_files $gmmDemux_files $bff_files $generate_adata $generate_mdata summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId """ } diff --git a/modules/single/hash_demultiplexing.nf b/modules/single/hash_demultiplexing.nf index 57dca67..fc3592a 100644 --- a/modules/single/hash_demultiplexing.nf +++ b/modules/single/hash_demultiplexing.nf @@ -39,7 +39,6 @@ process summary{ def multiseq_files = "" def hashedDrops_files = "" def gmmDemux_files = "" - def demuxmix_files = "" def bff_files = "" def generate_adata = "" def generate_mdata = "" From 20fd60f724f3bf982b848ed6f141ae90171657d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Tue, 28 Nov 2023 13:11:48 +0100 Subject: [PATCH 17/32] demuxmix module out - single --- modules/single/hash_demulti/demuxmix.nf | 82 ------------------------- 1 file changed, 82 deletions(-) delete mode 100644 modules/single/hash_demulti/demuxmix.nf diff --git a/modules/single/hash_demulti/demuxmix.nf b/modules/single/hash_demulti/demuxmix.nf deleted file mode 100644 index b687e97..0000000 --- a/modules/single/hash_demulti/demuxmix.nf +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process demuxmix{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/demuxmix", mode:'copy' - label 'small_mem' - - conda "$projectDir/conda/demuxmix.yml" - - input: - path hto_matrix, stageAs: 'hto_data' - path umi_matrix, stageAs: 'rna_data' - val hto_raw_or_filtered - val rna_raw_or_filtered - each rna_available - each assay - val ndelim - each model - each alpha_demuxmix - each beta_demuxmix - each tol_demuxmix - each maxIter_demuxmix - each k_hto - each k_rna - each correctTails - each assignmentOutDemuxmix - each gene_col - - output: - path "demuxmix_${task.index}" - - script: - - """ - mkdir demuxmix_${task.index} - demuxmix.R --fileUmi rna_data --fileHto hto_data --rna_available $rna_available --assay $assay --ndelim $ndelim --model $model --alpha_demuxmix $alpha_demuxmix \ - --beta_demuxmix $beta_demuxmix --tol_demuxmix $tol_demuxmix --maxIter_demuxmix $maxIter_demuxmix --correctTails $correctTails \ - --k_hto $k_hto --k_rna $k_rna --outputdir demuxmix_${task.index} --assignmentOutDemuxmix $assignmentOutDemuxmix --gene_col $gene_col - """ - -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow demuxmix_hashing{ - take: - hto_matrix - rna_matrix - hto_raw_or_filtered - rna_raw_or_filtered - rna_available - main: - assay = split_input(params.assay) - ndelim = params.ndelim - model = split_input(params.model) - alpha_demuxmix = split_input(params.alpha_demuxmix) - beta_demuxmix = split_input(params.beta_demuxmix) - tol_demuxmix = split_input(params.tol_demuxmix) - maxIter_demuxmix = split_input(params.maxIter_demuxmix) - k_hto = split_input(params.k_hto) - k_rna = split_input(params.k_rna) - correctTails = split_input(params.correctTails) - assignmentOutDemuxmix = split_input(params.assignmentOutDemuxmix) - gene_col = split_input(params.gene_col) - - - demuxmix(hto_matrix,rna_matrix,hto_raw_or_filtered,rna_raw_or_filtered,rna_available, assay,ndelim,model, alpha_demuxmix, beta_demuxmix, tol_demuxmix, maxIter_demuxmix, k_hto, k_rna,correctTails,assignmentOutDemuxmix,gene_col ) - - emit: - demuxmix.out.collect() -} - -workflow{ - demuxmix_hashing() -} From 5b722edc715a1848a018152b8c514fc7e6ba4ced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 29 Nov 2023 11:21:08 +0100 Subject: [PATCH 18/32] fix for hashed drops, nextflow passing incompatible strings for NULL values --- bin/dropletUtils.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index d775e91..0b83d06 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -83,10 +83,11 @@ png(paste0(args$outputdir, "/", "plot_emptyDrops.png")) plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col=colors, xlab="Total UMI count", ylab="-Log Probability") dev.off() +combinations_transformed <- ifelse(tolower(args$combinations) == "null", NULL, args$combinations) if (args$ambient == TRUE) { - hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = args$combinations) + hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) } else { - hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = args$combinations) + hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) } print("------------------- hashedDrops finished ---------------------------------") From 64d1bb19117dcb277e7661a1e813b229acfc7c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 29 Nov 2023 11:27:43 +0100 Subject: [PATCH 19/32] fix for hashed drops, nextflow passing incompatible strings for NULL values --- bin/dropletUtils.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index 0b83d06..8cb7cfa 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -60,10 +60,10 @@ parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or f args <- parser$parse_args() hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) - +ignore_transformed <- ifelse(tolower(args$ignore) == "null", NULL, args$ignore) emptyDrops_out <- emptyDrops(hto, lower = args$lower, niters = args$niters, test.ambient = args$testAmbient, - ignore = args$ignore, + ignore = ignore_transformed, alpha = args$alpha, round = args$round, by.rank = args$byRank) @@ -84,6 +84,7 @@ plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col=colors, xlab="Total UMI dev.off() combinations_transformed <- ifelse(tolower(args$combinations) == "null", NULL, args$combinations) + if (args$ambient == TRUE) { hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) } else { From f00e1e62ef99d8faa64125b410b6ace0ea0d8881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 29 Nov 2023 12:41:47 +0100 Subject: [PATCH 20/32] test NULL for hashing drops --- bin/dropletUtils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index 8cb7cfa..159eedf 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -63,7 +63,7 @@ hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) ignore_transformed <- ifelse(tolower(args$ignore) == "null", NULL, args$ignore) emptyDrops_out <- emptyDrops(hto, lower = args$lower, niters = args$niters, test.ambient = args$testAmbient, - ignore = ignore_transformed, + ignore = NULL, alpha = args$alpha, round = args$round, by.rank = args$byRank) From 8575aa4af6cff7fa88233e52b57b71ffe632fd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 6 Dec 2023 11:14:36 +0100 Subject: [PATCH 21/32] debugging --- .gitignore | 3 ++- docs/source/hashing.md | 2 +- main.nf | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index b2b6327..b390657 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ result*/ testing/ testing* *.pyc -docs/build/ \ No newline at end of file +docs/build/ +nextflow_internal.config \ No newline at end of file diff --git a/docs/source/hashing.md b/docs/source/hashing.md index 16c869c..9ced8bd 100644 --- a/docs/source/hashing.md +++ b/docs/source/hashing.md @@ -134,7 +134,7 @@ output directory: `$pipeline_output_folder/gmm_demux/gmm_demux_[task_ID/sampleId output directory: `$pipeline_output_folder/bff/bff_[task_ID/sampleId]` -- `${params.assignmentOutBff}_assignment_demuxmix.csv`: the assignment and classification results produced by BFF +- `${params.assignmentOutBff}_assignment_bff.csv`: the assignment and classification results produced by BFF - `params.csv`: specified parameters in the BFF task ## **Parameter** diff --git a/main.nf b/main.nf index 547e332..000d535 100644 --- a/main.nf +++ b/main.nf @@ -72,7 +72,7 @@ workflow run_single{ } } else if (params.mode == "hashing"){ - + print("Running single sample") hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) if (params.match_donor == "True"){ donor_match(hash_demultiplexing.out) @@ -102,6 +102,7 @@ workflow run_single{ } workflow { + print(params.multi_input) if (params.multi_input == null){ run_single() } From c68f70aec369efc97e48b06978ecdc1622b5aef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 6 Dec 2023 23:37:01 +0100 Subject: [PATCH 22/32] debugging --- bin/dropletUtils.R | 5 +++-- main.nf | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index 159eedf..9504872 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -63,10 +63,11 @@ hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) ignore_transformed <- ifelse(tolower(args$ignore) == "null", NULL, args$ignore) emptyDrops_out <- emptyDrops(hto, lower = args$lower, niters = args$niters, test.ambient = args$testAmbient, - ignore = NULL, + ignore = ignore_transformed, alpha = args$alpha, round = args$round, by.rank = args$byRank) - +print("-------------") +print(emptyDrops_out) print("------------------- emptyDrops finished ---------------------------------") diff --git a/main.nf b/main.nf index 000d535..c455a47 100644 --- a/main.nf +++ b/main.nf @@ -102,7 +102,6 @@ workflow run_single{ } workflow { - print(params.multi_input) if (params.multi_input == null){ run_single() } From d49018fda80d970af7b7c8dfeef70c17760404b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 13 Dec 2023 09:14:53 -0400 Subject: [PATCH 23/32] summary correction classification and assignment --- bin/bff.R | 2 +- bin/summary_hash.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/bff.R b/bin/bff.R index 6154dcf..b0ea7af 100755 --- a/bin/bff.R +++ b/bin/bff.R @@ -74,8 +74,8 @@ if(as.logical(args$preprocess)){ # Step 3: Create a vector from the barcodesl vector <- unlist(words) print("Preprocessing") - #counts <- Read10X(args$fileHto) counts <- ProcessCountMatrix(rawCountData = args$fileHto, barcodeBlacklist = vector) + print("Preprocessing done") }else{ print("No preprocessing") counts <- Read10X(args$fileHto) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index bad62f7..d47fd16 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -364,7 +364,6 @@ def bff_summary(bff_res,raw_adata, raw_mudata): for column in column_names: if column in dt_assign.columns: dt_assign = dt_assign.drop([column], axis=1) - dt_assign.loc[dt_assign["consensuscall"] == "Singlet", "consensuscall"] = "singlet" dt_assign.loc[dt_assign["consensuscall"] == "Doublet", "consensuscall"] = "doublet" dt_assign.loc[dt_assign["consensuscall"] == "Negative", "consensuscall"] = "negative" dt_assign['consensuscall'] = dt_assign['consensuscall'].astype('category') @@ -392,14 +391,14 @@ def bff_summary(bff_res,raw_adata, raw_mudata): dt_classi = data_bff.copy() column_names_class = ["bff_raw","bff_cluster","consensuscall"] for column in column_names_class: - if column in dt_assign.columns: + if column in dt_classi.columns: dt_classi = dt_classi.drop([column], axis=1) dt_classi.loc[dt_classi["consensuscall.global"] == "Singlet", "consensuscall.global"] = "singlet" dt_classi.loc[dt_classi["consensuscall.global"] == "Doublet", "consensuscall.global"] = "doublet" dt_classi.loc[dt_classi["consensuscall.global"] == "Negative", "consensuscall.global"] = "negative" dt_classi = dt_classi.rename(columns={"cellbarcode": "Barcode", "consensuscall.global": os.path.basename(x)}) classi.append(dt_classi) - + params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] From 1940f26e2521e10f7e5f0a143a0ba9e91989e887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 13 Dec 2023 21:14:21 -0400 Subject: [PATCH 24/32] restructured GMM-demux summary for assignment and classification --- bin/summary_hash.py | 54 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index d47fd16..8e3d6fb 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -268,15 +268,33 @@ def htodemux_summary(htodemux_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("hash_summary/htodemux_params.csv") + + def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): classi = [] assign = [] params = [] for x in gmmDemux_res: obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("GMM_full.csv")][0]) + + #we get the number of hashes used for the experiment from the parameters + #so that we now how many kinds of singlets we could find in the assignment + params_dir = os.path.join(x, "params.csv") + params_res = pd.read_csv(params_dir,index_col=False) + params_res.columns = ["Argument", os.path.basename(x)] + params.append(params_res) + + result_row = params_res[params_res['Argument'].str.contains('hto_name_gmm', case=False, na=False)] + hashes_used = "" + if not result_row.empty: + hashes_used = result_row[os.path.basename(x)].iloc[0] + else: + print("No row contains the number of hashes") + hashes = hashes_used.split(',') + number_of_hashes = len(hashes) + #GMM assignement file gmm_classi = pd.read_csv(obs_res_dir) - #GMM full is the name given by GMM_demux per default to all results classification_config = os.path.join(x, "GMM_full.config") classif_file = pd.read_csv(classification_config,header=None) @@ -284,23 +302,39 @@ def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): #Classification and Assigment come from the same file gmm_dt = pd.DataFrame(gmm_classi) classification_dt = pd.DataFrame(classif_file) + #change column names classification_dt = classification_dt.rename(columns={0: "Cluster_id", 1: "assignment"}) + gmm_dt = gmm_dt.rename(columns={"Unnamed: 0": "Barcode"}) #Create classification following the assignment found for the barcodes #we keep the original assigment and add a classification column - classification_dt["assignment_binary"] = classification_dt["assignment"].str.contains("-") - classification_dt["classification"] = classification_dt["assignment"].apply(lambda x: "doublet" if x else "singlet") - classification_dt.at[0, "classification"] = "negative" + + ##Inner function + def classify_hash(row,number_hashes): + if row == 0: + return 'negative' + elif 0 > row <= number_hashes: + return 'singlet' + else: + return 'doublet' + + classification_dt['Classification'] = classification_dt['Cluster_id'].apply(lambda x: classify_hash(x, number_of_hashes)) + #Compare classification guide file with classification found merged = pd.merge(classification_dt, gmm_dt, on='Cluster_id', how='left') - - gmm_dt['Classification'] = merged['classification'] + gmm_dt['Classification'] = merged['Classification'] gmm_dt['Assignment'] = merged['assignment'] - + #instead of multiple hashes, add doublet + + gmm_dt["Assignment"] = gmm_dt["Assignment"].apply(lambda x: "doublet" if "-" in x else x) + classification_dt['Classification'] = classification_dt['Classification'].str.replace(' ', '') #Assigment for GMM-Demux + #'Cluster_id', gmm_dt_assign = gmm_dt.drop(['Cluster_id','Confidence','Classification' ], axis=1) + gmm_dt_assign['Assignment'] = gmm_dt_assign['Assignment'].str.replace(' ', '') gmm_dt_assign.columns = ["Barcode", os.path.basename(x)] + assign.append(gmm_dt_assign) if raw_adata is not None: @@ -325,11 +359,7 @@ def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): gmm_dt_classi.columns =["Barcode", os.path.basename(x)] classi.append(gmm_dt_classi) - params_dir = os.path.join(x, "params.csv") - params_res = pd.read_csv(params_dir,index_col=False) - params_res.columns = ["Argument", os.path.basename(x)] - params.append(params_res) - + classi_df = pd.concat(classi, axis=1, join="outer") classi_df.to_csv("hash_summary" +"/GMM_classification.csv", index=False) From d52e505ae5b3616fb07618c7fc3408d852ef24dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Wed, 13 Dec 2023 21:54:16 -0400 Subject: [PATCH 25/32] correction Bff params --- bin/summary_hash.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index 8e3d6fb..7b4f022 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -430,8 +430,10 @@ def bff_summary(bff_res,raw_adata, raw_mudata): classi.append(dt_classi) params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) - params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) - params_res.columns = [os.path.basename(x)] + params_res = pd.read_csv(params_dir,sep=';',index_col=None) + params_res = params_res.drop(['Unnamed: 0' ], axis=1) + params_res = params_res.dropna(subset=['Argument', 'Value'], how='any') + params_res.columns = ["Argument",os.path.basename(x)] params.append(params_res) classi_df = pd.concat(classi, axis=1, join="outer") From fa48b0c5084b8196494eacc24c398e4998a73c92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Thu, 14 Dec 2023 09:14:17 -0400 Subject: [PATCH 26/32] fixed summary hash general --- bin/summary_hash.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index 7b4f022..d484a8a 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -398,6 +398,7 @@ def bff_summary(bff_res,raw_adata, raw_mudata): dt_assign.loc[dt_assign["consensuscall"] == "Negative", "consensuscall"] = "negative" dt_assign['consensuscall'] = dt_assign['consensuscall'].astype('category') dt_assign = dt_assign.rename(columns={"cellbarcode": "Barcode", "consensuscall": os.path.basename(x)}) + dt_assign['Barcode'] = dt_assign['Barcode'].apply(lambda x: str(x) + "-1") assign.append(dt_assign) if raw_adata is not None: adata = raw_adata.copy() @@ -427,6 +428,7 @@ def bff_summary(bff_res,raw_adata, raw_mudata): dt_classi.loc[dt_classi["consensuscall.global"] == "Doublet", "consensuscall.global"] = "doublet" dt_classi.loc[dt_classi["consensuscall.global"] == "Negative", "consensuscall.global"] = "negative" dt_classi = dt_classi.rename(columns={"cellbarcode": "Barcode", "consensuscall.global": os.path.basename(x)}) + dt_classi['Barcode'] = dt_classi['Barcode'].apply(lambda x: str(x) + "-1") classi.append(dt_classi) params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) @@ -494,19 +496,23 @@ def bff_summary(bff_res,raw_adata, raw_mudata): # Read and combine assignment files assignment = [file for file in os.listdir("hash_summary") if file.endswith("_assignment.csv")] - assignment_all = pd.read_csv(os.path.join("hash_summary", assignment[0])) - - if len(assignment) > 1: - for df in assignment[1:]: - df = pd.read_csv(os.path.join("hash_summary", df)) - assignment_all = pd.merge(assignment_all, df, on='Barcode', how='outer') + assignment_all = pd.DataFrame() + + for file in assignment: + file_path = os.path.join("hash_summary", file) + df = pd.read_csv(file_path, usecols=['Barcode', pd.read_csv(file_path).columns[1]]) + assignment_all = pd.concat([assignment_all, df.set_index('Barcode')], axis=1, join='outer') + + # Reset the index to have 'Barcode' as a regular column + assignment_all.reset_index(inplace=True) + assignment_all.to_csv("hash_summary/hashing_assignment_all.csv", index=False) # Read and combine classification files classification = [file for file in os.listdir("hash_summary") if file.endswith("_classification.csv")] classification_all = pd.read_csv(os.path.join("hash_summary", classification[0])) - - if len(classification) > 1: + + if len(classification) > 1: for df in classification[1:]: df = pd.read_csv(os.path.join("hash_summary", df)) classification_all = pd.merge(classification_all, df, on='Barcode', how='outer') From 33e90f2e2c3803cc0f11a9754e6781cd63e1b351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Thu, 14 Dec 2023 09:41:16 -0400 Subject: [PATCH 27/32] fix warning hashsolo --- bin/summary_hash.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index d484a8a..4c18a49 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -104,7 +104,8 @@ def hashsolo_summary(hashsolo_res, raw_adata, raw_mudata): mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") hashsolo_classi = obs_res[["most_likely_hypothesis"]] - hashsolo_classi.loc[:, "most_likely_hypothesis"] = hashsolo_classi["most_likely_hypothesis"].replace({0.0: "negative", 1.0: "singlet", 2.0: "doublet"}) + #hashsolo_classi.loc[:, "most_likely_hypothesis"] = hashsolo_classi["most_likely_hypothesis"].replace({0.0: "negative", 1.0: "singlet", 2.0: "doublet"}) + hashsolo_classi['most_likely_hypothesis'] = hashsolo_classi['most_likely_hypothesis'].apply(lambda x: 'negative' if x == 0.0 else ('singlet' if x == 1.0 else 'doublet')) hashsolo_classi.columns = [os.path.basename(x)] classi.append(hashsolo_classi) From a2f9037b72c777005bbdffed9ba5152ee9c7f35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 18 Dec 2023 10:39:58 -0400 Subject: [PATCH 28/32] fixes hashed drops --- bin/dropletUtils.R | 48 +++++++++++----------- modules/single/hash_demulti/bff.nf | 1 + modules/single/hash_demulti/demuxem.nf | 2 +- modules/single/hash_demulti/hashedDrops.nf | 7 +++- modules/single/hash_demulti/hashsolo.nf | 2 +- nextflow.config | 1 + 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index 9504872..6cc3a62 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -27,6 +27,8 @@ parser$add_argument("--objectOutEmptyDrops", default = "emptyDroplets", help = "Prefix name for the emptyDrops RDS file") parser$add_argument("--assignmentOutEmptyDrops", default = "emptyDroplets", help = "prefex name for emptyDrops assignment CSV file") +parser$add_argument("--runEmptyDrops", action="store_true", + help = "Executes emptyDrops function only when desired, recomended only for raw data") #for hashedDrops parser$add_argument("--ambient", action = "store_true", @@ -60,29 +62,29 @@ parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or f args <- parser$parse_args() hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) -ignore_transformed <- ifelse(tolower(args$ignore) == "null", NULL, args$ignore) -emptyDrops_out <- emptyDrops(hto, lower = args$lower, niters = args$niters, - test.ambient = args$testAmbient, - ignore = ignore_transformed, - alpha = args$alpha, round = args$round, - by.rank = args$byRank) -print("-------------") -print(emptyDrops_out) -print("------------------- emptyDrops finished ---------------------------------") - - -print("-------- Following Files are saved in folder hashedDrops_out ------------") -print(paste0(args$objectOutEmptyDrops, ".rds")) -print(paste0(args$assignmentOutEmptyDrops, ".csv")) -write.csv(emptyDrops_out, paste0(args$outputdir, "/", args$assignmentOutEmptyDrops, ".csv")) -saveRDS(emptyDrops_out, file=paste0(args$outputdir, "/", args$objectOutEmptyDrops, ".rds")) - -print("------------------- filtering empty droplets ----------------------------") -is.cell <- emptyDrops_out$FDR <= args$isCellFDR -colors <- ifelse(is.cell, "red", "black") -png(paste0(args$outputdir, "/", "plot_emptyDrops.png")) -plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col=colors, xlab="Total UMI count", ylab="-Log Probability") -dev.off() +rna <- Read10X(data.dir = args$raw_rna_matrix_dir, gene.column = args$gene_col) +print(args$runEmptyDrops) +#if (args$runEmptyDrops == TRUE) { + print("------------------- executing emptyDrops ---------------------------------") + ignore_transformed <- ifelse(tolower(args$ignore) == "null", NULL, args$ignore) + emptyDrops_out <- emptyDrops(rna, lower = args$lower, niters = args$niters, + test.ambient = args$testAmbient, + ignore = NULL, + alpha = args$alpha, round = args$round, + by.rank = args$byRank) + + + print("------------------- emptyDrops finished ---------------------------------") + write.csv(emptyDrops_out, paste0(args$outputdir, "/", args$assignmentOutEmptyDrops, ".csv")) + saveRDS(emptyDrops_out, file=paste0(args$outputdir, "/", args$objectOutEmptyDrops, ".rds")) + + print("------------------- filtering empty droplets ----------------------------") + is.cell <- emptyDrops_out$FDR <= args$isCellFDR + colors <- ifelse(is.cell, "red", "black") + png(paste0(args$outputdir, "/", "plot_emptyDrops.png")) + plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col=colors, xlab="Total UMI count", ylab="-Log Probability") + dev.off() +#} combinations_transformed <- ifelse(tolower(args$combinations) == "null", NULL, args$combinations) diff --git a/modules/single/hash_demulti/bff.nf b/modules/single/hash_demulti/bff.nf index f60e098..1160786 100644 --- a/modules/single/hash_demulti/bff.nf +++ b/modules/single/hash_demulti/bff.nf @@ -7,6 +7,7 @@ process bff{ conda "$projectDir/conda/bff.yml" + input: path hto_matrix, stageAs: 'hto_data' diff --git a/modules/single/hash_demulti/demuxem.nf b/modules/single/hash_demulti/demuxem.nf index 4d94480..32d59b3 100644 --- a/modules/single/hash_demulti/demuxem.nf +++ b/modules/single/hash_demulti/demuxem.nf @@ -6,7 +6,7 @@ process demuxem{ label 'small_mem' conda "bioconda::pegasuspy demuxEM scanpy" - + input: path raw_rna_matrix_dir, stageAs: "rna_data_${params.rna_matrix_demuxem}" path raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxem}" diff --git a/modules/single/hash_demulti/hashedDrops.nf b/modules/single/hash_demulti/hashedDrops.nf index a89a7ce..f470750 100755 --- a/modules/single/hash_demulti/hashedDrops.nf +++ b/modules/single/hash_demulti/hashedDrops.nf @@ -19,6 +19,7 @@ process hashedDrops{ each isCellFDR val objectOutEmptyDrops val assignmentOutEmptyDrops + each runEmptyDrops each ambient each minProp @@ -45,11 +46,12 @@ process hashedDrops{ def alp = alpha != "None" ? " --alpha ${alpha}" : '' def byR = byRank != "None" ? " --by.rank ${byRank}" : '' def amb = ambient != 'False' ? " --ambient" : '' + def run_empty = runEmptyDrops != 'False' ? " --runEmptyDrops" : '' def comb = combinations != "None" ? " --combinations ${combinations}" : '' """ mkdir hashedDrops_${task.index} - dropletUtils.R --raw_hto_matrix_dir $raw_hto_matrix_dir --lower $lower --niters $niters --isCellFDR $isCellFDR --objectOutEmptyDrops $objectOutEmptyDrops --assignmentOutEmptyDrops $assignmentOutEmptyDrops --minProp $minProp --pseudoCount $pseudoCount --doubletNmads $doubletNmads --doubletMin $doubletMin --confidentNmads $confidentNmads --confidenMin $confidenMin --objectOutHashedDrops $objectOutHashedDrops --outputdir hashedDrops_${task.index} --assignmentOutHashedDrops ${assignmentOutHashedDrops}${testAmb}${ign}${alp}${rou}${byR}${constantAmb}${doubletMix}${amb}${comb} --gene_col $gene_col + dropletUtils.R --raw_hto_matrix_dir $raw_hto_matrix_dir --lower $lower --niters $niters --isCellFDR $isCellFDR --objectOutEmptyDrops $objectOutEmptyDrops --assignmentOutEmptyDrops $assignmentOutEmptyDrops --minProp $minProp --pseudoCount $pseudoCount --doubletNmads $doubletNmads --doubletMin $doubletMin --confidentNmads $confidentNmads --confidenMin $confidenMin --objectOutHashedDrops $objectOutHashedDrops --outputdir hashedDrops_${task.index} --assignmentOutHashedDrops ${assignmentOutHashedDrops}${testAmb}${ign}${alp}${rou}${byR}${constantAmb}${doubletMix}${amb}${comb}${run_empty} --gene_col $gene_col --runEmptyDrops $runEmptyDrops """ } @@ -79,6 +81,7 @@ workflow hashedDrops_hashing{ isCellFDR = split_input(params.isCellFDR) objectOutEmptyDrops = params.objectOutEmptyDrops assignmentOutEmptyDrops = params.assignmentOutEmptyDrops + runEmptyDrops = split_input(params.runEmptyDrops) ambient = split_input(params.ambient) minProp = split_input(params.minProp) @@ -94,7 +97,7 @@ workflow hashedDrops_hashing{ assignmentOutHashedDrops = params.assignmentOutHashedDrops gene_col = split_input(params.gene_col) - hashedDrops(hto_matrix, lower, niters, testAmbient, ignore, alpha, round, byRank, isCellFDR, objectOutEmptyDrops, assignmentOutEmptyDrops, ambient, minProp, pseudoCount, constantAmbient, doubletNmads, doubletMin, doubletMixture, confidentNmads, confidenMin, combinations, objectOutHashedDrops, assignmentOutHashedDrops, gene_col) + hashedDrops(hto_matrix, lower, niters, testAmbient, ignore, alpha, round, byRank, isCellFDR, objectOutEmptyDrops, assignmentOutEmptyDrops,runEmptyDrops, ambient, minProp, pseudoCount, constantAmbient, doubletNmads, doubletMin, doubletMixture, confidentNmads, confidenMin, combinations, objectOutHashedDrops, assignmentOutHashedDrops, gene_col) emit: hashedDrops.out.collect() diff --git a/modules/single/hash_demulti/hashsolo.nf b/modules/single/hash_demulti/hashsolo.nf index d2a03a8..e6ce254 100755 --- a/modules/single/hash_demulti/hashsolo.nf +++ b/modules/single/hash_demulti/hashsolo.nf @@ -5,7 +5,7 @@ process hash_solo{ label 'small_mem' conda "$projectDir/conda/hashsolo_py.yml" - + input: path hto_data, stageAs: "hto_data_${params.hto_matrix_hashsolo}" each priors_negative diff --git a/nextflow.config b/nextflow.config index 3e9c700..f8834a8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -107,6 +107,7 @@ params { combinations = "None" objectOutHashedDrops = "hashedDrops" assignmentOutHashedDrops = "hashedDrops" + runEmptyDrops = "True" // demuxem demuxem = "True" From 73ca4770177b9bf94164171432acde3e9039c74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 18 Dec 2023 14:16:00 -0400 Subject: [PATCH 29/32] fixes to hashed drops --- bin/dropletUtils.R | 26 +++++++++++++--------- modules/single/hash_demulti/hashedDrops.nf | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index 6cc3a62..83c5caa 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -27,11 +27,11 @@ parser$add_argument("--objectOutEmptyDrops", default = "emptyDroplets", help = "Prefix name for the emptyDrops RDS file") parser$add_argument("--assignmentOutEmptyDrops", default = "emptyDroplets", help = "prefex name for emptyDrops assignment CSV file") -parser$add_argument("--runEmptyDrops", action="store_true", +parser$add_argument("--runEmptyDrops", action="store_false", help = "Executes emptyDrops function only when desired, recomended only for raw data") #for hashedDrops -parser$add_argument("--ambient", action = "store_true", +parser$add_argument("--ambient", action = "store_false", help = "Whether to use the relative abundance of each HTO in the ambient solution from emtpyDrops, set TRUE only when test_ambient is TRUE.") parser$add_argument("--minProp", default = 0.05, type = "double", help = "Numeric scalar to be used to infer the ambient profile when ambient=NULL,") @@ -62,9 +62,13 @@ parser$add_argument("--gene_col", help = "Specify which column of genes.tsv or f args <- parser$parse_args() hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) -rna <- Read10X(data.dir = args$raw_rna_matrix_dir, gene.column = args$gene_col) -print(args$runEmptyDrops) -#if (args$runEmptyDrops == TRUE) { +combinations_transformed <- ifelse(tolower(args$combinations) == "null", NULL, args$combinations) + +print(args$constantAmbient) +print("----------------") +print(args$ambient) +if (args$runEmptyDrops == TRUE) { + rna <- Read10X(data.dir = args$raw_rna_matrix_dir,gene.column = args$gene_col) print("------------------- executing emptyDrops ---------------------------------") ignore_transformed <- ifelse(tolower(args$ignore) == "null", NULL, args$ignore) emptyDrops_out <- emptyDrops(rna, lower = args$lower, niters = args$niters, @@ -84,14 +88,16 @@ print(args$runEmptyDrops) png(paste0(args$outputdir, "/", "plot_emptyDrops.png")) plot(emptyDrops_out$Total, -emptyDrops_out$LogProb, col=colors, xlab="Total UMI count", ylab="-Log Probability") dev.off() -#} -combinations_transformed <- ifelse(tolower(args$combinations) == "null", NULL, args$combinations) -if (args$ambient == TRUE) { - hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) + if (args$ambient == TRUE) { + hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, ambient = metadata(emptyDrops_out)$ambient, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) + } else { + hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) + } } else { - hashedDrops_out <- hashedDrops(hto[,which(is.cell)], min.prop = args$minProp, pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient, doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin, doublet.mixture = args$doubletMixture, confident.nmads = args$confidentNmads, confident.min = args$confidenMin, combinations = combinations_transformed) + hashedDrops_out <- hashedDrops(hto,min.prop = args$minProp,pseudo.count = args$pseudoCount, constant.ambient = args$constantAmbient,doublet.nmads = args$doubletNmads, doublet.min = args$doubletMin,confident.nmads = args$confidentNmads,confident.min = args$confidenMin) + } print("------------------- hashedDrops finished ---------------------------------") diff --git a/modules/single/hash_demulti/hashedDrops.nf b/modules/single/hash_demulti/hashedDrops.nf index f470750..ec4bd43 100755 --- a/modules/single/hash_demulti/hashedDrops.nf +++ b/modules/single/hash_demulti/hashedDrops.nf @@ -51,7 +51,7 @@ process hashedDrops{ """ mkdir hashedDrops_${task.index} - dropletUtils.R --raw_hto_matrix_dir $raw_hto_matrix_dir --lower $lower --niters $niters --isCellFDR $isCellFDR --objectOutEmptyDrops $objectOutEmptyDrops --assignmentOutEmptyDrops $assignmentOutEmptyDrops --minProp $minProp --pseudoCount $pseudoCount --doubletNmads $doubletNmads --doubletMin $doubletMin --confidentNmads $confidentNmads --confidenMin $confidenMin --objectOutHashedDrops $objectOutHashedDrops --outputdir hashedDrops_${task.index} --assignmentOutHashedDrops ${assignmentOutHashedDrops}${testAmb}${ign}${alp}${rou}${byR}${constantAmb}${doubletMix}${amb}${comb}${run_empty} --gene_col $gene_col --runEmptyDrops $runEmptyDrops + dropletUtils.R --raw_hto_matrix_dir $raw_hto_matrix_dir --lower $lower --niters $niters --isCellFDR $isCellFDR --objectOutEmptyDrops $objectOutEmptyDrops --assignmentOutEmptyDrops $assignmentOutEmptyDrops --minProp $minProp --pseudoCount $pseudoCount --doubletNmads $doubletNmads --doubletMin $doubletMin --confidentNmads $confidentNmads --confidenMin $confidenMin --objectOutHashedDrops $objectOutHashedDrops --outputdir hashedDrops_${task.index} --assignmentOutHashedDrops ${assignmentOutHashedDrops}${testAmb}${ign}${alp}${rou}${byR}${constantAmb}${doubletMix}${amb}${comb}${run_empty} --gene_col $gene_col """ } From c119904fe7aaba74f07cc1278cac5e60bd00c83a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Tue, 19 Dec 2023 12:58:15 -0400 Subject: [PATCH 30/32] taking out comments debugging --- bin/dropletUtils.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/bin/dropletUtils.R b/bin/dropletUtils.R index 83c5caa..8efdcde 100755 --- a/bin/dropletUtils.R +++ b/bin/dropletUtils.R @@ -64,9 +64,6 @@ args <- parser$parse_args() hto <- Read10X(data.dir = args$raw_hto_matrix_dir, gene.column = args$gene_col) combinations_transformed <- ifelse(tolower(args$combinations) == "null", NULL, args$combinations) -print(args$constantAmbient) -print("----------------") -print(args$ambient) if (args$runEmptyDrops == TRUE) { rna <- Read10X(data.dir = args$raw_rna_matrix_dir,gene.column = args$gene_col) print("------------------- executing emptyDrops ---------------------------------") @@ -78,7 +75,6 @@ if (args$runEmptyDrops == TRUE) { by.rank = args$byRank) - print("------------------- emptyDrops finished ---------------------------------") write.csv(emptyDrops_out, paste0(args$outputdir, "/", args$assignmentOutEmptyDrops, ".csv")) saveRDS(emptyDrops_out, file=paste0(args$outputdir, "/", args$objectOutEmptyDrops, ".rds")) @@ -100,7 +96,6 @@ if (args$runEmptyDrops == TRUE) { } -print("------------------- hashedDrops finished ---------------------------------") ignore <- args$ignore if (is.null(ignore)) { From aaba40265a5cfa205c16972f86b6898a4a4960b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Tue, 19 Dec 2023 15:30:57 -0400 Subject: [PATCH 31/32] docs hashing --- docs/source/hashing.md | 94 ++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/docs/source/hashing.md b/docs/source/hashing.md index 9ced8bd..9414658 100644 --- a/docs/source/hashing.md +++ b/docs/source/hashing.md @@ -17,16 +17,15 @@ nextflow run main.nf -profile test --mode hashing The input data depends heavily on the deconvolution tools. In the following table, you will find the minimal input data required by different tools. -| Deconvolution method | Input data | Parameter | -| -------------------- | ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------- | -| HTODemux | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_htodemux`
`params.hto_matrix_htodemux` | -| Multiseq | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_multiseq`
`params.hto_matrix_multiseq` | -| HashSolo | - 10x mtx directory with hashing count matrix (H5) | `params.hto_matrix_hashsolo`
`params.rna_matrix_hashsolo` | -| HashedDrops | - 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_hashedDrops` | +| Deconvolution method | Input data | Parameter | +| -------------------- | ------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------- | +| HTODemux | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_htodemux`
`params.hto_matrix_htodemux` | +| Multiseq | - Seurat object with both UMI and hashing count matrix (RDS) | `params.rna_matrix_multiseq`
`params.hto_matrix_multiseq` | +| HashSolo | - 10x mtx directory with hashing count matrix (H5) | `params.hto_matrix_hashsolo`
`params.rna_matrix_hashsolo` | +| HashedDrops | - 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_hashedDrops` | | Demuxem | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_demuxem`
`params.rna_matrix_demuxem` | -| GMM - Demux | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_gmm_demux` | -| BFF | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_bff` | - +| GMM - Demux | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_gmm_demux` | +| BFF | - 10x mtx directory with UMI count matrix (Directory)
- 10x mtx directory with hashing count matrix (Directory) | `params.hto_matrix_bff` | Similary as genotype-based deconvlution methods, hashing methods also have some input in common. So we also try to utilize common input parameters `params.[rna/hto]_matrix_[raw/filtered]` to store count matrices for better control and `params.[rna/hto]_matrix_[method]` is used to specify whether to use raw or filtered counts for each method. @@ -117,7 +116,6 @@ output directory: `$pipeline_output_folder/hashedDrops/hashedDrops_[task_ID/samp - `${params.objectOutHashedDrops}_LogFC.png`: a diagnostic plot comparing the log-fold change between the second HTO's abundance and the ambient contamination - `params.csv`: specified parameters in the HashedDrops task - ### GMM-Demux output directory: `$pipeline_output_folder/gmm_demux/gmm_demux_[task_ID/sampleId]` @@ -283,55 +281,61 @@ output directory: `$pipeline_output_folder/bff/bff_[task_ID/sampleId]` | combinations | An integer matrix specifying valid combinations of HTOs. Each row corresponds to a single sample and specifies the indices of rows in x corresponding to the HTOs used to label that sample. Default: None | | objectOutHashedDrops | Prefix of the hashedDrops output RDS object. Default: hashedDrops | | assignmentOutHashedDrops | Prefix of the hashedDrops output CSV file. Default: hashedDrops | + ### GMM-Demux -| | | -| ------------------------ | -------------------------------------------------------------------------------------------- | -| gmmDemux | Whether to perform GMMDemux. Default: True | -| hto_matrix_gmm_demux | Whether to use raw or filtered HTO count matrix. Default: filtered | -| assignmentOutGmmDemux | Name for the folder output. Default: gmm_demux| -| hto_name_gmm | list of sample tags (HTOs) separated by ',' without whitespace. Default: None | -| summary | the estimated total count of cells in the single cell assay. Default: 2000 | -| report_gmm | Name for the file generated by the summary. Default:report.txt | -| mode_GMM | Format of the input, either tsv or csv. Default: tsv | -| extract | extract names of the sample barcoding tag(s) to extract, separated by ','. Joint tags are linked with '+'. Default: None | -| threshold_gmm | Provide the confidence threshold value. Requires a float in (0,1). Default: 0.8 | -| ambiguous | The estimated chance of having a phony GEM getting included in a pure type GEM cluster by the clustering algorithm. Default: 0.5. | -| plotOutHashSolo | Prefix of the output figures. Default: hashsolo | + +| | | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------- | +| gmmDemux | Whether to perform GMMDemux. Default: True | +| hto_matrix_gmm_demux | Whether to use raw or filtered HTO count matrix. Default: filtered | +| assignmentOutGmmDemux | Name for the folder output. Default: gmm_demux | +| hto_name_gmm | list of sample tags (HTOs) separated by ',' without whitespace. Default: None | +| summary | the estimated total count of cells in the single cell assay. Default: 2000 | +| report_gmm | Name for the file generated by the summary. Default:report.txt | +| mode_GMM | Format of the input, either tsv or csv. Default: tsv | +| extract | extract names of the sample barcoding tag(s) to extract, separated by ','. Joint tags are linked with '+'. Default: None | +| threshold_gmm | Provide the confidence threshold value. Requires a float in (0,1). Default: 0.8 | +| ambiguous | The estimated chance of having a phony GEM getting included in a pure type GEM cluster by the clustering algorithm. Default: 0.5. | +| plotOutHashSolo | Prefix of the output figures. Default: hashsolo | ### BFF -| | | -| ------------------------ | -------------------------------------------------------------------------------------------- | -| BFF | Whether to perform BFF. Default: False | -| hto_matrix_bff | Whether to use raw or filtered HTO count matrix. Default: raw | -| rna_matrix_bff | Whether to use raw or filtered scRNA-seq count matrix. Default: raw | -| assignmentOutBff | Name for the folder output. Default: bff| -| methods | method or list of methods to be used. Default: combined_bff | -| methodsForConsensus | a consensus call will be generated using all methods especified. Default: NULL | -| cellbarcodeWhitelist | A vector of expected cell barcodes. Default:NULL | -| metricsFile | summary metrics will be written to this file. Default: metrics_bff.cvs | -| doTSNE | tSNE will be run on the resulting hashing calls after each caller. Default: True | -| doHeatmap | if true, Seurat::HTOHeatmap will be run on the results of each calle Default: True | -| perCellSaturation | An optional dataframe with the columns cellbarcode and saturation. Default: NULL | -| majorityConsensusThreshold | This applies to calculating a consensus call when multiple algorithms are used. Default: NULL | -| chemistry | This string is passed to EstimateMultipletRate. Should be either 10xV2 or 10xV3. Default: 10xV3 | -| callerDisagreementThreshold | If provided, the agreement rate will be calculated between each caller and the simple majority call, ignoring discordant and no-call cells. Default: NULL | -| preprocess_bff | When True, the data is preprocess using the method ProcessCountMatrix from CellHashR. Default: False | -| barcodeWhitelist | A vector of barcode names to retain. This parameter is used only when the pre-processing step is executed. Default: NULL | +| | | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| BFF | Whether to perform BFF. Default: False | +| hto_matrix_bff | Whether to use raw or filtered HTO count matrix. Default: raw | +| rna_matrix_bff | Whether to use raw or filtered scRNA-seq count matrix. Default: raw | +| assignmentOutBff | Name for the folder output. Default: bff | +| methods | method or list of methods to be used. Default: combined_bff | +| methodsForConsensus | a consensus call will be generated using all methods especified. Default: NULL | +| cellbarcodeWhitelist | A vector of expected cell barcodes. Default:NULL | +| metricsFile | summary metrics will be written to this file. Default: metrics_bff.cvs | +| doTSNE | tSNE will be run on the resulting hashing calls after each caller. Default: True | +| doHeatmap | if true, Seurat::HTOHeatmap will be run on the results of each calle Default: True | +| perCellSaturation | An optional dataframe with the columns cellbarcode and saturation. Default: NULL | +| majorityConsensusThreshold | This applies to calculating a consensus call when multiple algorithms are used. Default: NULL | +| chemistry | This string is passed to EstimateMultipletRate. Should be either 10xV2 or 10xV3. Default: 10xV3 | +| callerDisagreementThreshold | If provided, the agreement rate will be calculated between each caller and the simple majority call, ignoring discordant and no-call cells. Default: NULL | +| preprocess_bff | When True, the data is preprocess using the method ProcessCountMatrix from CellHashR. Default: False | +| barcodeWhitelist | A vector of barcode names to retain. This parameter is used only when the pre-processing step is executed. Default: NULL | ### General Use #### Single sample use + The use of the pipeline for a single samples require the definition of certain parameters in order to run the tools under default configuration. The parameter `--mode hashing` must be included with the purpose of running the hashing tools only. + ##### GMM-Demux + The names of the hashtags must be given as a list of string, separated by ','. This list is given under the parameter `--hto_name_gmm` + ##### BFF -The demultiplexing method for the experiment must be given under the parameter `--methods`. Multiple methods can be given as a list, separated by ','. -Besides, the method or methods for consensus must be given under the parameter `--methodsForConsensus`. +The demultiplexing method for the experiment must be given under the parameter `--methods`. Multiple methods can be given as a list, separated by ','. +Besides, the method or methods for consensus must be given under the parameter `--methodsForConsensus`. ```bash -nextflow run main.nf --mode hashing --match_donor False --hto_matrix_raw /data_folder/raw_hto_data +nextflow run main.nf --mode hashing --match_donor False --hto_matrix_raw /data_folder/raw_hto_data --hto_matrix_filtered /data_folder/filtered_hto_data --barcodes /data_folder/filtered_hto_data/barcodes.tsv.gz --rna_matrix_raw /data_folder/raw_rna_data --rna_matrix_filtered /data_folder/filtered_rna_data --hto_name_gmm "hto_name_1,hto_name_2,hto_name_3" --methods bff_cluster --methodsForConsensus bff_cluster -``` \ No newline at end of file +``` From 731635e558df8b15129941fd73db0ed09d49ecb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Myl=C3=A8ne=20Mariana=20Gonzales=20Andr=C3=A9?= Date: Mon, 8 Jan 2024 09:08:30 -0400 Subject: [PATCH 32/32] resolved comments PR, multi_sample_input file in test_data --- bin/demuxem.py | 10 ---------- bin/summary_hash.py | 7 ++----- nextflow.config | 4 ++-- .../multi_sample_input.csv | 0 4 files changed, 4 insertions(+), 17 deletions(-) rename multi_sample_input.csv => test_data/multi_sample_input.csv (100%) diff --git a/bin/demuxem.py b/bin/demuxem.py index 814baef..4fac528 100755 --- a/bin/demuxem.py +++ b/bin/demuxem.py @@ -33,17 +33,7 @@ # load input rna data rna_data = sc.read_10x_mtx(args.rna_matrix_dir) hashing_data = sc.read_10x_mtx(args.hto_matrix_dir,gex_only=False) - #data.subset_data(modality_subset=['rna']) - #data.concat_data() # in case of multi-organism mixing data - # load input hashing data - #data.update(io.read_input(args.hto_matrix_dir, modality="hashing")) - # Extract rna and hashing data - #rna_data = data.get_data(modality="rna") - #hashing_data = data.get_data(modality="hashing") rna = args.rna_matrix_dir - print("-------------------") - print(rna) - print("-------------------") filter = "" if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']: filter = True diff --git a/bin/summary_hash.py b/bin/summary_hash.py index 4c18a49..fac8ea8 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -104,7 +104,6 @@ def hashsolo_summary(hashsolo_res, raw_adata, raw_mudata): mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") hashsolo_classi = obs_res[["most_likely_hypothesis"]] - #hashsolo_classi.loc[:, "most_likely_hypothesis"] = hashsolo_classi["most_likely_hypothesis"].replace({0.0: "negative", 1.0: "singlet", 2.0: "doublet"}) hashsolo_classi['most_likely_hypothesis'] = hashsolo_classi['most_likely_hypothesis'].apply(lambda x: 'negative' if x == 0.0 else ('singlet' if x == 1.0 else 'doublet')) hashsolo_classi.columns = [os.path.basename(x)] classi.append(hashsolo_classi) @@ -310,9 +309,7 @@ def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): gmm_dt = gmm_dt.rename(columns={"Unnamed: 0": "Barcode"}) #Create classification following the assignment found for the barcodes #we keep the original assigment and add a classification column - - ##Inner function - def classify_hash(row,number_hashes): + def _classify_hash(row,number_hashes): if row == 0: return 'negative' elif 0 > row <= number_hashes: @@ -320,7 +317,7 @@ def classify_hash(row,number_hashes): else: return 'doublet' - classification_dt['Classification'] = classification_dt['Cluster_id'].apply(lambda x: classify_hash(x, number_of_hashes)) + classification_dt['Classification'] = classification_dt['Cluster_id'].apply(lambda x: _classify_hash(x, number_of_hashes)) #Compare classification guide file with classification found merged = pd.merge(classification_dt, gmm_dt, on='Cluster_id', how='left') diff --git a/nextflow.config b/nextflow.config index f8834a8..ed91d0a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -126,7 +126,7 @@ params { filter_demuxem = "True" // gmm-demux - gmmDemux = "False" + gmmDemux = "True" hto_matrix_gmm_demux = "filtered" assignmentOutGmmDemux = "gmm_demux" hto_name_gmm = "None" @@ -138,7 +138,7 @@ params { ambiguous = 0.05 // bff - bff = "False" + bff = "True" rna_matrix_bff = "raw" hto_matrix_bff = "raw" assignmentOutBff = "bff" diff --git a/multi_sample_input.csv b/test_data/multi_sample_input.csv similarity index 100% rename from multi_sample_input.csv rename to test_data/multi_sample_input.csv