From 2d5767bb446e8d8cc6097f796611400df45aa4f2 Mon Sep 17 00:00:00 2001 From: Xichen Wu Date: Sun, 5 Nov 2023 01:36:20 +0100 Subject: [PATCH] 1.add line break 2.remove container setting 3. adapt souporcell testing --- docs/source/general.md | 13 ++++++++++--- docs/source/genetic.md | 1 + docs/source/index.md | 8 +++++--- docs/source/rescue.md | 3 ++- modules/gene_demulti/bcftools.nf | 1 - modules/gene_demulti/freebayes.nf | 1 - modules/single/gene_demulti/bcftools.nf | 1 - modules/single/gene_demulti/freebayes.nf | 1 - test.config | 7 ++----- 9 files changed, 20 insertions(+), 16 deletions(-) diff --git a/docs/source/general.md b/docs/source/general.md index f80e280..3ee2973 100644 --- a/docs/source/general.md +++ b/docs/source/general.md @@ -119,7 +119,10 @@ nextflow run main.nf -profile standard,conda ### **Running on multiple samples** -The pipeline is able to run on multiple samples. In this scenario, the shared parameters for input data are retrieved from a sample sheet using `params.multi_sample`, which is set to None by default. Along with the input data, the sample sheet should contain an additional column for unique sample IDs assigned to each sample. The remaining parameters for each process are specified in the nextflow.config file, just like when demultiplexing a single sample. However, there is a distinction between running on a single sample and running on multiple samples. When processing multiple samples, the pipeline only permits a single value for each process parameter, whereas in the case of a single sample, multiple values separated by commas are allowed. The sample sheet (example file see the Resources section below) should have e.g. following columns depending on the methods you want to run: +The pipeline is able to run on multiple samples. In this scenario, the shared parameters for input data are retrieved from a sample sheet using `params.multi_sample`, which is set to None by default. 
+Along with the input data, the sample sheet should contain an additional column for unique sample IDs assigned to each sample. The remaining parameters for each process are specified in the nextflow.config file, just like when demultiplexing a single sample. +However, there is a distinction between running on a single sample and running on multiple samples. When processing multiple samples, the pipeline only permits a single value for each process parameter, whereas in the case of a single sample, multiple values separated by commas are allowed. +The sample sheet (for an example file, see the Resources section below) should have, e.g., the following columns depending on the methods you want to run: - sampleId - na_matrix_raw @@ -136,11 +139,15 @@ The pipeline is able to run on multiple samples. In this scenario, the shared pa ### **scverse compatibility** -To ensure scverse compatibility, the pipeline provides the option to generate AnnData or MuData objects after demultiplexing specified by `params.generate_anndata` and `params.generate_mudata`. The objects contain the scRNA-seq counts from `params.rna_matrix_filered` and stores the assignment of each demultiplexing method in the `assignment` column of `obs`. Additionally, if `match_donor` is True, the pipeline also produces an AnnData object which contains the assignment of the best-matched method pair after donor matching. +To ensure scverse compatibility, the pipeline provides the option to generate AnnData or MuData objects after demultiplexing specified by `params.generate_anndata` and `params.generate_mudata`. +The objects contain the scRNA-seq counts from `params.rna_matrix_filtered` and store the assignment of each demultiplexing method in the `assignment` column of `obs`. +Additionally, if `match_donor` is True, the pipeline also produces an AnnData object which contains the assignment of the best-matched method pair after donor matching. 
## **Pipeline output** -The output directory of the pipeline is set by `$params.outdir`. By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `"$projectDir/$params.outdir/$sampleId/$params.mode`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on. +The output directory of the pipeline is set by `$params.outdir`. +By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`. +When running the pipeline on multiple samples, the pipeline output will be found in the folder `$projectDir/$params.outdir/$sampleId/$params.mode`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on. The demultiplexing workflow saves its output in `$pipeline_output_folder/[gene/hash]_demulti`. The pipeline will also generate some TSV files to summarize the results in the folder `[gene/hash]_summary` under this directory. diff --git a/docs/source/genetic.md b/docs/source/genetic.md index 619b3a3..33ab3b9 100644 --- a/docs/source/genetic.md +++ b/docs/source/genetic.md @@ -317,6 +317,7 @@ output directory: `$pipeline_output_folder/souporcell/souporcell_[task_ID/sample | max_loci | Max loci per cell, affects speed. Default: 2048 | | restarts | Number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima. Default: None | | common_variants_souporcell | Common variant loci or known variant loci vcf, must be vs same reference fasta. | +| use_known_genotype | Whether to use known donor genotype. Default: True | | vcf_donor | Known variants per clone in population vcf mode, must be VCF file. 
| | known_genotypes_sample_names | Which samples in population vcf from known genotypes option represent the donors in your sample. Default: None | | skip_remap | Don't remap with minimap2, not recommended unless in conjunction with comman variants. Default: True | diff --git a/docs/source/index.md b/docs/source/index.md index 3385fce..6a55207 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -49,18 +49,20 @@ You can also: ```bash sh test_data/download_data.sh -nextflow run main.nf -profile test,conda +nextflow run main.nf -profile test ``` ## **Pipeline output** -By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `"$projectDir/$params.outdir/$sampleId/$params.mode/`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on. +By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`. +When running the pipeline on multiple samples, the pipeline output will be found in the folder `$projectDir/$params.outdir/$sampleId/$params.mode/`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on. ### **Intermediate output** The pipeline saves the output of each process for two workflows separately, so you will find the results of hashing-based and genetics-based deconvolution methods in the folder `hash_demulti` and `gene_demulti` respectively. -If the pipeline is run on single sample, each demultiplexing process will generate some intermediate files in the folder in the format `$pipeline_output_folder/[method]/[method]_[task_ID]`, e.g. `htodemux/htodemux_1`. If the pipeline is run on multiple samples, the `task_ID` will be replaced by `sampleId`. 
In the folder, you can find following files: +If the pipeline is run on a single sample, each demultiplexing process will generate some intermediate files in the folder in the format `$pipeline_output_folder/[method]/[method]_[task_ID]`, e.g. `htodemux/htodemux_1`. +If the pipeline is run on multiple samples, the `task_ID` will be replaced by `sampleId`. In the folder, you can find the following files: - `params.csv`: specified parameters in the task - Output of the task, check [](output) for more details. diff --git a/docs/source/rescue.md b/docs/source/rescue.md index 2a92bec..356bcdd 100644 --- a/docs/source/rescue.md +++ b/docs/source/rescue.md @@ -21,7 +21,8 @@ The joint call of hashing and genetic deconvolution methods has been shown to be ## **Output** -By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/rescue`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `"$projectDir/$params.outdir/$sampleId/rescue`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on. +By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/rescue`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `$projectDir/$params.outdir/$sampleId/rescue`. +To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on. In rescue mode, the genotype- and hashing-based demultiplexing workflow run in parallel. They save their output in `$pipeline_output_folder/[gene/hash]_demulti`. Before running the donor-matching preocess, the pipeline merges the results of two workflows into `classification_all_genetic_and_hash.csv` and `assignment_all_genetic_and_hash.csv` in the `$pipeline_output_folder/summary` folder. 
diff --git a/modules/gene_demulti/bcftools.nf b/modules/gene_demulti/bcftools.nf index 979466c..5de6b67 100644 --- a/modules/gene_demulti/bcftools.nf +++ b/modules/gene_demulti/bcftools.nf @@ -5,7 +5,6 @@ process bcftools{ label 'big_mem' conda "bioconda::bcftools=1.9" - container "biocontainers/bcftools" input: tuple val(sampleId), val(vcf_list) diff --git a/modules/gene_demulti/freebayes.nf b/modules/gene_demulti/freebayes.nf index e74879a..fc85d92 100644 --- a/modules/gene_demulti/freebayes.nf +++ b/modules/gene_demulti/freebayes.nf @@ -6,7 +6,6 @@ process freebayes{ label 'big_mem' conda "bioconda::freebayes=1.2" - container "biocontainers/freebayes" input: tuple val(sampleId), path(bam_freebayes), path(bai_freebayes) diff --git a/modules/single/gene_demulti/bcftools.nf b/modules/single/gene_demulti/bcftools.nf index 8793780..d53efe7 100644 --- a/modules/single/gene_demulti/bcftools.nf +++ b/modules/single/gene_demulti/bcftools.nf @@ -5,7 +5,6 @@ process bcftools{ label 'big_mem' conda "bioconda::bcftools=1.9" - container "biocontainers/bcftools" input: val vcf diff --git a/modules/single/gene_demulti/freebayes.nf b/modules/single/gene_demulti/freebayes.nf index 8397539..930448c 100644 --- a/modules/single/gene_demulti/freebayes.nf +++ b/modules/single/gene_demulti/freebayes.nf @@ -6,7 +6,6 @@ process freebayes{ label 'big_mem' conda "bioconda::freebayes=1.2" - container "biocontainers/freebayes" input: path bam_freebayes diff --git a/test.config b/test.config index ad09f4d..d771617 100644 --- a/test.config +++ b/test.config @@ -1,13 +1,11 @@ params { - outdir = "result_test_new" + outdir = "result_test" // input for hashing-based deconvolution hto_matrix_raw = "$projectDir/test_data/hto" hto_matrix_filtered = "$projectDir/test_data/hto" rna_matrix_raw = "$projectDir/test_data/rna" rna_matrix_filtered = "$projectDir/test_data/rna" - souporcell = "False" - // input for genotype-based deconvolution bam = 
"$projectDir/test_data/jurkat_293t_downsampled_n500_full_bam.bam" bai = "$projectDir/test_data/jurkat_293t_downsampled_n500_full_bam.bam.bai" @@ -20,11 +18,10 @@ params { common_variants_freemuxlet = "$projectDir/test_data/jurkat_293t_exons_only.vcf.withAF.vcf.gz" common_variants_cellsnp = "$projectDir/test_data/genome1K.phase3.SNP_AF5e2.chr1toX.hg19.vcf.gz" vcf_donor = "$projectDir/test_data/jurkat_293t_exons_only.vcf.withAF.vcf" - // Call freebayes on chr 1 and chr 2 only to speed up run time region = "1;2" - // donor genotype file provided by popscle doesnt work on souporcell use_known_genotype = "False" + ignore = "True" }