1.add line break 2.remove container setting 3. adapt souporcell testing
wxicu committed Nov 5, 2023
1 parent d8fb67d commit 2d5767b
Showing 9 changed files with 20 additions and 16 deletions.
13 changes: 10 additions & 3 deletions docs/source/general.md
Original file line number Diff line number Diff line change
@@ -119,7 +119,10 @@ nextflow run main.nf -profile standard,conda

### **Running on multiple samples**

The pipeline is able to run on multiple samples. In this scenario, the shared parameters for input data are retrieved from a sample sheet using `params.multi_sample`, which is set to None by default. Along with the input data, the sample sheet should contain an additional column for unique sample IDs assigned to each sample. The remaining parameters for each process are specified in the nextflow.config file, just like when demultiplexing a single sample. However, there is a distinction between running on a single sample and running on multiple samples. When processing multiple samples, the pipeline only permits a single value for each process parameter, whereas in the case of a single sample, multiple values separated by commas are allowed. The sample sheet (example file see the Resources section below) should have e.g. following columns depending on the methods you want to run:
The pipeline is able to run on multiple samples. In this scenario, the shared parameters for input data are retrieved from a sample sheet using `params.multi_sample`, which is set to None by default.
Along with the input data, the sample sheet should contain an additional column for unique sample IDs assigned to each sample. The remaining parameters for each process are specified in the nextflow.config file, just like when demultiplexing a single sample.
However, there is a distinction between running on a single sample and running on multiple samples. When processing multiple samples, the pipeline only permits a single value for each process parameter, whereas in the case of a single sample, multiple values separated by commas are allowed.
The sample sheet (see the Resources section below for an example file) should contain, for example, the following columns depending on the methods you want to run:

- sampleId
- na_matrix_raw
@@ -136,11 +139,15 @@ The pipeline is able to run on multiple samples. In this scenario, the shared pa
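For illustration, a minimal two-sample sheet could be sketched as below. The column names here are only an assumption based on the list above; the example file in the Resources section is authoritative.

```bash
# Hypothetical sample sheet for a two-sample run; the exact column set
# depends on which methods you enable (names and paths are illustrative)
cat > multi_sample.csv <<'EOF'
sampleId,rna_matrix_raw,rna_matrix_filtered
sample_A,data/A/raw_matrix,data/A/filtered_matrix
sample_B,data/B/raw_matrix,data/B/filtered_matrix
EOF

# The sheet is then passed via params.multi_sample, e.g. in nextflow.config:
#   multi_sample = "$projectDir/multi_sample.csv"
```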

### **scverse compatibility**

To ensure scverse compatibility, the pipeline provides the option to generate AnnData or MuData objects after demultiplexing specified by `params.generate_anndata` and `params.generate_mudata`. The objects contain the scRNA-seq counts from `params.rna_matrix_filered` and stores the assignment of each demultiplexing method in the `assignment` column of `obs`. Additionally, if `match_donor` is True, the pipeline also produces an AnnData object which contains the assignment of the best-matched method pair after donor matching.
To ensure scverse compatibility, the pipeline provides the option to generate AnnData or MuData objects after demultiplexing specified by `params.generate_anndata` and `params.generate_mudata`.
The objects contain the scRNA-seq counts from `params.rna_matrix_filtered` and store the assignment of each demultiplexing method in the `assignment` column of `obs`.
Additionally, if `match_donor` is True, the pipeline also produces an AnnData object which contains the assignment of the best-matched method pair after donor matching.

## **Pipeline output**

The output directory of the pipeline is set by `$params.outdir`. By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `"$projectDir/$params.outdir/$sampleId/$params.mode`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on.
The output directory of the pipeline is set by `$params.outdir`.
By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`.
When running the pipeline on multiple samples, the pipeline output will be found in the folder `$projectDir/$params.outdir/$sampleId/$params.mode`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on.
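As a quick sketch of how those two layouts resolve (the values below are placeholders, not pipeline defaults):

```bash
# Placeholder values standing in for the Nextflow params of the same name
projectDir=/home/user/pipeline
outdir=result
sampleId=sample_A
mode=genetic

# Single-sample layout vs. multi-sample layout
echo "$projectDir/$outdir/$mode"
echo "$projectDir/$outdir/$sampleId/$mode"
```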

The demultiplexing workflow saves its output in `$pipeline_output_folder/[gene/hash]_demulti`. The pipeline will also generate some TSV files to summarize the results in the folder `[gene/hash]_summary` under this directory.
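The summary TSVs make it easy to compare methods with standard command-line tools. The file below is a mock in the general shape described above; the real column names depend on the methods you ran, so this layout is an assumption.

```bash
# Mock hashing summary TSV (column names and barcodes are illustrative)
printf 'Barcode\thtodemux\tmultiseq\n'  > hash_summary_mock.tsv
printf 'AAACCTG-1\tdonor1\tdonor1\n'   >> hash_summary_mock.tsv
printf 'AAAGATG-1\tdonor2\tdonor1\n'   >> hash_summary_mock.tsv
printf 'AAATGCC-1\tdonor1\tdonor1\n'   >> hash_summary_mock.tsv

# Count barcodes on which the two methods agree
awk -F'\t' 'NR > 1 && $2 == $3 { n++ } END { print n }' hash_summary_mock.tsv
```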

1 change: 1 addition & 0 deletions docs/source/genetic.md
@@ -317,6 +317,7 @@ output directory: `$pipeline_output_folder/souporcell/souporcell_[task_ID/sample
| max_loci | Max loci per cell, affects speed. Default: 2048 |
| restarts | Number of restarts in clustering, when there are > 12 clusters we recommend increasing this to avoid local minima. Default: None |
| common_variants_souporcell | Common variant loci or known variant loci VCF, must be called against the same reference fasta. |
| use_known_genotype | Whether to use known donor genotype. Default: True |
| vcf_donor | Known variants per clone in population vcf mode, must be VCF file. |
| known_genotypes_sample_names | Which samples in population vcf from known genotypes option represent the donors in your sample. Default: None |
| skip_remap | Don't remap with minimap2; not recommended unless used in conjunction with common variants. Default: True |
8 changes: 5 additions & 3 deletions docs/source/index.md
@@ -49,18 +49,20 @@ You can also:

```bash
sh test_data/download_data.sh
nextflow run main.nf -profile test,conda
nextflow run main.nf -profile test
```

## **Pipeline output**

By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `"$projectDir/$params.outdir/$sampleId/$params.mode/`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on.
By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/$params.mode`.
When running the pipeline on multiple samples, the pipeline output will be found in the folder `$projectDir/$params.outdir/$sampleId/$params.mode/`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on.

### **Intermediate output**

The pipeline saves the output of each process for the two workflows separately, so you will find the results of the hashing-based and genetics-based deconvolution methods in the folders `hash_demulti` and `gene_demulti`, respectively.

If the pipeline is run on single sample, each demultiplexing process will generate some intermediate files in the folder in the format `$pipeline_output_folder/[method]/[method]_[task_ID]`, e.g. `htodemux/htodemux_1`. If the pipeline is run on multiple samples, the `task_ID` will be replaced by `sampleId`. In the folder, you can find following files:
If the pipeline is run on a single sample, each demultiplexing process will generate some intermediate files in the folder in the format `$pipeline_output_folder/[method]/[method]_[task_ID]`, e.g. `htodemux/htodemux_1`.
If the pipeline is run on multiple samples, the `task_ID` will be replaced by `sampleId`. In the folder, you can find the following files:

- `params.csv`: specified parameters in the task
- Output of the task, check [](output) for more details.
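A quick way to review which parameters each task ran with is to walk the per-task folders described above. The folder names follow the `htodemux/htodemux_1` example from the text; the contents of `params.csv` below (header and parameter names) are an assumption for illustration.

```bash
# Recreate the layout described above with one mock task folder
mkdir -p htodemux/htodemux_1
cat > htodemux/htodemux_1/params.csv <<'EOF'
Argument,Value
assay,HTO
positive_quantile,0.99
EOF

# List every task's parameter file under the method folder
for f in htodemux/htodemux_*/params.csv; do
  echo "== $f"
  cat "$f"
done
```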
3 changes: 2 additions & 1 deletion docs/source/rescue.md
@@ -21,7 +21,8 @@ The joint call of hashing and genetic deconvolution methods has been shown to be

## **Output**

By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/rescue`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `"$projectDir/$params.outdir/$sampleId/rescue`. To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on.
By default, the pipeline is run on a single sample. In this case, all pipeline output will be saved in the folder `$projectDir/$params.outdir/rescue`. When running the pipeline on multiple samples, the pipeline output will be found in the folder `$projectDir/$params.outdir/$sampleId/rescue`.
To simplify this, we'll refer to this folder as `$pipeline_output_folder` from now on.

In rescue mode, the genotype- and hashing-based demultiplexing workflows run in parallel. They save their output in `$pipeline_output_folder/[gene/hash]_demulti`. Before running the donor-matching process, the pipeline merges the results of the two workflows into `classification_all_genetic_and_hash.csv` and `assignment_all_genetic_and_hash.csv` in the `$pipeline_output_folder/summary` folder.
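Conceptually, that merge is an inner join of the two per-workflow assignment tables on the cell barcode. A minimal sketch with mock inputs (file names, column names, and barcodes below are stand-ins, not the pipeline's actual intermediate files):

```bash
# Mock per-workflow assignment tables (contents are illustrative)
printf 'Barcode,htodemux\nAAACCTG-1,donor1\nAAAGATG-1,donor2\n' > hash_assign.csv
printf 'Barcode,vireo\nAAACCTG-1,donor1\nAAAGATG-1,donor1\n'    > gene_assign.csv

# Inner-join the two tables on the barcode column
awk -F, 'NR == FNR { hash[$1] = $2; next }
         $1 in hash { print $1 "," hash[$1] "," $2 }' \
    hash_assign.csv gene_assign.csv > merged.csv

cat merged.csv
```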

1 change: 0 additions & 1 deletion modules/gene_demulti/bcftools.nf
@@ -5,7 +5,6 @@ process bcftools{
label 'big_mem'

conda "bioconda::bcftools=1.9"
container "biocontainers/bcftools"

input:
tuple val(sampleId), val(vcf_list)
1 change: 0 additions & 1 deletion modules/gene_demulti/freebayes.nf
@@ -6,7 +6,6 @@ process freebayes{
label 'big_mem'

conda "bioconda::freebayes=1.2"
container "biocontainers/freebayes"

input:
tuple val(sampleId), path(bam_freebayes), path(bai_freebayes)
1 change: 0 additions & 1 deletion modules/single/gene_demulti/bcftools.nf
@@ -5,7 +5,6 @@ process bcftools{
label 'big_mem'

conda "bioconda::bcftools=1.9"
container "biocontainers/bcftools"

input:
val vcf
1 change: 0 additions & 1 deletion modules/single/gene_demulti/freebayes.nf
@@ -6,7 +6,6 @@ process freebayes{
label 'big_mem'

conda "bioconda::freebayes=1.2"
container "biocontainers/freebayes"

input:
path bam_freebayes
7 changes: 2 additions & 5 deletions test.config
@@ -1,13 +1,11 @@
params {
outdir = "result_test_new"
outdir = "result_test"
// input for hashing-based deconvolution
hto_matrix_raw = "$projectDir/test_data/hto"
hto_matrix_filtered = "$projectDir/test_data/hto"
rna_matrix_raw = "$projectDir/test_data/rna"
rna_matrix_filtered = "$projectDir/test_data/rna"

souporcell = "False"

// input for genotype-based deconvolution
bam = "$projectDir/test_data/jurkat_293t_downsampled_n500_full_bam.bam"
bai = "$projectDir/test_data/jurkat_293t_downsampled_n500_full_bam.bam.bai"
@@ -20,11 +18,10 @@ params {
common_variants_freemuxlet = "$projectDir/test_data/jurkat_293t_exons_only.vcf.withAF.vcf.gz"
common_variants_cellsnp = "$projectDir/test_data/genome1K.phase3.SNP_AF5e2.chr1toX.hg19.vcf.gz"
vcf_donor = "$projectDir/test_data/jurkat_293t_exons_only.vcf.withAF.vcf"

// Call freebayes on chr 1 and chr 2 only to speed up run time
region = "1;2"

    // donor genotype file provided by popscle doesn't work on souporcell
use_known_genotype = "False"
ignore = "True"

}
