From 2aee8186c242d78288604d72fdf65e924f6dfe5b Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 21 Apr 2021 13:48:14 -0700 Subject: [PATCH 1/5] Add --singleton option for sourmash --- main.nf | 5 +++++ nextflow.config | 1 + 2 files changed, 6 insertions(+) diff --git a/main.nf b/main.nf index 94756edf..03315c80 100644 --- a/main.nf +++ b/main.nf @@ -468,6 +468,7 @@ sketch_num_hashes = params.sketch_num_hashes sketch_num_hashes_log2 = params.sketch_num_hashes_log2 sketch_scaled = params.sketch_scaled sketch_scaled_log2 = params.sketch_scaled_log2 +sketch_singleton = params.sketch_singleton have_sketch_value = params.sketch_num_hashes || params.sketch_num_hashes_log2 || params.sketch_scaled || params.sketch_scaled_log2 if (!have_sketch_value && !params.split_kmer) { @@ -1324,12 +1325,14 @@ if (!params.remove_ribo_rna) { ) sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' + singleton_flag = sketch_singleton ? "--singleton" : '' sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" """ sourmash compute \\ ${sketch_value_flag} \\ + ${singleton_flag} \\ --ksizes ${params.ksizes} \\ --dna \\ $track_abundance_flag \\ @@ -1408,12 +1411,14 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' + singleton_flag = sketch_singleton ? 
"--singleton" : '' sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" """ sourmash compute \\ ${sketch_value_flag} \\ + ${singleton_flag} \\ --ksizes ${params.ksizes} \\ --input-is-protein \\ ${peptide_molecule_flags} \\ diff --git a/nextflow.config b/nextflow.config index 589d186f..ffc576bf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -36,6 +36,7 @@ params { sketch_num_hashes_log2 = false sketch_scaled = false sketch_scaled_log2 = false + sketch_singleton = false skip_sig_merge = false // Comparing sketches From 7b2f231a045d067a083272cea5d0e44932329eee Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 21 Apr 2021 14:35:12 -0700 Subject: [PATCH 2/5] Can't have --name if singleton --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 03315c80..05b9349a 100644 --- a/main.nf +++ b/main.nf @@ -90,6 +90,7 @@ def helpMessage() { --skip_compare If provided, skip comparison of hashes using sourmash compare --skip_compute If provided, skip computing of signatures using sourmash compute --skip_sig_merge If provided, skip merging of aligned/unaligned signatures created from bam files or tenx tgz files + --sketch_singleton If provided, compute one k-mer sketch per fasta entry, not for the whole file Sketch size options: --sketch_num_hashes Number of hashes to use for making the sketches. 
@@ -583,6 +584,7 @@ summary['Skip multiqc?'] = params.skip_multiqc summary['K-mer sizes'] = params.ksizes summary['Molecule'] = params.molecules summary['Track Abundance'] = params.track_abundance +summary['Singleton sketches?'] = params.sketch_singleton // -- Sketch size parameters -- if (params.sketch_num_hashes) summary['Sketch Sizes'] = params.sketch_num_hashes if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)'] = params.sketch_num_hashes_log2 @@ -1325,7 +1327,7 @@ if (!params.remove_ribo_rna) { ) sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' - singleton_flag = sketch_singleton ? "--singleton" : '' + singleton_flag = sketch_singleton ? "--singleton" : "--name '${sample_id}'" sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" @@ -1337,7 +1339,6 @@ if (!params.remove_ribo_rna) { --dna \\ $track_abundance_flag \\ --output ${sig} \\ - --name '${sample_id}' \\ $reads sourmash sig describe --csv ${csv} ${sig} """ @@ -1411,7 +1412,7 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0]) track_abundance_flag = track_abundance ? '--track-abundance' : '' - singleton_flag = sketch_singleton ? '' + singleton_flag = sketch_singleton ? 
"--singleton" : "--name '${sample_id}'" sig_id = "${sample_id}__${sketch_id}" sig = "${sig_id}.sig" csv = "${sig_id}.csv" @@ -1422,7 +1423,6 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){ --ksizes ${params.ksizes} \\ --input-is-protein \\ ${peptide_molecule_flags} \\ - --name '${sample_id}' \\ --no-dna \\ $track_abundance_flag \\ --output ${sig} \\ From 9eaf06b084994fd755addec456cf6c59f860822c Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 5 May 2021 08:42:40 -0700 Subject: [PATCH 3/5] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a977cf1b..bbfedb75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ barcode fastq * Add `--skip_compare option` to skip `sourmash_compare_sketches` process * Add merging of aligned/unaligned parts of single-cell data ([#117](https://github.com/nf-core/kmermaid/pull/117)) * Add renamed package dependency orpheum (used to be known as sencha) +* Add `--sketch_singleton` option, which passes `--singleton` to sourmash to compute one signature per FASTA/FASTQ entry ### `Fixed` From 9b9948f3ed709f899393b26dcfabfdbe0c49b1ba Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Wed, 5 May 2021 08:47:35 -0700 Subject: [PATCH 4/5] update schema --- nextflow_schema.json | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index a279fa64..b7f8ec88 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -150,6 +150,11 @@ "type": "integer", "fa_icon": "fas fa-barcode", "description": "Integer value to subsample reads from input fastq files" + }, + "sketch_singleton": { + "type": "string", + "description": "Compute one signature per entry in the FASTA file, which is useful when the file contains e.g. a transcript or genome per entry. 
This is not recommended for FASTQ files as it would compute one signature per read, and presumably one would want one signature per sequencing dataset", + "fa_icon": "fas fa-dice-one" } }, "fa_icon": "fas fa-cogs" @@ -187,6 +192,16 @@ "type": "integer", "description": "Maximum table size for bloom filter creation", "fa_icon": "fas fa-code-branch" + }, + "save_translate_csv": { + "type": "string", + "description": "Path to save the coding scores as a csv", + "default": "False" + }, + "save_translate_json": { + "type": "string", + "description": "Path to save summarization of coding/noncoding/other categorizations, the min/max/mean/median/stddev of Jaccard scores, and other as a json", + "default": "False" } } }, @@ -484,17 +499,5 @@ { "$ref": "#/definitions/generic_options" } - ], - "properties": { - "save_translate_csv": { - "type": "string", - "description": "Path to save the coding scores as a csv", - "default": "False" - }, - "save_translate_json": { - "type": "string", - "description": "Path to save summarization of coding/\" \"noncoding/other categorizations, the \" \"min/max/mean/median/stddev of Jaccard scores, and other as a json", - "default": "False" - } - } + ] } \ No newline at end of file From 0512fc9bfdd5a4365591920729c4af5ad1fe5462 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Thu, 22 Jul 2021 09:52:41 -0700 Subject: [PATCH 5/5] Move skip_trimming parameter in nextflow config --- nextflow.config | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index ffc576bf..6e775a21 100644 --- a/nextflow.config +++ b/nextflow.config @@ -26,6 +26,9 @@ params { tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)' tenx_min_umi_per_cell = 1000 + // DNA sequence parsing + skip_trimming = false + // Creating sketches molecules ='dna,protein,dayhoff' ksizes = '21,30,51' @@ -45,8 +48,6 @@ params { // Computing sketches skip_compute = false - skip_trimming = false - // translate options 
translate_peptide_ksize = 8 translate_peptide_molecule = 'protein'