nf-core · olgabot · Apr 21, 2021 · Apr 21, 2021 · May 5, 2021 · May 5, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,6 +25,7 @@ barcode fastq
 * Add `--skip_compare option` to skip `sourmash_compare_sketches` process
 * Add merging of aligned/unaligned parts of single-cell data ([#117](https://github.com/nf-core/kmermaid/pull/117))
 * Add renamed package dependency orpheum (used to be known as sencha)
+* Add `--sketch_singleton` option to compute one sourmash signature per FASTA/FASTQ entry (passes `--singleton` to `sourmash compute`)
 
 ### `Fixed`
 

diff --git a/main.nf b/main.nf
@@ -90,6 +90,7 @@ def helpMessage() {
       --skip_compare                If provided, skip comparison of hashes using sourmash compare
       --skip_compute                If provided, skip computing of signatures using sourmash compute
       --skip_sig_merge              If provided, skip merging of aligned/unaligned signatures created from bam files or tenx tgz files
+      --sketch_singleton            If provided, compute one k-mer sketch per fasta entry, not for the whole file
 
      Sketch size options:
       --sketch_num_hashes           Number of hashes to use for making the sketches.
@@ -468,6 +469,7 @@ sketch_num_hashes = params.sketch_num_hashes
 sketch_num_hashes_log2 = params.sketch_num_hashes_log2
 sketch_scaled = params.sketch_scaled
 sketch_scaled_log2 = params.sketch_scaled_log2
+sketch_singleton = params.sketch_singleton
 have_sketch_value = params.sketch_num_hashes || params.sketch_num_hashes_log2 || params.sketch_scaled || params.sketch_scaled_log2
 
 if (!have_sketch_value && !params.split_kmer) {
@@ -582,6 +584,7 @@ summary['Skip multiqc?'] = params.skip_multiqc
 summary['K-mer sizes']            = params.ksizes
 summary['Molecule']               = params.molecules
 summary['Track Abundance']        = params.track_abundance
+summary['Singleton sketches?'] = params.sketch_singleton
 // -- Sketch size parameters --
 if (params.sketch_num_hashes) summary['Sketch Sizes']                  = params.sketch_num_hashes
 if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)']      = params.sketch_num_hashes_log2
@@ -1324,17 +1327,18 @@ if (!params.remove_ribo_rna) {
       )
       sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
       track_abundance_flag = track_abundance ? '--track-abundance' : ''
+      singleton_flag = sketch_singleton ? "--singleton" : "--name '${sample_id}'"
       sig_id = "${sample_id}__${sketch_id}"
       sig = "${sig_id}.sig"
       csv = "${sig_id}.csv"
       """
         sourmash compute \\
           ${sketch_value_flag} \\
+          ${singleton_flag} \\
           --ksizes ${params.ksizes} \\
           --dna \\
           $track_abundance_flag \\
           --output ${sig} \\
-          --name '${sample_id}' \\
           $reads
         sourmash sig describe --csv ${csv} ${sig}
       """
@@ -1408,16 +1412,17 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){
 
     sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
     track_abundance_flag = track_abundance ? '--track-abundance' : ''
+      singleton_flag = sketch_singleton ? "--singleton" : "--name '${sample_id}'"
     sig_id = "${sample_id}__${sketch_id}"
     sig = "${sig_id}.sig"
     csv = "${sig_id}.csv"
     """
       sourmash compute \\
         ${sketch_value_flag} \\
+        ${singleton_flag} \\
         --ksizes ${params.ksizes} \\
         --input-is-protein \\
         ${peptide_molecule_flags} \\
-        --name '${sample_id}' \\
         --no-dna \\
         $track_abundance_flag \\
         --output ${sig} \\

diff --git a/nextflow.config b/nextflow.config
@@ -26,6 +26,9 @@ params {
   tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)'
   tenx_min_umi_per_cell = 1000
 
+  // DNA sequence parsing
+  skip_trimming = false
+
   // Creating sketches
   molecules ='dna,protein,dayhoff'
   ksizes = '21,30,51'
@@ -36,6 +39,7 @@ params {
   sketch_num_hashes_log2 = false
   sketch_scaled = false
   sketch_scaled_log2 = false
+  sketch_singleton = false
   skip_sig_merge = false
 
   // Comparing sketches
@@ -44,8 +48,6 @@ params {
   // Computing sketches
   skip_compute = false
 
-  skip_trimming = false
-
   // translate options
   translate_peptide_ksize = 8
   translate_peptide_molecule = 'protein'

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -150,6 +150,11 @@
                     "type": "integer",
                     "fa_icon": "fas fa-barcode",
                     "description": "Integer value to subsample reads from input fastq files"
+                },
+                "sketch_singleton": {
+                    "type": "boolean",
+                    "description": "Compute one signature per entry in the FASTA file, which is useful when the file contains e.g. a transcript or genome per entry. This is not recommended for FASTQ files as it would compute one signature per read, and presumably one would want one signature per sequencing dataset",
+                    "fa_icon": "fas fa-dice-one"
                 }
             },
             "fa_icon": "fas fa-cogs"
@@ -187,6 +192,16 @@
                     "type": "integer",
                     "description": "Maximum table size for bloom filter creation",
                     "fa_icon": "fas fa-code-branch"
+                },
+                "save_translate_csv": {
+                    "type": "string",
+                    "description": "Path to save the coding scores as a csv",
+                    "default": "False"
+                },
+                "save_translate_json": {
+                    "type": "string",
+                    "description": "Path to save summarization of coding/noncoding/other categorizations, the min/max/mean/median/stddev of Jaccard scores, and other as a json",
+                    "default": "False"
                 }
             }
         },
@@ -484,17 +499,5 @@
         {
             "$ref": "#/definitions/generic_options"
         }
-    ],
-    "properties": {
-        "save_translate_csv": {
-            "type": "string",
-            "description": "Path to save the coding scores as a csv",
-            "default": "False"
-        },
-        "save_translate_json": {
-            "type": "string",
-            "description": "Path to save summarization of coding/\"     \"noncoding/other categorizations, the \"     \"min/max/mean/median/stddev of Jaccard scores, and other as a json",
-            "default": "False"
-        }
-    }
+    ]
 }