Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --singleton option for sourmash #140

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ barcode fastq
* Add `--skip_compare` option to skip `sourmash_compare_sketches` process
* Add merging of aligned/unaligned parts of single-cell data ([#117](https://github.com/nf-core/kmermaid/pull/117))
* Add renamed package dependency orpheum (used to be known as sencha)
* Add `--sketch_singleton` option, which passes `--singleton` to `sourmash compute` to compute one signature per FASTA/FASTQ entry instead of one per file

### `Fixed`

Expand Down
9 changes: 7 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def helpMessage() {
--skip_compare If provided, skip comparison of hashes using sourmash compare
--skip_compute If provided, skip computing of signatures using sourmash compute
--skip_sig_merge If provided, skip merging of aligned/unaligned signatures created from bam files or tenx tgz files
--sketch_singleton If provided, compute one k-mer sketch per fasta entry, not for the whole file

Sketch size options:
--sketch_num_hashes Number of hashes to use for making the sketches.
Expand Down Expand Up @@ -468,6 +469,7 @@ sketch_num_hashes = params.sketch_num_hashes
sketch_num_hashes_log2 = params.sketch_num_hashes_log2
sketch_scaled = params.sketch_scaled
sketch_scaled_log2 = params.sketch_scaled_log2
sketch_singleton = params.sketch_singleton
have_sketch_value = params.sketch_num_hashes || params.sketch_num_hashes_log2 || params.sketch_scaled || params.sketch_scaled_log2

if (!have_sketch_value && !params.split_kmer) {
Expand Down Expand Up @@ -582,6 +584,7 @@ summary['Skip multiqc?'] = params.skip_multiqc
summary['K-mer sizes'] = params.ksizes
summary['Molecule'] = params.molecules
summary['Track Abundance'] = params.track_abundance
summary['Singleton sketches?'] = params.sketch_singleton
// -- Sketch size parameters --
if (params.sketch_num_hashes) summary['Sketch Sizes'] = params.sketch_num_hashes
if (params.sketch_num_hashes_log2) summary['Sketch Sizes (log2)'] = params.sketch_num_hashes_log2
Expand Down Expand Up @@ -1324,17 +1327,18 @@ if (!params.remove_ribo_rna) {
)
sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
track_abundance_flag = track_abundance ? '--track-abundance' : ''
singleton_flag = sketch_singleton ? "--singleton" : "--name '${sample_id}'"
sig_id = "${sample_id}__${sketch_id}"
sig = "${sig_id}.sig"
csv = "${sig_id}.csv"
"""
sourmash compute \\
${sketch_value_flag} \\
${singleton_flag} \\
--ksizes ${params.ksizes} \\
--dna \\
$track_abundance_flag \\
--output ${sig} \\
--name '${sample_id}' \\
$reads
sourmash sig describe --csv ${csv} ${sig}
"""
Expand Down Expand Up @@ -1408,16 +1412,17 @@ if (!params.skip_compute && (protein_input || params.reference_proteome_fasta)){

sketch_value_flag = make_sketch_value_flag(sketch_style_parsed[0], sketch_value_parsed[0])
track_abundance_flag = track_abundance ? '--track-abundance' : ''
singleton_flag = sketch_singleton ? "--singleton" : "--name '${sample_id}'"
sig_id = "${sample_id}__${sketch_id}"
sig = "${sig_id}.sig"
csv = "${sig_id}.csv"
"""
sourmash compute \\
${sketch_value_flag} \\
${singleton_flag} \\
--ksizes ${params.ksizes} \\
--input-is-protein \\
${peptide_molecule_flags} \\
--name '${sample_id}' \\
--no-dna \\
$track_abundance_flag \\
--output ${sig} \\
Expand Down
6 changes: 4 additions & 2 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ params {
tenx_molecular_barcode_pattern = '(UB|XB|XM):Z:([ACGT]+)'
tenx_min_umi_per_cell = 1000

// DNA sequence parsing
skip_trimming = false

// Creating sketches
molecules ='dna,protein,dayhoff'
ksizes = '21,30,51'
Expand All @@ -36,6 +39,7 @@ params {
sketch_num_hashes_log2 = false
sketch_scaled = false
sketch_scaled_log2 = false
sketch_singleton = false
skip_sig_merge = false

// Comparing sketches
Expand All @@ -44,8 +48,6 @@ params {
// Computing sketches
skip_compute = false

skip_trimming = false

// translate options
translate_peptide_ksize = 8
translate_peptide_molecule = 'protein'
Expand Down
29 changes: 16 additions & 13 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,11 @@
"type": "integer",
"fa_icon": "fas fa-barcode",
"description": "Integer value to subsample reads from input fastq files"
},
"sketch_singleton": {
"type": "string",
"description": "Compute one signature per entry in the FASTA file, which is useful when the file contains e.g. a transcript or genome per entry. This is not recommended for FASTQ files as it would compute one signature per read, and presumably one would want one signature per sequencing dataset",
"fa_icon": "fas fa-dice-one"
}
},
"fa_icon": "fas fa-cogs"
Expand Down Expand Up @@ -187,6 +192,16 @@
"type": "integer",
"description": "Maximum table size for bloom filter creation",
"fa_icon": "fas fa-code-branch"
},
"save_translate_csv": {
"type": "string",
"description": "Path to save the coding scores as a csv",
"default": "False"
},
"save_translate_json": {
"type": "string",
"description": "Path to save summarization of coding/\" \"noncoding/other categorizations, the \" \"min/max/mean/median/stddev of Jaccard scores, and other as a json",
"default": "False"
}
}
},
Expand Down Expand Up @@ -484,17 +499,5 @@
{
"$ref": "#/definitions/generic_options"
}
],
"properties": {
"save_translate_csv": {
"type": "string",
"description": "Path to save the coding scores as a csv",
"default": "False"
},
"save_translate_json": {
"type": "string",
"description": "Path to save summarization of coding/\" \"noncoding/other categorizations, the \" \"min/max/mean/median/stddev of Jaccard scores, and other as a json",
"default": "False"
}
}
]
}