Skip to content

Commit

Permalink
ingest: Add optional Nextclade rules
Browse files Browse the repository at this point in the history
Add rules for running Nextclade as a part of the ingest workflow. These
rules are optional because not every pathogen has a Nextclade dataset,
which is required to run Nextclade as a part of ingest.
  • Loading branch information
joverlee521 committed Oct 10, 2023
1 parent 3493a93 commit 6c4b21d
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 0 deletions.
3 changes: 3 additions & 0 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ configfile: "config/defaults.yaml"

include: "rules/fetch_from_ncbi.smk"
include: "rules/curate.smk"

# The Nextclade rules are optional: they are only loaded when the config
# contains a "nextclade" section, because not every pathogen has a Nextclade
# dataset available.
if "nextclade" in config:

    include: "rules/nextclade.smk"
9 changes: 9 additions & 0 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,12 @@ curate:
output_sequence_field: ""
# The list of metadata columns to keep in the final output of the curation pipeline.
metadata_columns: []


# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
# Note that this requires a Nextclade dataset to already exist for your pathogen.
# Remove the following parameters if you do not plan to run Nextclade.
nextclade:
  # The name of the Nextclade dataset to use for running Nextclade.
  # Run `nextclade dataset list` to get a full list of available Nextclade datasets.
  dataset_name: ""
50 changes: 50 additions & 0 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
"""
# Name of the Nextclade dataset, read from the `nextclade.dataset_name` config
# value. It is passed as `--name` to `nextclade dataset get` and is also used
# as the file name of the downloaded dataset zip under data/nextclade_data/.
DATASET_NAME = config["nextclade"]["dataset_name"]


rule get_nextclade_dataset:
    """
    Download the Nextclade dataset named in the config as a zip archive.

    The dataset name (DATASET_NAME) is used both as the `--name` argument and
    as the file name of the downloaded archive. Because both values derive
    from the user-supplied config, both are shell-quoted with `:q`.
    """
    output:
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
    params:
        dataset_name=DATASET_NAME,
    shell:
        """
        nextclade dataset get \
            --name={params.dataset_name:q} \
            --output-zip={output.dataset:q} \
            --verbose
        """


rule run_nextclade:
    """
    Run Nextclade on the sequences FASTA using the downloaded dataset zip.

    Writes the Nextclade TSV and the alignment FASTA, then zips the contents
    of `results/translations` (where the translation FASTAs are written via
    the `{gene}` path template) into a single archive.

    All paths are shell-quoted with `:q`; this also guarantees the literal
    `{gene}` placeholder reaches Nextclade untouched by the shell.

    NOTE(review): `results/translations` is not a declared output of this
    rule, so files left there by an earlier run would be zipped as well.
    Confirm whether the directory should be cleaned before running.
    """
    input:
        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
        sequences="results/sequences.fasta",
    output:
        nextclade="results/nextclade.tsv",
        alignment="results/alignment.fasta",
        translations="results/translations.zip",
    params:
        # The lambda is used to deactivate automatic wildcard expansion.
        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
        translations=lambda w: "results/translations/{gene}.fasta",
    shell:
        """
        nextclade run \
            {input.sequences:q} \
            --input-dataset {input.dataset:q} \
            --output-tsv {output.nextclade:q} \
            --output-fasta {output.alignment:q} \
            --output-translations {params.translations:q}

        zip -rj {output.translations:q} results/translations
        """

0 comments on commit 6c4b21d

Please sign in to comment.