From 6c4b21d36054eaef700d8ae8d560338c29ec7ef3 Mon Sep 17 00:00:00 2001
From: Jover
Date: Fri, 6 Oct 2023 16:43:23 -0700
Subject: [PATCH] ingest: Add optional Nextclade rules

Add rules for running Nextclade as a part of the ingest workflow.
These rules are optional because not every pathogen will have a
Nextclade dataset to be able to run Nextclade as a part of ingest.
---
 ingest/Snakefile            |  4 +++
 ingest/config/defaults.yaml |  9 +++++++
 ingest/rules/nextclade.smk  | 52 +++++++++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+)
 create mode 100644 ingest/rules/nextclade.smk

diff --git a/ingest/Snakefile b/ingest/Snakefile
index 1be1d6f..fb0a17a 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -4,3 +4,7 @@ configfile: "config/defaults.yaml"
 
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
+
+# The Nextclade rules are optional: only include them when a dataset name is configured.
+if (config.get("nextclade") or {}).get("dataset_name"):
+    include: "rules/nextclade.smk"
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
index 17fb708..f3a9b7e 100644
--- a/ingest/config/defaults.yaml
+++ b/ingest/config/defaults.yaml
@@ -73,3 +73,12 @@ curate:
   output_sequence_field: ""
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: []
+
+
+# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow.
+# Note that this requires a Nextclade dataset to already exist for your pathogen.
+# Remove the following parameters (or leave `dataset_name` empty) if you do not plan to run Nextclade.
+nextclade:
+  # The name of the Nextclade dataset to use for running Nextclade.
+  # Run `nextclade dataset list` to get a full list of available Nextclade datasets.
+  dataset_name: ""
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
new file mode 100644
index 0000000..9b355ad
--- /dev/null
+++ b/ingest/rules/nextclade.smk
@@ -0,0 +1,52 @@
+"""
+This part of the workflow handles running Nextclade on the curated
+sequences.
+
+See Nextclade docs for more details on usage, inputs, and outputs if you would
+like to customize the rules:
+https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
+"""
+DATASET_NAME = config["nextclade"]["dataset_name"]
+
+
+rule get_nextclade_dataset:
+    """Download Nextclade dataset"""
+    output:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+    params:
+        # The name comes from user config, so it (and the path embedding it) is shell-quoted with `:q`.
+        dataset_name=DATASET_NAME
+    shell:
+        """
+        nextclade dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset:q} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    """Run Nextclade on the curated sequences and zip the per-gene translations"""
+    input:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+        sequences="results/sequences.fasta",
+    output:
+        nextclade="results/nextclade.tsv",
+        alignment="results/alignment.fasta",
+        translations="results/translations.zip",
+    params:
+        # The lambda is used to deactivate automatic wildcard expansion.
+        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
+        translations=lambda w: "results/translations/{gene}.fasta",
+    shell:
+        """
+        nextclade run \
+            {input.sequences} \
+            --input-dataset {input.dataset:q} \
+            --output-tsv {output.nextclade} \
+            --output-fasta {output.alignment} \
+            --output-translations {params.translations}
+
+        zip -rj {output.translations} results/translations
+        """
+