From 6c4b21d36054eaef700d8ae8d560338c29ec7ef3 Mon Sep 17 00:00:00 2001
From: Jover <joverlee521@gmail.com>
Date: Fri, 6 Oct 2023 16:43:23 -0700
Subject: [PATCH] ingest: Add optional Nextclade rules

Add rules for running Nextclade as part of the ingest workflow. These
rules are optional because not every pathogen has a Nextclade dataset,
which is required in order to run Nextclade during ingest.
---
 ingest/Snakefile            |  3 +++
 ingest/config/defaults.yaml |  9 +++++++
 ingest/rules/nextclade.smk  | 50 +++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 ingest/rules/nextclade.smk

diff --git a/ingest/Snakefile b/ingest/Snakefile
index 1be1d6f..fb0a17a 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -4,3 +4,6 @@ configfile: "config/defaults.yaml"
 
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
+
+if "nextclade" in config:  # opt-in: only load the Nextclade rules when a "nextclade" config section exists
+    include: "rules/nextclade.smk"
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
index 17fb708..f3a9b7e 100644
--- a/ingest/config/defaults.yaml
+++ b/ingest/config/defaults.yaml
@@ -73,3 +73,12 @@ curate:
   output_sequence_field: ""
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns: []
+
+
+# Nextclade parameters to include if you are running Nextclade as a part of your ingest workflow
+# Note that this requires a Nextclade dataset to already exist for your pathogen.
+# Remove the following parameters if you do not plan to run Nextclade.
+nextclade:
+  # The name of the Nextclade dataset to use for running Nextclade.
+  # Run `nextclade dataset list` to get a full list of available Nextclade datasets.
+  dataset_name: ""
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
new file mode 100644
index 0000000..9b355ad
--- /dev/null
+++ b/ingest/rules/nextclade.smk
@@ -0,0 +1,50 @@
+"""
+This part of the workflow handles running Nextclade on the curated metadata
+and sequences.
+
+See Nextclade docs for more details on usage, inputs, and outputs if you would
+like to customize the rules:
+https://docs.nextstrain.org/projects/nextclade/en/stable/user/nextclade-cli.html
+"""
+DATASET_NAME = config["nextclade"]["dataset_name"]  # also names the dataset ZIP that both rules in this file reference
+
+
+rule get_nextclade_dataset:
+    """Download the configured Nextclade dataset as a ZIP archive using `nextclade dataset get`"""
+    output:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+    params:
+        dataset_name=DATASET_NAME
+    shell:
+        """
+        nextclade dataset get \
+            --name={params.dataset_name:q} \
+            --output-zip={output.dataset} \
+            --verbose
+        """
+
+
+rule run_nextclade:
+    input:
+        dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
+        sequences="results/sequences.fasta",
+    output:
+        nextclade="results/nextclade.tsv",
+        alignment="results/alignment.fasta",
+        translations="results/translations.zip",  # zip of the per-gene FASTA files written under results/translations
+    params:
+        # The lambda deactivates automatic wildcard expansion so the literal "{gene}" template reaches --output-translations.
+        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
+        translations=lambda w: "results/translations/{gene}.fasta",
+    shell:
+        """
+        nextclade run \
+            {input.sequences} \
+            --input-dataset {input.dataset} \
+            --output-tsv {output.nextclade} \
+            --output-fasta {output.alignment} \
+            --output-translations {params.translations}
+
+        zip -rj {output.translations} results/translations
+        """
+