From 5ee42473d7bba44a0730af599ee88dbfefeff1d4 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Fri, 29 Nov 2024 09:02:42 -0800
Subject: [PATCH] wip: assemble nextclade dataset rules

---
 .../rules/assemble_nextclade_dataset.smk      | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 nextclade/rules/assemble_nextclade_dataset.smk

diff --git a/nextclade/rules/assemble_nextclade_dataset.smk b/nextclade/rules/assemble_nextclade_dataset.smk
new file mode 100644
index 0000000..930aa64
--- /dev/null
+++ b/nextclade/rules/assemble_nextclade_dataset.smk
@@ -0,0 +1,71 @@
+"""
+This part of the workflow organizes the files for a Nextclade dataset.
+
+REQUIRED INPUTS:
+
+    tree            = auspice/tree.json
+    reference_files = defaults/reference.fasta, defaults/reference.gff3
+    pathogen_json   = defaults/pathogen.json
+    doc_files       = defaults/README.md, defaults/CHANGELOG.md
+    example_query   = defaults/example_sequences.fasta
+
+OUTPUTS:
+
+    dataset_zip     = dataset.zip
+    test_output     = test_out/ (results of running the example query against the Nextclade dataset)
+
+This part of the workflow usually includes the following steps:
+
+    - zipping the final Nextclade dataset
+    - running a test of the final Nextclade dataset
+
+See the Nextclade documentation for more information:
+
+    - https://github.com/nextstrain/nextclade_data/blob/master/docs/dataset-creation-guide.md
+    - https://github.com/nextstrain/nextclade_data/blob/master/docs/dataset-curation-guide.md
+
+"""
+
+rule assemble_dataset:  # Copy the dataset files into dataset/ and bundle exactly those copies into the zip.
+    input:
+        tree="auspice/tree.json",
+        reference="defaults/reference.fasta",
+        annotation="defaults/reference.gff3",
+        sequences="defaults/example_sequences.fasta",
+        pathogen="defaults/pathogen.json",
+        readme="defaults/README.md",
+        changelog="defaults/CHANGELOG.md",
+    output:
+        tree="dataset/tree.json",
+        reference="dataset/reference.fasta",
+        annotation="dataset/genome_annotation.gff3",  # stored under a different basename than its input
+        sequences="dataset/sequences.fasta",  # stored under a different basename than its input
+        pathogen="dataset/pathogen.json",
+        readme="dataset/README.md",
+        changelog="dataset/CHANGELOG.md",
+        dataset_zip="dataset.zip",  # flat archive (zip -j) of the seven files above; stray files in dataset/ are not packed
+    shell:
+        """
+        cp {input.tree} {output.tree}
+        cp {input.reference} {output.reference}
+        cp {input.annotation} {output.annotation}
+        cp {input.sequences} {output.sequences}
+        cp {input.pathogen} {output.pathogen}
+        cp {input.readme} {output.readme}
+        cp {input.changelog} {output.changelog}
+        zip -j {output.dataset_zip} {output.tree} {output.reference} {output.annotation} {output.sequences} {output.pathogen} {output.readme} {output.changelog}
+        """
+
+rule test:
+    """Run Nextclade on the example sequences against the zipped dataset; all results go to test_out."""
+    input:
+        dataset="dataset.zip",
+        sequences="defaults/example_sequences.fasta",
+    output:
+        output=directory("test_out"),
+    shell:
+        """
+        nextclade3 run --output-all {output.output} \
+            --input-dataset {input.dataset} \
+            {input.sequences}
+        """