From 46acdd78fef714ca6b6a05bd76f2d073852b0645 Mon Sep 17 00:00:00 2001
From: John SJ Anderson <janders4@fredhutch.org>
Date: Wed, 31 Jul 2024 20:48:06 -0700
Subject: [PATCH] Add `nextclade` workflow [#2]

---
 .gitignore                                    |   7 +
 .markdownlintrc                               |   9 ++
 nextclade/README.md                           |  29 ++++
 nextclade/Snakefile                           |  30 ++++
 nextclade/defaults/auspice_config.json        |  56 ++++++++
 nextclade/defaults/clades.tsv                 |  40 ++++++
 nextclade/defaults/colors.tsv                 |   8 ++
 nextclade/defaults/config.yaml                |  20 +++
 nextclade/defaults/genome_annotation.gff3     |   5 +
 nextclade/defaults/include_strains.txt        | 136 ++++++++++++++++++
 .../defaults/nextclade-dataset/CHANGELOG.md   |   3 +
 .../defaults/nextclade-dataset/README.md      |  47 ++++++
 .../defaults/nextclade-dataset/pathogen.json  |  52 +++++++
 nextclade/defaults/reference.fasta            |  13 ++
 nextclade/rules/annotate_phylogeny.smk        |  82 +++++++++++
 nextclade/rules/assemble_dataset.smk          |  71 +++++++++
 nextclade/rules/construct_phylogeny.smk       |  67 +++++++++
 nextclade/rules/export.smk                    |  44 ++++++
 nextclade/rules/prepare_sequences.smk         |  64 +++++++++
 19 files changed, 783 insertions(+)
 create mode 100644 .markdownlintrc
 create mode 100644 nextclade/README.md
 create mode 100644 nextclade/Snakefile
 create mode 100644 nextclade/defaults/auspice_config.json
 create mode 100644 nextclade/defaults/clades.tsv
 create mode 100644 nextclade/defaults/colors.tsv
 create mode 100644 nextclade/defaults/config.yaml
 create mode 100644 nextclade/defaults/genome_annotation.gff3
 create mode 100644 nextclade/defaults/include_strains.txt
 create mode 100644 nextclade/defaults/nextclade-dataset/CHANGELOG.md
 create mode 100644 nextclade/defaults/nextclade-dataset/README.md
 create mode 100644 nextclade/defaults/nextclade-dataset/pathogen.json
 create mode 100644 nextclade/defaults/reference.fasta
 create mode 100644 nextclade/rules/annotate_phylogeny.smk
 create mode 100644 nextclade/rules/assemble_dataset.smk
 create mode 100644 nextclade/rules/construct_phylogeny.smk
 create mode 100644 nextclade/rules/export.smk
 create mode 100644 nextclade/rules/prepare_sequences.smk

diff --git a/.gitignore b/.gitignore
index b943590..e8a7bfe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,13 @@ ingest/benchmarks
 ingest/data
 ingest/logs
 ingest/results
+nextclade/auspice
+nextclade/benchmarks
+nextclade/data
+nextclade/dataset
+nextclade/logs
+nextclade/results
+nextclade/test_output
 phylogenetic/auspice
 phylogenetic/benchmarks
 phylogenetic/logs
diff --git a/.markdownlintrc b/.markdownlintrc
new file mode 100644
index 0000000..17fb483
--- /dev/null
+++ b/.markdownlintrc
@@ -0,0 +1,9 @@
+{
+  // long lines are okay
+  "MD013":{
+    "line_length": 100,
+    "tables": false
+  },
+  // don't require top-level heading on L1
+  "MD041": false
+}
diff --git a/nextclade/README.md b/nextclade/README.md
new file mode 100644
index 0000000..0fe50bf
--- /dev/null
+++ b/nextclade/README.md
@@ -0,0 +1,29 @@
+# Yellow Fever Virus Nextclade Dataset Tree
+
+This workflow creates a phylogenetic tree that can be used as part of
+a Nextclade dataset to assign genotypes to yellow fever virus samples
+based on [Mutebi et al.][] (J Virol. 2001 Aug;75(15):6999-7008) and
+[Bryant et al.][] (PLoS Pathog. 2007 May 18;3(5):e75).
+
+* Build a tree using samples from the `ingest` output, with the following
+  sampling criteria:
+  * Force-include the following samples:
+    * genotype reference strains from the 2 papers cited above
+* Assign genotypes to each sample and internal nodes of the tree with
+  `augur clades`, using clade-defining mutations in `defaults/clades.tsv`
+* Provide the following coloring options on the tree:
+  * Genotype assignment from `augur clades`
+
+## How to create a new tree
+
+* Run the workflow: `nextstrain build .`
+* Inspect the output tree by comparing genotype assignments from the following sources:
+  * `augur clades` output
+* If unwanted samples are present in the tree, add them to
+  `defaults/dropped_strains.tsv` and re-run the workflow
+* If any changes are needed to the clade-defining mutations, add
+  changes to `defaults/clades.tsv` and re-run the workflow
+* Repeat as needed
+
+[Mutebi et al.]: https://pubmed.ncbi.nlm.nih.gov/11435580/
+[Bryant et al.]: https://pubmed.ncbi.nlm.nih.gov/17511518/
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
new file mode 100644
index 0000000..a6618eb
--- /dev/null
+++ b/nextclade/Snakefile
@@ -0,0 +1,30 @@
+configfile: "defaults/config.yaml"
+
+rule all:
+    input:
+        auspice_json = config["files"]["auspice_json"],
+        nextclade_dataset = "dataset/tree.json",
+        test_dataset = "test_output",
+
+include: "rules/prepare_sequences.smk"
+include: "rules/construct_phylogeny.smk"
+include: "rules/annotate_phylogeny.smk"
+include: "rules/export.smk"
+include: "rules/assemble_dataset.smk"
+
+rule clean:
+    params:
+        targets = [
+            ".snakemake",
+            "auspice",
+            "benchmarks",
+            "data",
+            "dataset",
+            "logs",
+            "results",
+            "test_output",
+        ]
+    shell:
+        """
+        rm -rfv {params.targets}
+        """
diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json
new file mode 100644
index 0000000..142d5ea
--- /dev/null
+++ b/nextclade/defaults/auspice_config.json
@@ -0,0 +1,56 @@
+{
+  "title": "Real-time tracking of yellow fever virus full genome virus evolution",
+  "maintainers": [
+    {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "data_provenance": [
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "build_url": "https://github.com/nextstrain/yellow-fever",
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true,
+    "color_by": "region"
+  },
+  "filters": [
+    "clade",
+    "region",
+    "country",
+    "author",
+    "host"
+  ]
+}
diff --git a/nextclade/defaults/clades.tsv b/nextclade/defaults/clades.tsv
new file mode 100644
index 0000000..44f98c1
--- /dev/null
+++ b/nextclade/defaults/clades.tsv
@@ -0,0 +1,40 @@
+clade	gene	site	alt
+Angola	nuc	72	A
+Angola	nuc	81	G
+Angola	nuc	88	C
+Angola	nuc	90	A
+Angola	nuc	99	T
+Angola	nuc	111	G
+Angola	nuc	219	T
+Angola	nuc	240	C
+Angola	nuc	246	A
+Angola	nuc	252	A
+Angola	nuc	255	A
+Angola	nuc	291	G
+Angola	nuc	294	A
+Angola	nuc	300	A
+Angola	nuc	315	G
+Angola	nuc	327	G
+Angola	nuc	372	A
+Angola	nuc	420	A
+Angola	nuc	432	A
+Angola	nuc	453	T
+Angola	nuc	492	G
+Angola	nuc	651	T
+East Africa	nuc	45	A
+East Africa	nuc	171	G
+East Africa	nuc	438	G
+East Africa	nuc	468	T
+East/Central Africa	nuc	228	G
+West Africa I	nuc	183	G
+West Africa I	nuc	255	C
+West Africa II	nuc	93	T
+West Africa II	nuc	270	A
+West Africa II	nuc	321	T
+West Africa II	nuc	477 	A
+South America I	nuc	219	A
+South America I	nuc	532	A
+South America II	nuc	114	C
+South America II	nuc	193	T
+South America II	nuc	249	A
+South America II	nuc	639	G
diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv
new file mode 100644
index 0000000..e0b687b
--- /dev/null
+++ b/nextclade/defaults/colors.tsv
@@ -0,0 +1,8 @@
+# genotypes assigned by augur clades
+clade_membership	Angola	#FCF007
+clade_membership	East Africa	#4B26B1
+clade_membership	East/Central Africa	#E307FC
+clade_membership	West Africa I	#2CFC07
+clade_membership	West Africa II	#9EFC07
+clade_membership	South America I	#996633
+clade_membership	South America II	#FC0740
diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml
new file mode 100644
index 0000000..a412aa3
--- /dev/null
+++ b/nextclade/defaults/config.yaml
@@ -0,0 +1,20 @@
+files:
+    auspice_config: "defaults/auspice_config.json"
+    auspice_json: "auspice/tree.json"
+    clades: "defaults/clades.tsv"
+    colors: "defaults/colors.tsv"
+    include: "defaults/include_strains.txt"
+    reference_prM-E_fasta: "defaults/reference.fasta"
+    reference_prM-E_gff: "defaults/genome_annotation.gff3"
+strain_id_field: "accession"
+align_and_extract_prM-E:
+    min_length: 500
+    min_seed_cover: 0.01
+refine:
+    coalescent: "opt"
+    date_inference: "marginal"
+    clock_filter_iqd: 4
+ancestral:
+    inference: "joint"
+export:
+    metadata_columns: "strain division location region year host"
diff --git a/nextclade/defaults/genome_annotation.gff3 b/nextclade/defaults/genome_annotation.gff3
new file mode 100644
index 0000000..60c279d
--- /dev/null
+++ b/nextclade/defaults/genome_annotation.gff3
@@ -0,0 +1,5 @@
+##sequence-region prM-E 1 672
+NC_002031.1	feature	source	1	672	.	+	.	gene=nuc
+NC_002031.1	feature	gene	1	333	.	+	.	gene_name=prM
+NC_002031.1	feature	gene	109	333	.	+	.	gene_name=M
+NC_002031.1	feature	gene	334	672	.	+	.	gene_name=E
diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt
new file mode 100644
index 0000000..52ef94f
--- /dev/null
+++ b/nextclade/defaults/include_strains.txt
@@ -0,0 +1,136 @@
+# Extracted from tables and figures in Mutebi et al. (J Virol. 2001
+# Aug;75(15):6999-7008) and Bryant et al. (PLoS Pathog. 2007 May
+# 18;3(5):e75)
+AF369669
+AF369670
+AF369671
+AY540431
+AY540432
+AY540433
+AY540434
+AY540435
+U52390
+AY540437
+AY540438
+AY540439
+AY540440
+AY540441
+AY540442
+AY540443
+AY540444
+AY540445
+AY540446
+AY540447
+AY540448
+AY540449
+AY540450
+AY540451
+AY540452
+AY540453
+U23570
+AY540454
+AY540455
+AY540456
+AY540457
+AY540458
+AY540459
+AY540460
+AY540461
+AY540462
+AY540463
+AY540464
+AY540465
+AY540466
+AY540467
+AY540468
+AY540469
+AY540470
+AY540471
+AY540472
+AY540473
+AY540436
+U52392
+U52395
+AF369672
+AF369673
+AY540475
+AY540476
+AY540474
+U52399
+AY540477
+AY540478
+AF369674
+AF369675
+AY572535
+AY640589
+AF369686
+U54798
+AY603338
+AF369676
+U52403
+AF369677
+AF369678
+AF368679
+AF369680
+AF369681
+AF369682
+AF369683
+AF369684
+AF369685
+AY540479
+AY540480
+AY161927
+AY161928
+AY161929
+AY161930
+AY161931
+U52411
+AY161933
+AY161934
+AY161935
+U52405
+U52407
+AY161938
+AY161939
+AY161940
+AY161941
+AY161942
+AY161943
+AY161944
+AY161945
+AY161946
+AY161947
+AY161948
+AY161949
+AY161950
+AY161951
+GI694115
+U89338
+AF369687
+AF369688
+U52413
+AF369689
+AF369690
+AF369691
+AF369692
+AF369693
+AY690831
+AY690832
+AY690833
+DQ872411
+DQ872412
+AY540481
+AY540482
+AY540483
+AY540484
+AY540485
+AY540486
+AF369694
+U52422
+AF369695
+AF369696
+AY540487
+AY540488
+AY540489
+AY540490
+AF369697
diff --git a/nextclade/defaults/nextclade-dataset/CHANGELOG.md b/nextclade/defaults/nextclade-dataset/CHANGELOG.md
new file mode 100644
index 0000000..3b67234
--- /dev/null
+++ b/nextclade/defaults/nextclade-dataset/CHANGELOG.md
@@ -0,0 +1,3 @@
+## Unreleased
+
+Initial release of yellow fever virus dataset.
diff --git a/nextclade/defaults/nextclade-dataset/README.md b/nextclade/defaults/nextclade-dataset/README.md
new file mode 100644
index 0000000..409611a
--- /dev/null
+++ b/nextclade/defaults/nextclade-dataset/README.md
@@ -0,0 +1,47 @@
+# Yellow fever virus dataset
+
+| Key               | Value                                                            |
+| ----------------- | -----------------------------------------------------------------|
+| name              | Yellow fever virus (YFV) prM-E region                            |
+| authors           | [Nextstrain](https://nextstrain.org)                             |
+| reference         | AY640589.1                                                       |
+| workflow          | <https://github.com/nextstrain/yellow-fever/tree/main/nextclade> |
+| path              | `nextstrain/yellow-fever/prM-E`                                  |
+
+## Scope of this dataset
+
+This dataset assigns genotypes to yellow fever virus samples based on
+strain and genotype information from [Mutebi et al.][] (J Virol. 2001
+Aug;75(15):6999-7008) and [Bryant et al.][] (PLoS Pathog. 2007 May 18;3(5):e75)
+
+These two papers, collectively, define 7 distinct yellow fever virus
+genotypes based on a 670 nucleotide region of the yellow fever virus
+genome, (bases 641-1310), called the prM-E region. This region
+comprises the 3' end of the pre-membrane protein (prM) gene, the
+entire membrane protein (M) gene, and the 5' end of the envelope
+protein (E) gene.
+
+(N.b., the reference sequence used in this data set is actually 672nt
+long, from bases 641-1312 of the genome reference. The 2 extra bases
+make the reference an complete open reading frame.)
+
+This dataset can be used to assign genotypes to any sequence that
+includes at least 500 bp of the prM-E region, including whole genome
+sequences. Sequence data beyond the prM-E region will be reported as an
+insertion in the Nextclade output.
+
+## Features
+
+This dataset supports:
+
+- Assignment of genotypes
+- Phylogenetic placement
+- Sequence quality control (QC)
+
+## What are Nextclade datasets
+
+Read more about Nextclade datasets in the Nextclade documentation:
+<https://docs.nextstrain.org/projects/nextclade/en/stable/user/datasets.html>
+
+[Mutebi et al.]: https://pubmed.ncbi.nlm.nih.gov/11435580/
+[Bryant et al.]: https://pubmed.ncbi.nlm.nih.gov/17511518/
diff --git a/nextclade/defaults/nextclade-dataset/pathogen.json b/nextclade/defaults/nextclade-dataset/pathogen.json
new file mode 100644
index 0000000..92213e6
--- /dev/null
+++ b/nextclade/defaults/nextclade-dataset/pathogen.json
@@ -0,0 +1,52 @@
+{
+  "files": {
+    "reference": "reference.fasta",
+    "pathogenJson": "pathogen.json",
+    "genomeAnnotation": "genome_annotation.gff3",
+    "treeJson": "tree.json",
+    "examples": "sequences.fasta",
+    "readme": "README.md",
+    "changelog": "CHANGELOG.md"
+  },
+  "attributes": {
+    "name": "Yellow fever virus (YFV) prM-E region",
+    "reference name": "Asibi",
+    "reference accession": "AY640589.1"
+  },
+  "schemaVersion": "3.0.0",
+  "alignmentParams": {
+      "minSeedCover": 0.01,
+      "minLength": 500
+  },
+  "qc": {
+    "missingData": {
+      "enabled": true,
+      "missingDataThreshold": 20,
+      "scoreBias": 4
+    },
+    "mixedSites": {
+      "enabled": true,
+      "mixedSitesThreshold": 4
+    },
+    "frameShifts": {
+      "enabled": true
+    },
+    "stopCodons": {
+      "enabled": true
+    },
+    "privateMutations": {
+      "enabled": true,
+      "cutoff": 8,
+      "typical": 2,
+      "weightLabeledSubstitutions": 1,
+      "weightReversionSubstitutions": 1,
+      "weightUnlabeledSubstitutions": 1
+    },
+    "snpClusters": {
+      "enabled": true,
+      "clusterCutOff": 3,
+      "scoreWeight": 50,
+      "windowSize": 50
+    }
+  }
+}
diff --git a/nextclade/defaults/reference.fasta b/nextclade/defaults/reference.fasta
new file mode 100644
index 0000000..1b51ee3
--- /dev/null
+++ b/nextclade/defaults/reference.fasta
@@ -0,0 +1,13 @@
+> prM-E region (genome 641-1312, 672 nt)
+CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC
+GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG
+CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG
+GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGCTCGTGAGGAACCCCTTTTTTGCAGTG
+ACAGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC
+CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG
+GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG
+TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC
+ATTGATGGACCTGCTGAGGCGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG
+ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC
+AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG
+AAAGGGAGCATT
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
new file mode 100644
index 0000000..a33745f
--- /dev/null
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -0,0 +1,82 @@
+"""
+This part of the workflow creates additonal annotations for the phylogenetic tree.
+"""
+
+rule ancestral:
+    message:
+        """
+        Reconstructing ancestral sequences and mutations
+        """
+    input:
+        alignment = "results/aligned.fasta",
+        tree = "results/tree.nwk",
+    output:
+        node_data = "results/nt_muts.json",
+    params:
+        inference = config["ancestral"]["inference"],
+        reference_fasta = config["files"]["reference_prM-E_fasta"],
+    log:
+        "logs/ancestral.txt",
+    benchmark:
+        "benchmarks/ancestral.txt",
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}  \
+            --root-sequence {params.reference_fasta} \
+          &> {log:q}
+        """
+
+rule translate:
+    message:
+        """
+        Translating amino acid sequences
+        """
+    input:
+        tree = "results/tree.nwk",
+        node_data = "results/nt_muts.json",
+        reference = config["files"]["reference_prM-E_gff"],
+    output:
+        node_data = "results/aa_muts.json"
+    log:
+        "logs/translate.txt",
+    benchmark:
+        "benchmarks/translate.txt",
+    shell:
+        """
+        augur translate \
+            --tree {input.tree:q} \
+            --ancestral-sequences {input.node_data:q} \
+            --reference-sequence {input.reference:q} \
+            --output {output.node_data:q} \
+          &> {log:q}
+        """
+
+rule clades:
+    message:
+        """
+        Running augur clades
+        """
+    input:
+        tree = "results/tree.nwk",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        clade_defs = config["files"]["clades"]
+    output:
+        clades = "results/clades.json"
+    log:
+        "logs/clades.txt",
+    benchmark:
+        "benchmarks/clades.txt",
+    shell:
+        """
+        augur clades \
+            --tree {input.tree:q} \
+            --mutations {input.nt_muts} {input.aa_muts} \
+            --clades {input.clade_defs:q} \
+            --output {output.clades:q} \
+          &> {log:q}
+        """
diff --git a/nextclade/rules/assemble_dataset.smk b/nextclade/rules/assemble_dataset.smk
new file mode 100644
index 0000000..9f00848
--- /dev/null
+++ b/nextclade/rules/assemble_dataset.smk
@@ -0,0 +1,71 @@
+"""
+This part of the workflow assembles a nextclade dataset.
+"""
+rule assemble_dataset:
+    message:
+        """
+        Copying outputs into dataset/
+        """
+    input:
+        reference_fasta="defaults/reference.fasta",
+        tree=config["files"]["auspice_json"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        sequences = "../ingest/results/sequences.fasta",
+        annotation="defaults/genome_annotation.gff3",
+        pathogen_json="defaults/nextclade-dataset/pathogen.json",
+        readme="defaults/nextclade-dataset/README.md",
+        changelog="defaults/nextclade-dataset/CHANGELOG.md",
+    output:
+        reference_fasta="dataset/reference.fasta",
+        tree="dataset/tree.json",
+        pathogen_json="dataset/pathogen.json",
+        sequences="dataset/sequences.fasta",
+        annotation="dataset/genome_annotation.gff3",
+        readme="dataset/README.md",
+        changelog="dataset/CHANGELOG.md",
+    log:
+        "logs/assemble_dataset.txt",
+    benchmark:
+        "benchmarks/assemble_dataset.txt",
+    shell:
+        """
+        (
+          cp -v {input.reference_fasta:q} {output.reference_fasta:q}
+          cp -v {input.tree:q} {output.tree:q}
+          cp -v {input.pathogen_json:q} {output.pathogen_json:q}
+          cp -v {input.annotation:q} {output.annotation:q}
+          cp -v {input.readme:q} {output.readme:q}
+          cp -v {input.changelog:q} {output.changelog:q}
+          cp -v {input.sequences:q} {output.sequences:q}
+        ) &> {log:q}
+        """
+
+rule test_dataset:
+    input:
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        sequences = "../ingest/results/sequences.fasta",
+        # this isn't used by the command below, but it is included as
+        # an input to force the preceding rule to finish running
+        # before this one starts
+        tree="dataset/tree.json",
+    output:
+        outdir=directory("test_output/"),
+    params:
+        dataset_dir="dataset",
+    log:
+        "logs/test_dataset.txt",
+    benchmark:
+        "benchmarks/test_dataset.txt",
+    shell:
+        """
+        nextclade run \
+            --input-dataset {params.dataset_dir:q} \
+            --output-all {output.outdir:q} \
+            --silent \
+            {input.sequences:q} \
+          >& {log:q}
+        """
diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk
new file mode 100644
index 0000000..58341f2
--- /dev/null
+++ b/nextclade/rules/construct_phylogeny.smk
@@ -0,0 +1,67 @@
+"""
+This part of the workflow constructs the phylogenetic tree.
+"""
+rule tree:
+    message:
+        """
+        Building tree
+        """
+    input:
+        alignment = "results/aligned.fasta",
+    output:
+        tree = "results/tree_raw.nwk",
+    log:
+        "logs/tree.txt",
+    benchmark:
+        "benchmarks/tree.txt",
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree}
+          &> {log:q}
+        """
+
+rule refine:
+    message:
+        """
+        Refining tree
+        - use {params.coalescent} coalescent timescale
+        - estimate {params.date_inference} node dates
+        - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+        """
+    input:
+        alignment = "results/aligned.fasta",
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        tree = "results/tree_raw.nwk",
+    output:
+        node_data = "results/branch_lengths.json",
+        tree = "results/tree.nwk",
+    params:
+        strain_id = config["strain_id_field"],
+        coalescent = config["refine"]["coalescent"],
+        date_inference = config["refine"]["date_inference"],
+        clock_filter_iqd = config["refine"]["clock_filter_iqd"],
+    log:
+        "logs/refine.txt",
+    benchmark:
+        "benchmarks/refine.txt",
+    shell:
+        """
+        augur refine \
+            --tree {input.tree:q} \
+            --alignment {input.alignment:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --output-tree {output.tree:q} \
+            --output-node-data {output.node_data:q} \
+            --coalescent {params.coalescent:q} \
+            --date-confidence \
+            --date-inference {params.date_inference:q} \
+            --clock-filter-iqd {params.clock_filter_iqd:q} \
+            --stochastic-resolve
+          &> {log:q}
+        """
diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk
new file mode 100644
index 0000000..ac254c8
--- /dev/null
+++ b/nextclade/rules/export.smk
@@ -0,0 +1,44 @@
+"""
+This part of the workflow collects the phylogenetic tree and
+annotations to export a Nextstrain dataset.
+"""
+rule export:
+    message:
+        """
+        Exporting data files for auspice
+        """
+    input:
+        tree = "results/tree.nwk",
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        branch_lengths = "results/branch_lengths.json",
+        clades = "results/clades.json",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        colors = config["files"]["colors"],
+        auspice_config = config["files"]["auspice_config"],
+    output:
+        auspice_json = config["files"]["auspice_json"],
+    params:
+        strain_id = config["strain_id_field"],
+        metadata_columns = config["export"]["metadata_columns"],
+    log:
+        "logs/export.txt",
+    benchmark:
+        "benchmarks/export.txt",
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.clades} \
+            --colors {input.colors:q} \
+            --metadata-columns {params.metadata_columns} \
+            --auspice-config {input.auspice_config:q} \
+            --include-root-sequence-inline \
+            --output {output.auspice_json:q} \
+          &> {log:q}
+        """
diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk
new file mode 100644
index 0000000..8beca2f
--- /dev/null
+++ b/nextclade/rules/prepare_sequences.smk
@@ -0,0 +1,64 @@
+"""
+This part of the workflow prepares sequences for constructing the
+phylogenetic tree.
+"""
+rule align_and_extract_prM_E:
+    input:
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        sequences = "../ingest/results/sequences.fasta",
+        reference = config["files"]["reference_prM-E_fasta"],
+    output:
+        sequences = "results/sequences.fasta",
+    params:
+        min_length = config['align_and_extract_prM-E']['min_length'],
+        min_seed_cover = config['align_and_extract_prM-E']['min_seed_cover'],
+    log:
+        "logs/align_and_extract_prM_E.txt",
+    benchmark:
+        "benchmarks/align_and_extract_prM_E.txt",
+    threads: workflow.cores
+    shell:
+        """
+        nextclade3 run \
+            --jobs {threads:q} \
+            --input-ref {input.reference:q} \
+            --output-fasta {output.sequences:q} \
+            --min-seed-cover {params.min_seed_cover:q} \
+            --min-length {params.min_length:q} \
+            --silent \
+            {input.sequences:q} \
+          &> {log:q}
+        """
+
+rule filter:
+    message: """
+    Filtering to defined set of strains with known genotypes.
+    """
+    input:
+        include = config["files"]["include"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        sequences = "results/sequences.fasta",
+    output:
+        sequences = "results/aligned.fasta",
+    params:
+        strain_id = config["strain_id_field"],
+    log:
+        "logs/filter.txt",
+    benchmark:
+        "benchmarks/filter.txt"
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --exclude-all \
+            --include {input.include:q} \
+            --output {output.sequences:q} \
+          &> {log:q}
+        """