diff --git a/.gitignore b/.gitignore
index b943590..09c6411 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,11 @@ ingest/benchmarks
 ingest/data
 ingest/logs
 ingest/results
+nextclade/auspice
+nextclade/benchmarks
+nextclade/data
+nextclade/logs
+nextclade/results
 phylogenetic/auspice
 phylogenetic/benchmarks
 phylogenetic/logs
diff --git a/nextclade/README.md b/nextclade/README.md
new file mode 100644
index 0000000..ac2853a
--- /dev/null
+++ b/nextclade/README.md
@@ -0,0 +1,26 @@
+# Yellow Fever Virus Nextclade Dataset Tree
+
+This workflow creates a phylogenetic tree that can be used as part of
+a Nextclade dataset to assign genotypes to yellow fever virus samples based on
+FIXME reference to those two papers goes here.
+
+* Build a tree using samples from the `ingest` output, with the following
+  sampling criteria:
+  * Force-include the following samples:
+    * genotype reference strains
+* Assign genotypes to each sample and internal nodes of the tree with
+  `augur clades`, using clade-defining mutations in
+  `defaults/clades.tsv`
+* Provide the following coloring options on the tree:
+  * Genotype assignment from `augur clades`
+
+## How to create a new tree
+
+* Run the workflow: `nextstrain build .`
+* Inspect the output tree by comparing genotype assignments from the following sources:
+  * `augur clades` output
+* If unwanted samples are present in the tree, add them to
+  `defaults/dropped_strains.tsv` and re-run the workflow
+* If any changes are needed to the clade-defining mutations, add
+  changes to `defaults/clades.tsv` and re-run the workflow
+* Repeat as needed
diff --git a/nextclade/Snakefile b/nextclade/Snakefile
new file mode 100644
index 0000000..760cccc
--- /dev/null
+++ b/nextclade/Snakefile
@@ -0,0 +1,25 @@
+configfile: "defaults/config.yaml"
+
+rule all:
+    input:
+        auspice_json = config["files"]["auspice_json"],
+
+include: "rules/prepare_sequences.smk"
+include: "rules/construct_phylogeny.smk"
+include: "rules/annotate_phylogeny.smk"
+include: "rules/export.smk"
+
+rule clean:
+    params:
+        targets = [
+            ".snakemake",
+            "auspice",
+            "benchmarks",
+            "data",
+            "logs",
+            "results",
+        ]
+    shell:
+        """
+        rm -rfv {params.targets}
+        """
diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json
new file mode 100644
index 0000000..3843edd
--- /dev/null
+++ b/nextclade/defaults/auspice_config.json
@@ -0,0 +1,59 @@
+{
+  "title": "Real-time tracking of yellow fever virus full genome virus evolution",
+  "maintainers": [
+    {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"},
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "data_provenance": [
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "build_url": "https://github.com/nextstrain/yellow-fever",
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "num_date",
+      "title": "Date",
+      "type": "continuous"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true,
+    "color_by": "region"
+  },
+  "filters": [
+    "clade",
+    "region",
+    "country",
+    "author",
+    "host"
+  ],
+  "metadata_columns": [
+    "author"
+  ]
+}
diff --git a/nextclade/defaults/clades.tsv b/nextclade/defaults/clades.tsv
new file mode 100644
index 0000000..44f98c1
--- /dev/null
+++ b/nextclade/defaults/clades.tsv
@@ -0,0 +1,40 @@
+clade	gene	site	alt
+Angola	nuc	72	A
+Angola	nuc	81	G
+Angola	nuc	88	C
+Angola	nuc	90	A
+Angola	nuc	99	T
+Angola	nuc	111	G
+Angola	nuc	219	T
+Angola	nuc	240	C
+Angola	nuc	246	A
+Angola	nuc	252	A
+Angola	nuc	255	A
+Angola	nuc	291	G
+Angola	nuc	294	A
+Angola	nuc	300	A
+Angola	nuc	315	G
+Angola	nuc	327	G
+Angola	nuc	372	A
+Angola	nuc	420	A
+Angola	nuc	432	A
+Angola	nuc	453	T
+Angola	nuc	492	G
+Angola	nuc	651	T
+East Africa	nuc	45	A
+East Africa	nuc	171	G
+East Africa	nuc	438	G
+East Africa	nuc	468	T
+East/Central Africa	nuc	228	G
+West Africa I	nuc	183	G
+West Africa I	nuc	255	C
+West Africa II	nuc	93	T
+West Africa II	nuc	270	A
+West Africa II	nuc	321	T
+West Africa II	nuc	477 	A
+South America I	nuc	219	A
+South America I	nuc	532	A
+South America II	nuc	114	C
+South America II	nuc	193	T
+South America II	nuc	249	A
+South America II	nuc	639	G
diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv
new file mode 100644
index 0000000..e0b687b
--- /dev/null
+++ b/nextclade/defaults/colors.tsv
@@ -0,0 +1,8 @@
+# genotypes assigned by augur clades
+clade_membership	Angola	#FCF007
+clade_membership	East Africa	#4B26B1
+clade_membership	East/Central Africa	#E307FC
+clade_membership	West Africa I	#2CFC07
+clade_membership	West Africa II	#9EFC07
+clade_membership	South America I	#996633
+clade_membership	South America II	#FC0740
diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml
new file mode 100644
index 0000000..0cccf83
--- /dev/null
+++ b/nextclade/defaults/config.yaml
@@ -0,0 +1,25 @@
+files:
+    auspice_config: "defaults/auspice_config.json"
+    auspice_json: "auspice/yellow-fever-virus_prM-E.json"
+    clades: "defaults/clades.tsv"
+    colors: "defaults/colors.tsv"
+    include: "defaults/include_strains.txt"
+    reference_prM-E_fasta: "defaults/yellow-fever-virus-reference_prM-E.fasta"
+    reference_prM-E_gff: "defaults/yellow-fever-virus-reference_prM-E.gff"
+strain_id_field: "accession"
+align_and_extract_prM-E:
+    min_length: 650
+    min_seed_cover: 0.01
+filter:
+    group_by: "region year"
+    subsample_max_sequences: 500
+    min_date: 1927
+    min_length: 650
+refine:
+    coalescent: "opt"
+    date_inference: "marginal"
+    clock_filter_iqd: 4
+ancestral:
+    inference: "joint"
+export:
+    metadata_columns: "strain division location region year host"
diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt
new file mode 100644
index 0000000..1e6da32
--- /dev/null
+++ b/nextclade/defaults/include_strains.txt
@@ -0,0 +1,134 @@
+# FIXME provide reference
+AF369669
+AF369670
+AF369671
+AY540431
+AY540432
+AY540433
+AY540434
+AY540435
+U52390
+AY540437
+AY540438
+AY540439
+AY540440
+AY540441
+AY540442
+AY540443
+AY540444
+AY540445
+AY540446
+AY540447
+AY540448
+AY540449
+AY540450
+AY540451
+AY540452
+AY540453
+U23570
+AY540454
+AY540455
+AY540456
+AY540457
+AY540458
+AY540459
+AY540460
+AY540461
+AY540462
+AY540463
+AY540464
+AY540465
+AY540466
+AY540467
+AY540468
+AY540469
+AY540470
+AY540471
+AY540472
+AY540473
+AY540436
+U52392
+U52395
+AF369672
+AF369673
+AY540475
+AY540476
+AY540474
+U52399
+AY540477
+AY540478
+AF369674
+AF369675
+AY572535
+AY640589
+AF369686
+U54798
+AY603338
+AF369676
+U52403
+AF369677
+AF369678
+AF368679
+AF369680
+AF369681
+AF369682
+AF369683
+AF369684
+AF369685
+AY540479
+AY540480
+AY161927
+AY161928
+AY161929
+AY161930
+AY161931
+U52411
+AY161933
+AY161934
+AY161935
+U52405
+U52407
+AY161938
+AY161939
+AY161940
+AY161941
+AY161942
+AY161943
+AY161944
+AY161945
+AY161946
+AY161947
+AY161948
+AY161949
+AY161950
+AY161951
+GI694115
+U89338
+AF369687
+AF369688
+U52413
+AF369689
+AF369690
+AF369691
+AF369692
+AF369693
+AY690831
+AY690832
+AY690833
+DQ872411
+DQ872412
+AY540481
+AY540482
+AY540483
+AY540484
+AY540485
+AY540486
+AF369694
+U52422
+AF369695
+AF369696
+AY540487
+AY540488
+AY540489
+AY540490
+AF369697
diff --git a/nextclade/defaults/yellow-fever-virus-reference_prM-E.fasta b/nextclade/defaults/yellow-fever-virus-reference_prM-E.fasta
new file mode 100644
index 0000000..6e332fc
--- /dev/null
+++ b/nextclade/defaults/yellow-fever-virus-reference_prM-E.fasta
@@ -0,0 +1,13 @@
+> prM-E region (genome 641-1312, 672 nt)
+CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC
+GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG
+CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG
+GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGTTCGTGAGGAACCCCTTTTTTGCAGTG
+ACGGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC
+CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG
+GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG
+TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC
+ATTGATAGACCTGCTGAGGTGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG
+ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC
+AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG
+AAAGGGAGCATT
diff --git a/nextclade/defaults/yellow-fever-virus-reference_prM-E.gff b/nextclade/defaults/yellow-fever-virus-reference_prM-E.gff
new file mode 100644
index 0000000..615ba42
--- /dev/null
+++ b/nextclade/defaults/yellow-fever-virus-reference_prM-E.gff
@@ -0,0 +1,5 @@
+##sequence-region prM-E 1 672
+NC_002031.1	feature	source	1	672	.	+	.	gene=nuc
+NC_002031.1	feature	gene	1	334	.	+	.	gene_name=prM
+NC_002031.1	feature	gene	110	334	.	+	.	gene_name=M
+NC_002031.1	feature	gene	335	672	.	+	.	gene_name=E
diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk
new file mode 100644
index 0000000..a33745f
--- /dev/null
+++ b/nextclade/rules/annotate_phylogeny.smk
@@ -0,0 +1,82 @@
+"""
+This part of the workflow creates additonal annotations for the phylogenetic tree.
+"""
+
+rule ancestral:
+    message:
+        """
+        Reconstructing ancestral sequences and mutations
+        """
+    input:
+        alignment = "results/aligned.fasta",
+        tree = "results/tree.nwk",
+    output:
+        node_data = "results/nt_muts.json",
+    params:
+        inference = config["ancestral"]["inference"],
+        reference_fasta = config["files"]["reference_prM-E_fasta"],
+    log:
+        "logs/ancestral.txt",
+    benchmark:
+        "benchmarks/ancestral.txt",
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}  \
+            --root-sequence {params.reference_fasta} \
+          &> {log:q}
+        """
+
+rule translate:
+    message:
+        """
+        Translating amino acid sequences
+        """
+    input:
+        tree = "results/tree.nwk",
+        node_data = "results/nt_muts.json",
+        reference = config["files"]["reference_prM-E_gff"],
+    output:
+        node_data = "results/aa_muts.json"
+    log:
+        "logs/translate.txt",
+    benchmark:
+        "benchmarks/translate.txt",
+    shell:
+        """
+        augur translate \
+            --tree {input.tree:q} \
+            --ancestral-sequences {input.node_data:q} \
+            --reference-sequence {input.reference:q} \
+            --output {output.node_data:q} \
+          &> {log:q}
+        """
+
+rule clades:
+    message:
+        """
+        Running augur clades
+        """
+    input:
+        tree = "results/tree.nwk",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        clade_defs = config["files"]["clades"]
+    output:
+        clades = "results/clades.json"
+    log:
+        "logs/clades.txt",
+    benchmark:
+        "benchmarks/clades.txt",
+    shell:
+        """
+        augur clades \
+            --tree {input.tree:q} \
+            --mutations {input.nt_muts} {input.aa_muts} \
+            --clades {input.clade_defs:q} \
+            --output {output.clades:q} \
+          &> {log:q}
+        """
diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk
new file mode 100644
index 0000000..fa51afe
--- /dev/null
+++ b/nextclade/rules/construct_phylogeny.smk
@@ -0,0 +1,68 @@
+"""
+This part of the workflow constructs the phylogenetic tree.
+"""
+rule tree:
+    message:
+        """
+        Building tree
+        """
+    input:
+        alignment = "results/aligned.fasta",
+    output:
+        tree = "results/tree_raw.nwk",
+    log:
+        "logs/tree.txt",
+    benchmark:
+        "benchmarks/tree.txt",
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree}
+          &> {log:q}
+        """
+
+rule refine:
+    message:
+        """
+        Refining tree
+        - use {params.coalescent} coalescent timescale
+        - estimate {params.date_inference} node dates
+        - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+        """
+    input:
+        alignment = "results/aligned.fasta",
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        tree = "results/tree_raw.nwk",
+    output:
+        node_data = "results/branch_lengths.json",
+        tree = "results/tree.nwk",
+    params:
+        strain_id = config["strain_id_field"],
+        coalescent = config["refine"]["coalescent"],
+        date_inference = config["refine"]["date_inference"],
+        clock_filter_iqd = config["refine"]["clock_filter_iqd"],
+    log:
+        "logs/refine.txt",
+    benchmark:
+        "benchmarks/refine.txt",
+    shell:
+        """
+        augur refine \
+            --tree {input.tree:q} \
+            --alignment {input.alignment:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --output-tree {output.tree:q} \
+            --output-node-data {output.node_data:q} \
+            --coalescent {params.coalescent:q} \
+            --timetree \
+            --date-confidence \
+            --date-inference {params.date_inference:q} \
+            --clock-filter-iqd {params.clock_filter_iqd:q} \
+            --stochastic-resolve
+          &> {log:q}
+        """
diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk
new file mode 100644
index 0000000..ac254c8
--- /dev/null
+++ b/nextclade/rules/export.smk
@@ -0,0 +1,44 @@
+"""
+This part of the workflow collects the phylogenetic tree and
+annotations to export a Nextstrain dataset.
+"""
+rule export:
+    message:
+        """
+        Exporting data files for auspice
+        """
+    input:
+        tree = "results/tree.nwk",
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        branch_lengths = "results/branch_lengths.json",
+        clades = "results/clades.json",
+        nt_muts = "results/nt_muts.json",
+        aa_muts = "results/aa_muts.json",
+        colors = config["files"]["colors"],
+        auspice_config = config["files"]["auspice_config"],
+    output:
+        auspice_json = config["files"]["auspice_json"],
+    params:
+        strain_id = config["strain_id_field"],
+        metadata_columns = config["export"]["metadata_columns"],
+    log:
+        "logs/export.txt",
+    benchmark:
+        "benchmarks/export.txt",
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.clades} \
+            --colors {input.colors:q} \
+            --metadata-columns {params.metadata_columns} \
+            --auspice-config {input.auspice_config:q} \
+            --include-root-sequence-inline \
+            --output {output.auspice_json:q} \
+          &> {log:q}
+        """
diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk
new file mode 100644
index 0000000..72eaea8
--- /dev/null
+++ b/nextclade/rules/prepare_sequences.smk
@@ -0,0 +1,73 @@
+"""
+This part of the workflow prepares sequences for constructing the
+phylogenetic tree.
+"""
+rule align_and_extract_prM_E:
+    input:
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        sequences = "../ingest/results/sequences.fasta",
+        reference = config["files"]["reference_prM-E_fasta"],
+    output:
+        sequences = "results/sequences_prM-E.fasta",
+    params:
+        min_length = config['align_and_extract_prM-E']['min_length'],
+        min_seed_cover = config['align_and_extract_prM-E']['min_seed_cover'],
+    log:
+        "logs/align_and_extract_prM_E.txt",
+    benchmark:
+        "benchmarks/align_and_extract_prM_E.txt",
+    threads: workflow.cores
+    shell:
+        """
+        nextclade3 run \
+            --jobs {threads:q} \
+            --input-ref {input.reference:q} \
+            --output-fasta {output.sequences:q} \
+            --min-seed-cover {params.min_seed_cover:q} \
+            --min-length {params.min_length:q} \
+            --silent \
+            {input.sequences:q} \
+          &> {log:q}
+        """
+
+rule filter:
+    message: """
+    Filtering to
+    * FIXME
+    """
+    input:
+        include = config["files"]["include"],
+        # TODO once this repo is fully automated and uploading data to
+        # S3, this step should download data from there instead of
+        # depending on the ingest build
+        metadata = "../ingest/results/metadata.tsv",
+        sequences = "results/sequences_prM-E.fasta",
+    output:
+        sequences = "results/aligned.fasta"
+    params:
+        group_by = config["filter"]["group_by"],
+        subsample_max_sequences = config["filter"]["subsample_max_sequences"],
+        min_date = config["filter"]["min_date"],
+        min_length = config["filter"]["min_length"],
+        strain_id = config["strain_id_field"]
+    log:
+        "logs/filter.txt",
+    benchmark:
+        "benchmarks/filter.txt"
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --exclude-all \
+            --include {input.include:q} \
+            --output {output.sequences:q} \
+            --group-by {params.group_by} \
+            --subsample-max-sequences {params.subsample_max_sequences:q} \
+            --min-date {params.min_date:q} \
+            --min-length {params.min_length:q} \
+          &> {log:q}
+        """