From 7dbb4baeb591c55ab874d7af9ca5d4c3990506f8 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Wed, 29 Nov 2023 15:31:45 -0800
Subject: [PATCH 01/16] phylo: Rename workflow/snakemake_rules/ to rules/

Part of work to update this repo to match the pathogen-repo-guide.

Simplify directory structure by putting the core Snakemake rules in
a "rules" directory.
---
 phylogenetic/README.md                                 |  2 +-
 phylogenetic/Snakefile                                 | 10 +++++-----
 .../{workflow/snakemake_rules => rules}/chores.smk     |  0
 .../{workflow/snakemake_rules => rules}/core.smk       |  0
 .../snakemake_rules => rules}/download_via_lapis.smk   |  0
 .../nextstrain_automation.smk                          |  0
 .../{workflow/snakemake_rules => rules}/prepare.smk    |  0
 7 files changed, 6 insertions(+), 6 deletions(-)
 rename phylogenetic/{workflow/snakemake_rules => rules}/chores.smk (100%)
 rename phylogenetic/{workflow/snakemake_rules => rules}/core.smk (100%)
 rename phylogenetic/{workflow/snakemake_rules => rules}/download_via_lapis.smk (100%)
 rename phylogenetic/{workflow/snakemake_rules => rules}/nextstrain_automation.smk (100%)
 rename phylogenetic/{workflow/snakemake_rules => rules}/prepare.smk (100%)

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 99c5d7c1..47a659eb 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -109,7 +109,7 @@ nextstrain view .
 ## Configuration
 
 Configuration takes place in `config/*/config.yaml` files for each build.
-The analysis pipeline is contained in `workflow/snakemake_rule/core.smk`.
+The analysis pipeline is contained in `rules/core.smk`.
 This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`.
 There is little redirection and each rule should be able to be reasoned with on its own.
 
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 285012c9..03a06e67 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -41,20 +41,20 @@ rule all:
 
 if config.get("data_source", None) == "lapis":
 
-    include: "workflow/snakemake_rules/download_via_lapis.smk"
+    include: "rules/download_via_lapis.smk"
 
 else:
 
-    include: "workflow/snakemake_rules/prepare.smk"
+    include: "rules/prepare.smk"
 
 
-include: "workflow/snakemake_rules/chores.smk"
-include: "workflow/snakemake_rules/core.smk"
+include: "rules/chores.smk"
+include: "rules/core.smk"
 
 
 if config.get("deploy_url", False):
 
-    include: "workflow/snakemake_rules/nextstrain_automation.smk"
+    include: "rules/nextstrain_automation.smk"
 
 
 # Include custom rules defined in the config.
diff --git a/phylogenetic/workflow/snakemake_rules/chores.smk b/phylogenetic/rules/chores.smk
similarity index 100%
rename from phylogenetic/workflow/snakemake_rules/chores.smk
rename to phylogenetic/rules/chores.smk
diff --git a/phylogenetic/workflow/snakemake_rules/core.smk b/phylogenetic/rules/core.smk
similarity index 100%
rename from phylogenetic/workflow/snakemake_rules/core.smk
rename to phylogenetic/rules/core.smk
diff --git a/phylogenetic/workflow/snakemake_rules/download_via_lapis.smk b/phylogenetic/rules/download_via_lapis.smk
similarity index 100%
rename from phylogenetic/workflow/snakemake_rules/download_via_lapis.smk
rename to phylogenetic/rules/download_via_lapis.smk
diff --git a/phylogenetic/workflow/snakemake_rules/nextstrain_automation.smk b/phylogenetic/rules/nextstrain_automation.smk
similarity index 100%
rename from phylogenetic/workflow/snakemake_rules/nextstrain_automation.smk
rename to phylogenetic/rules/nextstrain_automation.smk
diff --git a/phylogenetic/workflow/snakemake_rules/prepare.smk b/phylogenetic/rules/prepare.smk
similarity index 100%
rename from phylogenetic/workflow/snakemake_rules/prepare.smk
rename to phylogenetic/rules/prepare.smk

From 5bb98f1a617b6e5f51d21aa29bedd09a6b9a0b4c Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 12 Feb 2024 15:45:12 -0800
Subject: [PATCH 02/16] phylo: Rename config/ to defaults/

Part of work to update this repo to match the pathogen-repo-guide.

Renamed in preparation for adding the "build-configs" directory in
following commits to hold additional build configurations and
customizations.
---
 .github/workflows/rebuild-hmpxv1-big.yaml     |  2 +-
 .github/workflows/rebuild-hmpxv1.yaml         |  2 +-
 .github/workflows/rebuild-mpxv.yaml           |  2 +-
 phylogenetic/README.md                        |  8 +++----
 phylogenetic/Snakefile                        |  2 +-
 phylogenetic/{config => defaults}/clades.tsv  |  0
 .../{config => defaults}/color_ordering.tsv   |  0
 .../{config => defaults}/color_schemes.tsv    |  0
 .../{config => defaults}/description.md       |  2 +-
 .../exclude_accessions.txt                    |  0
 phylogenetic/{config => defaults}/genemap.gff |  0
 .../hmpxv1/auspice_config.json                |  0
 .../{config => defaults}/hmpxv1/config.yaml   | 22 +++++++++----------
 .../{config => defaults}/hmpxv1/include.txt   |  0
 .../hmpxv1_big/auspice_config.json            |  0
 .../hmpxv1_big/config.yaml                    | 22 +++++++++----------
 .../hmpxv1_big/include.txt                    |  0
 .../{config => defaults}/lat_longs.tsv        |  0
 phylogenetic/{config => defaults}/mask.bed    |  0
 .../{config => defaults}/mask_overview.bed    |  0
 .../mpxv/auspice_config.json                  |  0
 .../{config => defaults}/mpxv/config.yaml     | 22 +++++++++----------
 .../{config => defaults}/mpxv/include.txt     |  0
 .../nextstrain_automation.yaml                |  0
 .../{config => defaults}/reference.fasta      |  0
 .../{config => defaults}/reference.gb         |  0
 .../{config => defaults}/tree_mask.tsv        |  0
 phylogenetic/profiles/ci/builds.yaml          | 22 +++++++++----------
 phylogenetic/rules/core.smk                   |  4 ++--
 29 files changed, 55 insertions(+), 55 deletions(-)
 rename phylogenetic/{config => defaults}/clades.tsv (100%)
 rename phylogenetic/{config => defaults}/color_ordering.tsv (100%)
 rename phylogenetic/{config => defaults}/color_schemes.tsv (100%)
 rename phylogenetic/{config => defaults}/description.md (96%)
 rename phylogenetic/{config => defaults}/exclude_accessions.txt (100%)
 rename phylogenetic/{config => defaults}/genemap.gff (100%)
 rename phylogenetic/{config => defaults}/hmpxv1/auspice_config.json (100%)
 rename phylogenetic/{config => defaults}/hmpxv1/config.yaml (76%)
 rename phylogenetic/{config => defaults}/hmpxv1/include.txt (100%)
 rename phylogenetic/{config => defaults}/hmpxv1_big/auspice_config.json (100%)
 rename phylogenetic/{config => defaults}/hmpxv1_big/config.yaml (67%)
 rename phylogenetic/{config => defaults}/hmpxv1_big/include.txt (100%)
 rename phylogenetic/{config => defaults}/lat_longs.tsv (100%)
 rename phylogenetic/{config => defaults}/mask.bed (100%)
 rename phylogenetic/{config => defaults}/mask_overview.bed (100%)
 rename phylogenetic/{config => defaults}/mpxv/auspice_config.json (100%)
 rename phylogenetic/{config => defaults}/mpxv/config.yaml (74%)
 rename phylogenetic/{config => defaults}/mpxv/include.txt (100%)
 rename phylogenetic/{config => defaults}/nextstrain_automation.yaml (100%)
 rename phylogenetic/{config => defaults}/reference.fasta (100%)
 rename phylogenetic/{config => defaults}/reference.gb (100%)
 rename phylogenetic/{config => defaults}/tree_mask.tsv (100%)

diff --git a/.github/workflows/rebuild-hmpxv1-big.yaml b/.github/workflows/rebuild-hmpxv1-big.yaml
index 717532ab..4b115254 100644
--- a/.github/workflows/rebuild-hmpxv1-big.yaml
+++ b/.github/workflows/rebuild-hmpxv1-big.yaml
@@ -63,5 +63,5 @@ jobs:
           --env SLACK_CHANNELS \
           . \
             notify_on_deploy \
-              --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \
+              --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \
               $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile
diff --git a/.github/workflows/rebuild-hmpxv1.yaml b/.github/workflows/rebuild-hmpxv1.yaml
index 0802f522..1ed7a235 100644
--- a/.github/workflows/rebuild-hmpxv1.yaml
+++ b/.github/workflows/rebuild-hmpxv1.yaml
@@ -63,5 +63,5 @@ jobs:
           --env SLACK_CHANNELS \
           . \
             notify_on_deploy \
-            --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \
+            --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \
             $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile
diff --git a/.github/workflows/rebuild-mpxv.yaml b/.github/workflows/rebuild-mpxv.yaml
index f07f77b7..fe3d614f 100644
--- a/.github/workflows/rebuild-mpxv.yaml
+++ b/.github/workflows/rebuild-mpxv.yaml
@@ -63,5 +63,5 @@ jobs:
           --env SLACK_CHANNELS \
           . \
             notify_on_deploy \
-              --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \
+              --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \
               $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile
diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 47a659eb..e1cb1cab 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -61,19 +61,19 @@ nextstrain build . data/sequences.fasta data/metadata.tsv
 Run pipeline to produce the "overview" tree for `/mpox/all-clades` with:
 
 ```bash
-nextstrain build . --configfile config/mpxv/config.yaml
+nextstrain build . --configfile defaults/mpxv/config.yaml
 ```
 
 Run pipeline to produce the "clade IIb" tree for `/mpox/clade-IIb` with:
 
 ```bash
-nextstrain build . --configfile config/hmpxv1/config.yaml
+nextstrain build . --configfile defaults/hmpxv1/config.yaml
 ```
 
 Run pipeline to produce the "lineage B.1" tree for `/mpox/lineage-B.1` with:
 
 ```bash
-nextstrain build . --configfile config/hmpxv1_big/config.yaml
+nextstrain build . --configfile defaults/hmpxv1_big/config.yaml
 ```
 
 ### Deploy
@@ -108,7 +108,7 @@ nextstrain view .
 
 ## Configuration
 
-Configuration takes place in `config/*/config.yaml` files for each build.
+The default configuration takes place in `defaults/*/config.yaml` files for each build.
 The analysis pipeline is contained in `rules/core.smk`.
 This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`.
 There is little redirection and each rule should be able to be reasoned with on its own.
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 03a06e67..85296416 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -12,7 +12,7 @@ if version.parse(augur_version) < version.parse(min_augur_version):
 
 if not config:
 
-    configfile: "config/hmpxv1/config.yaml"
+    configfile: "defaults/hmpxv1/config.yaml"
 
 
 build_dir = "results"
diff --git a/phylogenetic/config/clades.tsv b/phylogenetic/defaults/clades.tsv
similarity index 100%
rename from phylogenetic/config/clades.tsv
rename to phylogenetic/defaults/clades.tsv
diff --git a/phylogenetic/config/color_ordering.tsv b/phylogenetic/defaults/color_ordering.tsv
similarity index 100%
rename from phylogenetic/config/color_ordering.tsv
rename to phylogenetic/defaults/color_ordering.tsv
diff --git a/phylogenetic/config/color_schemes.tsv b/phylogenetic/defaults/color_schemes.tsv
similarity index 100%
rename from phylogenetic/config/color_schemes.tsv
rename to phylogenetic/defaults/color_schemes.tsv
diff --git a/phylogenetic/config/description.md b/phylogenetic/defaults/description.md
similarity index 96%
rename from phylogenetic/config/description.md
rename to phylogenetic/defaults/description.md
index a33eb1d3..624be722 100644
--- a/phylogenetic/config/description.md
+++ b/phylogenetic/defaults/description.md
@@ -14,7 +14,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpo
 - masking several regions of the genome, including the first 1350 and last 6422 base pairs and multiple repetitive regions of variable length
 - phylogenetic reconstruction using [IQTREE-2](http://www.iqtree.org/)
 - ancestral state reconstruction and temporal inference using [TreeTime](https://github.com/neherlab/treetime)
-- clade assignment via [clade definitions defined here](https://github.com/nextstrain/mpox/blob/master/config/clades.tsv), to label broader MPXV clades I, IIa and IIb and to label hMPXV1 lineages A, A.1, A.1.1, etc...
+- clade assignment via [clade definitions defined here](https://github.com/nextstrain/mpox/blob/master/defaults/clades.tsv), to label broader MPXV clades I, IIa and IIb and to label hMPXV1 lineages A, A.1, A.1.1, etc...
 
 #### Underlying data
 We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/),
diff --git a/phylogenetic/config/exclude_accessions.txt b/phylogenetic/defaults/exclude_accessions.txt
similarity index 100%
rename from phylogenetic/config/exclude_accessions.txt
rename to phylogenetic/defaults/exclude_accessions.txt
diff --git a/phylogenetic/config/genemap.gff b/phylogenetic/defaults/genemap.gff
similarity index 100%
rename from phylogenetic/config/genemap.gff
rename to phylogenetic/defaults/genemap.gff
diff --git a/phylogenetic/config/hmpxv1/auspice_config.json b/phylogenetic/defaults/hmpxv1/auspice_config.json
similarity index 100%
rename from phylogenetic/config/hmpxv1/auspice_config.json
rename to phylogenetic/defaults/hmpxv1/auspice_config.json
diff --git a/phylogenetic/config/hmpxv1/config.yaml b/phylogenetic/defaults/hmpxv1/config.yaml
similarity index 76%
rename from phylogenetic/config/hmpxv1/config.yaml
rename to phylogenetic/defaults/hmpxv1/config.yaml
index a9b6739a..6e416e95 100644
--- a/phylogenetic/config/hmpxv1/config.yaml
+++ b/phylogenetic/defaults/hmpxv1/config.yaml
@@ -1,12 +1,12 @@
-reference: "config/reference.fasta"
-genemap: "config/genemap.gff"
-genbank_reference: "config/reference.gb"
-include: "config/hmpxv1/include.txt"
-clades: "config/clades.tsv"
-lat_longs: "config/lat_longs.tsv"
-auspice_config: "config/hmpxv1/auspice_config.json"
-description: "config/description.md"
-tree_mask: "config/tree_mask.tsv"
+reference: "defaults/reference.fasta"
+genemap: "defaults/genemap.gff"
+genbank_reference: "defaults/reference.gb"
+include: "defaults/hmpxv1/include.txt"
+clades: "defaults/clades.tsv"
+lat_longs: "defaults/lat_longs.tsv"
+auspice_config: "defaults/hmpxv1/auspice_config.json"
+description: "defaults/description.md"
+tree_mask: "defaults/tree_mask.tsv"
 
 # Use `accession` as the ID column since `strain` currently contains duplicates¹.
 # ¹ https://github.com/nextstrain/mpox/issues/33
@@ -17,7 +17,7 @@ build_name: "hmpxv1"
 auspice_name: "mpox_clade-IIb"
 
 filter:
-  exclude: "config/exclude_accessions.txt"
+  exclude: "defaults/exclude_accessions.txt"
   min_date: 2017
   min_length: 100000
 
@@ -78,4 +78,4 @@ recency: true
 mask:
   from_beginning: 800
   from_end: 6422
-  maskfile: "config/mask.bed"
+  maskfile: "defaults/mask.bed"
diff --git a/phylogenetic/config/hmpxv1/include.txt b/phylogenetic/defaults/hmpxv1/include.txt
similarity index 100%
rename from phylogenetic/config/hmpxv1/include.txt
rename to phylogenetic/defaults/hmpxv1/include.txt
diff --git a/phylogenetic/config/hmpxv1_big/auspice_config.json b/phylogenetic/defaults/hmpxv1_big/auspice_config.json
similarity index 100%
rename from phylogenetic/config/hmpxv1_big/auspice_config.json
rename to phylogenetic/defaults/hmpxv1_big/auspice_config.json
diff --git a/phylogenetic/config/hmpxv1_big/config.yaml b/phylogenetic/defaults/hmpxv1_big/config.yaml
similarity index 67%
rename from phylogenetic/config/hmpxv1_big/config.yaml
rename to phylogenetic/defaults/hmpxv1_big/config.yaml
index 513e530f..dd807541 100644
--- a/phylogenetic/config/hmpxv1_big/config.yaml
+++ b/phylogenetic/defaults/hmpxv1_big/config.yaml
@@ -1,12 +1,12 @@
-reference: "config/reference.fasta"
-genemap: "config/genemap.gff"
-genbank_reference: "config/reference.gb"
-include: "config/hmpxv1_big/include.txt"
-clades: "config/clades.tsv"
-lat_longs: "config/lat_longs.tsv"
-auspice_config: "config/hmpxv1_big/auspice_config.json"
-description: "config/description.md"
-tree_mask: "config/tree_mask.tsv"
+reference: "defaults/reference.fasta"
+genemap: "defaults/genemap.gff"
+genbank_reference: "defaults/reference.gb"
+include: "defaults/hmpxv1_big/include.txt"
+clades: "defaults/clades.tsv"
+lat_longs: "defaults/lat_longs.tsv"
+auspice_config: "defaults/hmpxv1_big/auspice_config.json"
+description: "defaults/description.md"
+tree_mask: "defaults/tree_mask.tsv"
 
 # Use `accession` as the ID column since `strain` currently contains duplicates¹.
 # ¹ https://github.com/nextstrain/mpox/issues/33
@@ -17,7 +17,7 @@ build_name: "hmpxv1_big"
 auspice_name: "mpox_lineage-B.1"
 
 filter:
-  exclude: "config/exclude_accessions.txt"
+  exclude: "defaults/exclude_accessions.txt"
   min_date: 2022
   min_length: 180000
 
@@ -57,4 +57,4 @@ recency: true
 mask:
   from_beginning: 800
   from_end: 6422
-  maskfile: "config/mask.bed"
+  maskfile: "defaults/mask.bed"
diff --git a/phylogenetic/config/hmpxv1_big/include.txt b/phylogenetic/defaults/hmpxv1_big/include.txt
similarity index 100%
rename from phylogenetic/config/hmpxv1_big/include.txt
rename to phylogenetic/defaults/hmpxv1_big/include.txt
diff --git a/phylogenetic/config/lat_longs.tsv b/phylogenetic/defaults/lat_longs.tsv
similarity index 100%
rename from phylogenetic/config/lat_longs.tsv
rename to phylogenetic/defaults/lat_longs.tsv
diff --git a/phylogenetic/config/mask.bed b/phylogenetic/defaults/mask.bed
similarity index 100%
rename from phylogenetic/config/mask.bed
rename to phylogenetic/defaults/mask.bed
diff --git a/phylogenetic/config/mask_overview.bed b/phylogenetic/defaults/mask_overview.bed
similarity index 100%
rename from phylogenetic/config/mask_overview.bed
rename to phylogenetic/defaults/mask_overview.bed
diff --git a/phylogenetic/config/mpxv/auspice_config.json b/phylogenetic/defaults/mpxv/auspice_config.json
similarity index 100%
rename from phylogenetic/config/mpxv/auspice_config.json
rename to phylogenetic/defaults/mpxv/auspice_config.json
diff --git a/phylogenetic/config/mpxv/config.yaml b/phylogenetic/defaults/mpxv/config.yaml
similarity index 74%
rename from phylogenetic/config/mpxv/config.yaml
rename to phylogenetic/defaults/mpxv/config.yaml
index d569445a..5327d6f2 100644
--- a/phylogenetic/config/mpxv/config.yaml
+++ b/phylogenetic/defaults/mpxv/config.yaml
@@ -1,12 +1,12 @@
-auspice_config: "config/mpxv/auspice_config.json"
-include: "config/mpxv/include.txt"
-reference: "config/reference.fasta"
-genemap: "config/genemap.gff"
-genbank_reference: "config/reference.gb"
-lat_longs: "config/lat_longs.tsv"
-description: "config/description.md"
-clades: "config/clades.tsv"
-tree_mask: "config/tree_mask.tsv"
+auspice_config: "defaults/mpxv/auspice_config.json"
+include: "defaults/mpxv/include.txt"
+reference: "defaults/reference.fasta"
+genemap: "defaults/genemap.gff"
+genbank_reference: "defaults/reference.gb"
+lat_longs: "defaults/lat_longs.tsv"
+description: "defaults/description.md"
+clades: "defaults/clades.tsv"
+tree_mask: "defaults/tree_mask.tsv"
 
 # Use `accession` as the ID column since `strain` currently contains duplicates¹.
 # ¹ https://github.com/nextstrain/mpox/issues/33
@@ -17,7 +17,7 @@ build_name: "mpxv"
 auspice_name: "mpox_all-clades"
 
 filter:
-  exclude: "config/exclude_accessions.txt"
+  exclude: "defaults/exclude_accessions.txt"
   min_date: 1950
   min_length: 100000
 
@@ -74,4 +74,4 @@ recency: true
 mask:
   from_beginning: 1350
   from_end: 6422
-  maskfile: "config/mask_overview.bed"
+  maskfile: "defaults/mask_overview.bed"
diff --git a/phylogenetic/config/mpxv/include.txt b/phylogenetic/defaults/mpxv/include.txt
similarity index 100%
rename from phylogenetic/config/mpxv/include.txt
rename to phylogenetic/defaults/mpxv/include.txt
diff --git a/phylogenetic/config/nextstrain_automation.yaml b/phylogenetic/defaults/nextstrain_automation.yaml
similarity index 100%
rename from phylogenetic/config/nextstrain_automation.yaml
rename to phylogenetic/defaults/nextstrain_automation.yaml
diff --git a/phylogenetic/config/reference.fasta b/phylogenetic/defaults/reference.fasta
similarity index 100%
rename from phylogenetic/config/reference.fasta
rename to phylogenetic/defaults/reference.fasta
diff --git a/phylogenetic/config/reference.gb b/phylogenetic/defaults/reference.gb
similarity index 100%
rename from phylogenetic/config/reference.gb
rename to phylogenetic/defaults/reference.gb
diff --git a/phylogenetic/config/tree_mask.tsv b/phylogenetic/defaults/tree_mask.tsv
similarity index 100%
rename from phylogenetic/config/tree_mask.tsv
rename to phylogenetic/defaults/tree_mask.tsv
diff --git a/phylogenetic/profiles/ci/builds.yaml b/phylogenetic/profiles/ci/builds.yaml
index 08dd36c8..797154f4 100644
--- a/phylogenetic/profiles/ci/builds.yaml
+++ b/phylogenetic/profiles/ci/builds.yaml
@@ -1,15 +1,15 @@
 custom_rules:
   - profiles/ci/copy_example_data.smk
 
-reference: "config/reference.fasta"
-genemap: "config/genemap.gff"
-genbank_reference: "config/reference.gb"
-include: "config/hmpxv1/include.txt"
-clades: "config/clades.tsv"
-lat_longs: "config/lat_longs.tsv"
-auspice_config: "config/hmpxv1/auspice_config.json"
-description: "config/description.md"
-tree_mask: "config/tree_mask.tsv"
+reference: "defaults/reference.fasta"
+genemap: "defaults/genemap.gff"
+genbank_reference: "defaults/reference.gb"
+include: "defaults/hmpxv1/include.txt"
+clades: "defaults/clades.tsv"
+lat_longs: "defaults/lat_longs.tsv"
+auspice_config: "defaults/hmpxv1/auspice_config.json"
+description: "defaults/description.md"
+tree_mask: "defaults/tree_mask.tsv"
 
 # Use `accession` as the ID column since `strain` currently contains duplicates¹.
 # ¹ https://github.com/nextstrain/monkeypox/issues/33
@@ -20,7 +20,7 @@ build_name: "hmpxv1"
 auspice_name: "mpox_clade-IIb"
 
 filter:
-  exclude: "config/exclude_accessions.txt"
+  exclude: "defaults/exclude_accessions.txt"
   min_date: 2017
   min_length: 100000
 
@@ -81,4 +81,4 @@ recency: true
 mask:
   from_beginning: 800
   from_end: 6422
-  maskfile: "config/mask.bed"
+  maskfile: "defaults/mask.bed"
diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk
index 474f3a7b..75fc17f0 100644
--- a/phylogenetic/rules/core.smk
+++ b/phylogenetic/rules/core.smk
@@ -419,8 +419,8 @@ rule recency:
 
 rule colors:
     input:
-        ordering="config/color_ordering.tsv",
-        color_schemes="config/color_schemes.tsv",
+        ordering="defaults/color_ordering.tsv",
+        color_schemes="defaults/color_schemes.tsv",
         metadata=build_dir + "/{build_name}/metadata.tsv",
     output:
         colors=build_dir + "/{build_name}/colors.tsv",

From 6eb2c38d5986ebea5599a2096fee04ec80e11c20 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 12 Feb 2024 17:04:09 -0800
Subject: [PATCH 03/16] phylo: Add build-configs directory

Part of work to update this repo to match the pathogen-repo-guide.

Moves the exisitng CI build config and customizations to the new
"build-configs" directory. Includes renaming of the `builds.yaml` to
`config.yaml` since the file no longer clashes with Snakemake Profiles.
---
 .github/workflows/ci.yaml                                 | 2 +-
 phylogenetic/README.md                                    | 8 +++++++-
 .../ci/builds.yaml => build-configs/ci/config.yaml}       | 2 +-
 .../{profiles => build-configs}/ci/copy_example_data.smk  | 0
 4 files changed, 9 insertions(+), 3 deletions(-)
 rename phylogenetic/{profiles/ci/builds.yaml => build-configs/ci/config.yaml} (97%)
 rename phylogenetic/{profiles => build-configs}/ci/copy_example_data.smk (100%)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f4b3e601..91325533 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -21,7 +21,7 @@ jobs:
       run: |
         nextstrain build \
           phylogenetic \
-          --configfile profiles/ci/builds.yaml
+          --configfiles build-configs/ci/config.yaml
       artifact-name: output-${{ matrix.runtime }}
       artifact-paths: |
         phylogenetic/auspice/
diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index e1cb1cab..6f1166d6 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -28,7 +28,7 @@ Once you've run the build, you can view the results with:
 You can run an example build using the example data provided in this repository via:
 
 ```
-nextstrain build .  --configfile profiles/ci/builds.yaml
+nextstrain build .  --configfile build-configs/ci/config.yaml
 ```
 
 When the build has finished running, view the output Auspice trees via:
@@ -113,6 +113,12 @@ The analysis pipeline is contained in `rules/core.smk`.
 This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`.
 There is little redirection and each rule should be able to be reasoned with on its own.
 
+### Custom build configs
+
+The build-configs directory contains configs and customizations that override and/or extend the default workflow.
+
+- [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/).
+
 ## Update example data
 
 [Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). It can also be used as a small subset of real-world data.
diff --git a/phylogenetic/profiles/ci/builds.yaml b/phylogenetic/build-configs/ci/config.yaml
similarity index 97%
rename from phylogenetic/profiles/ci/builds.yaml
rename to phylogenetic/build-configs/ci/config.yaml
index 797154f4..de6d5ed0 100644
--- a/phylogenetic/profiles/ci/builds.yaml
+++ b/phylogenetic/build-configs/ci/config.yaml
@@ -1,5 +1,5 @@
 custom_rules:
-  - profiles/ci/copy_example_data.smk
+  - build-configs/ci/copy_example_data.smk
 
 reference: "defaults/reference.fasta"
 genemap: "defaults/genemap.gff"
diff --git a/phylogenetic/profiles/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk
similarity index 100%
rename from phylogenetic/profiles/ci/copy_example_data.smk
rename to phylogenetic/build-configs/ci/copy_example_data.smk

From 2d8493c37b6e15980761897affc1eee2f4a78ba1 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 12 Feb 2024 17:30:05 -0800
Subject: [PATCH 04/16] phylo: move chores to build-configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Part of work to update this repo to match the pathogen-repo-guide.

Since the chores rules are internal Nextstrain rules, they do not need
to be part of the core workflow. This also resolves the DAG confusion
that Snakemake occassionally runs into when running CI locally.¹

README.md includes the new instructions on how to invoke the workflow to
update the example data. This requires two config files:

1. The CI config to provide all required config params and to ensure
the example data uses correct `strain_id_field` for CI builds.
2. The chores config to include the custom rules

¹ https://github.com/nextstrain/mpox/issues/237
---
 phylogenetic/README.md                                 | 10 +++++++---
 phylogenetic/Snakefile                                 |  1 -
 .../{rules => build-configs/chores}/chores.smk         |  3 +++
 phylogenetic/build-configs/chores/config.yaml          |  2 ++
 4 files changed, 12 insertions(+), 4 deletions(-)
 rename phylogenetic/{rules => build-configs/chores}/chores.smk (82%)
 create mode 100644 phylogenetic/build-configs/chores/config.yaml

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 6f1166d6..0920e720 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -117,16 +117,20 @@ There is little redirection and each rule should be able to be reasoned with on
 
 The build-configs directory contains configs and customizations that override and/or extend the default workflow.
 
+- [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data).
 - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/).
 
 ## Update example data
 
-[Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). It can also be used as a small subset of real-world data.
+[Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml).
+It can also be used as a small subset of real-world data.
 
-Example data should be updated every time metadata schema is changed or a new clade/lineage emerges. To update, run:
+Example data should be updated every time metadata schema is changed or a new clade/lineage emerges.
+To update, run:
 
 ```sh
-nextstrain build . update_example_data -F
+nextstrain build . update_example_data -F \
+    --configfiles build-configs/ci/config.yaml build-configs/chores/config.yaml
 ```
 
 ## Data use
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 85296416..1f6aaf17 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -48,7 +48,6 @@ else:
     include: "rules/prepare.smk"
 
 
-include: "rules/chores.smk"
 include: "rules/core.smk"
 
 
diff --git a/phylogenetic/rules/chores.smk b/phylogenetic/build-configs/chores/chores.smk
similarity index 82%
rename from phylogenetic/rules/chores.smk
rename to phylogenetic/build-configs/chores/chores.smk
index bad47d7a..2998701f 100644
--- a/phylogenetic/rules/chores.smk
+++ b/phylogenetic/build-configs/chores/chores.smk
@@ -1,3 +1,6 @@
+# I was hoping to use the Snakemake `default_target` directive to make this the
+# default target when including this rule via `custom_rules`, but that is
+# currently not possible: https://github.com/snakemake/snakemake/issues/2056
 rule update_example_data:
     """This updates the files under example_data/ based on latest available data from data.nextstrain.org.
 
diff --git a/phylogenetic/build-configs/chores/config.yaml b/phylogenetic/build-configs/chores/config.yaml
new file mode 100644
index 00000000..cc316a83
--- /dev/null
+++ b/phylogenetic/build-configs/chores/config.yaml
@@ -0,0 +1,2 @@
+custom_rules:
+  - build-configs/chores/chores.smk

From 63c5e5616ca5226386d170a7a194ffcfd63ea8f0 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 10:25:50 -0800
Subject: [PATCH 05/16] phylo: move nextstrain_automation to build-configs

Part of work to update this repo to match the pathogen-repo-guide.
---
 .github/workflows/rebuild-hmpxv1-big.yaml                    | 2 +-
 .github/workflows/rebuild-hmpxv1.yaml                        | 2 +-
 .github/workflows/rebuild-mpxv.yaml                          | 2 +-
 phylogenetic/README.md                                       | 1 +
 phylogenetic/Snakefile                                       | 5 -----
 .../nextstrain-automation/config.yaml}                       | 3 +++
 .../nextstrain-automation/nextstrain-automation.smk}         | 0
 7 files changed, 7 insertions(+), 8 deletions(-)
 rename phylogenetic/{defaults/nextstrain_automation.yaml => build-configs/nextstrain-automation/config.yaml} (66%)
 rename phylogenetic/{rules/nextstrain_automation.smk => build-configs/nextstrain-automation/nextstrain-automation.smk} (100%)

diff --git a/.github/workflows/rebuild-hmpxv1-big.yaml b/.github/workflows/rebuild-hmpxv1-big.yaml
index 4b115254..5146f291 100644
--- a/.github/workflows/rebuild-hmpxv1-big.yaml
+++ b/.github/workflows/rebuild-hmpxv1-big.yaml
@@ -63,5 +63,5 @@ jobs:
           --env SLACK_CHANNELS \
           . \
             notify_on_deploy \
-              --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \
+              --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \
               $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile
diff --git a/.github/workflows/rebuild-hmpxv1.yaml b/.github/workflows/rebuild-hmpxv1.yaml
index 1ed7a235..3db74076 100644
--- a/.github/workflows/rebuild-hmpxv1.yaml
+++ b/.github/workflows/rebuild-hmpxv1.yaml
@@ -63,5 +63,5 @@ jobs:
           --env SLACK_CHANNELS \
           . \
             notify_on_deploy \
-            --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \
+            --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \
             $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile
diff --git a/.github/workflows/rebuild-mpxv.yaml b/.github/workflows/rebuild-mpxv.yaml
index fe3d614f..2c352227 100644
--- a/.github/workflows/rebuild-mpxv.yaml
+++ b/.github/workflows/rebuild-mpxv.yaml
@@ -63,5 +63,5 @@ jobs:
           --env SLACK_CHANNELS \
           . \
             notify_on_deploy \
-              --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \
+              --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \
               $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile
diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 0920e720..1a17a42f 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -119,6 +119,7 @@ The build-configs directory contains configs and customizations that override an
 
 - [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data).
 - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/).
+- [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds
 
 ## Update example data
 
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 1f6aaf17..a84bd5f2 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -51,11 +51,6 @@ else:
 include: "rules/core.smk"
 
 
-if config.get("deploy_url", False):
-
-    include: "rules/nextstrain_automation.smk"
-
-
 # Include custom rules defined in the config.
 if "custom_rules" in config:
     for rule_file in config["custom_rules"]:
diff --git a/phylogenetic/defaults/nextstrain_automation.yaml b/phylogenetic/build-configs/nextstrain-automation/config.yaml
similarity index 66%
rename from phylogenetic/defaults/nextstrain_automation.yaml
rename to phylogenetic/build-configs/nextstrain-automation/config.yaml
index d0389c67..cdcda6de 100644
--- a/phylogenetic/defaults/nextstrain_automation.yaml
+++ b/phylogenetic/build-configs/nextstrain-automation/config.yaml
@@ -1,5 +1,8 @@
 # Optional configs to include for automated Nextstrain builds
 # Intended to be used internally by the Nextstrain team
 
+custom_rules:
+  - build-configs/nextstrain-automation/nextstrain-automation.smk
+
 # deploy
 deploy_url: "s3://nextstrain-data"
diff --git a/phylogenetic/rules/nextstrain_automation.smk b/phylogenetic/build-configs/nextstrain-automation/nextstrain-automation.smk
similarity index 100%
rename from phylogenetic/rules/nextstrain_automation.smk
rename to phylogenetic/build-configs/nextstrain-automation/nextstrain-automation.smk

From b0bbd39e89bc68a455fdbf4473dab8b8d4727500 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 10:57:11 -0800
Subject: [PATCH 06/16] phylo: Move lapis to build-configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Part of work to update this repo to match the pathogen-repo-guide.

LAPIS was the default data source until we switched to using
data.nextstrain.org.¹ This commit moves the LAPIS related rules to a
custom build-config and is able to successfully override the default
download/decompress rules. However, there are downstream changes to the
workflow that cause errors with the LAPIS data, e.g. hard-coded filters
on columns do not exist in the LAPIS data.²

Rather than spending the time to make sure the workflow runs with the
LAPIS data, I will remove the lapis build-configs in the next commit.
I wanted to do so separately to be able to easily revert the removal
if needed.

¹ https://github.com/nextstrain/mpox/commit/5d0791a06315eabb1eba43f6889996f0dcf824b2
² https://github.com/nextstrain/mpox/blob/34feb4a4e6f4d8b7929cc701734b15b8a5e6a6fc/phylogenetic/workflow/snakemake_rules/core.smk#L43
---
 phylogenetic/README.md                                 |  1 +
 phylogenetic/Snakefile                                 | 10 +---------
 phylogenetic/build-configs/lapis/config.yaml           |  2 ++
 .../lapis}/download_via_lapis.smk                      |  4 ++++
 4 files changed, 8 insertions(+), 9 deletions(-)
 create mode 100644 phylogenetic/build-configs/lapis/config.yaml
 rename phylogenetic/{rules => build-configs/lapis}/download_via_lapis.smk (73%)

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 1a17a42f..00242afd 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -119,6 +119,7 @@ The build-configs directory contains configs and customizations that override an
 
 - [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data).
 - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/).
+- [lapis](build-configs/lapis/) - builds that use LAPIS as the data source.
 - [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds
 
 ## Update example data
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index a84bd5f2..c851cf7a 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -39,15 +39,7 @@ rule all:
         """
 
 
-if config.get("data_source", None) == "lapis":
-
-    include: "rules/download_via_lapis.smk"
-
-else:
-
-    include: "rules/prepare.smk"
-
-
+include: "rules/prepare.smk"
 include: "rules/core.smk"
 
 
diff --git a/phylogenetic/build-configs/lapis/config.yaml b/phylogenetic/build-configs/lapis/config.yaml
new file mode 100644
index 00000000..5e918251
--- /dev/null
+++ b/phylogenetic/build-configs/lapis/config.yaml
@@ -0,0 +1,2 @@
+custom_rules:
+  - build-configs/lapis/download_via_lapis.smk
diff --git a/phylogenetic/rules/download_via_lapis.smk b/phylogenetic/build-configs/lapis/download_via_lapis.smk
similarity index 73%
rename from phylogenetic/rules/download_via_lapis.smk
rename to phylogenetic/build-configs/lapis/download_via_lapis.smk
index 55c7ae9f..57c9927b 100644
--- a/phylogenetic/rules/download_via_lapis.smk
+++ b/phylogenetic/build-configs/lapis/download_via_lapis.smk
@@ -16,3 +16,7 @@ rule download_metadata_via_lapis:
             tr -d "\r" |
             sed -E 's/("([^"]*)")?,/\\2\\t/g' > {output.metadata}
         """
+
+# Override the default download/decompress rules to download data from LAPIS
+ruleorder: download_sequences_via_lapis > decompress
+ruleorder: download_metadata_via_lapis > decompress

From 307c3cd6a61cbb22c74bad1f4d68ea93caf39ad6 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 11:19:19 -0800
Subject: [PATCH 07/16] phylo: Remove build-configs/lapis

Removing build-configs/lapis instead of spending the time to make sure
the workflow runs with the LAPIS data.

If others need/want to use the LAPIS data, this custom build config can
be easily restored.
---
 phylogenetic/README.md                        |  1 -
 phylogenetic/build-configs/lapis/config.yaml  |  2 --
 .../lapis/download_via_lapis.smk              | 22 -------------------
 3 files changed, 25 deletions(-)
 delete mode 100644 phylogenetic/build-configs/lapis/config.yaml
 delete mode 100644 phylogenetic/build-configs/lapis/download_via_lapis.smk

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 00242afd..1a17a42f 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -119,7 +119,6 @@ The build-configs directory contains configs and customizations that override an
 
 - [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data).
 - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/).
-- [lapis](build-configs/lapis/) - builds that use LAPIS as the data source.
 - [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds
 
 ## Update example data
diff --git a/phylogenetic/build-configs/lapis/config.yaml b/phylogenetic/build-configs/lapis/config.yaml
deleted file mode 100644
index 5e918251..00000000
--- a/phylogenetic/build-configs/lapis/config.yaml
+++ /dev/null
@@ -1,2 +0,0 @@
-custom_rules:
-  - build-configs/lapis/download_via_lapis.smk
diff --git a/phylogenetic/build-configs/lapis/download_via_lapis.smk b/phylogenetic/build-configs/lapis/download_via_lapis.smk
deleted file mode 100644
index 57c9927b..00000000
--- a/phylogenetic/build-configs/lapis/download_via_lapis.smk
+++ /dev/null
@@ -1,22 +0,0 @@
-rule download_sequences_via_lapis:
-    output:
-        sequences="data/sequences.fasta",
-    shell:
-        """
-        curl https://mpox-lapis.genspectrum.org/v1/sample/fasta --output {output.sequences}
-        """
-
-
-rule download_metadata_via_lapis:
-    output:
-        metadata="data/metadata.tsv",
-    shell:
-        """
-        curl https://mpox-lapis.genspectrum.org/v1/sample/details?dataFormat=csv | \
-            tr -d "\r" |
-            sed -E 's/("([^"]*)")?,/\\2\\t/g' > {output.metadata}
-        """
-
-# Override the default download/decompress rules to download data from LAPIS
-ruleorder: download_sequences_via_lapis > decompress
-ruleorder: download_metadata_via_lapis > decompress

From 4595120d001bdeafff9d7b01cafd6220470be7c9 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 11:51:59 -0800
Subject: [PATCH 08/16] phylo: Rename prepare.smk to prepare_sequences.smk

Part of work to update this repo to match the pathogen-repo-guide.

Following commit will move other rules for preparing sequences from
core.smk to prepare_sequences.smk.
---
 phylogenetic/Snakefile                                    | 2 +-
 phylogenetic/rules/{prepare.smk => prepare_sequences.smk} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename phylogenetic/rules/{prepare.smk => prepare_sequences.smk} (100%)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index c851cf7a..0e2753ef 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -39,7 +39,7 @@ rule all:
         """
 
 
-include: "rules/prepare.smk"
+include: "rules/prepare_sequences.smk"
 include: "rules/core.smk"
 
 
diff --git a/phylogenetic/rules/prepare.smk b/phylogenetic/rules/prepare_sequences.smk
similarity index 100%
rename from phylogenetic/rules/prepare.smk
rename to phylogenetic/rules/prepare_sequences.smk

From 730bf494cb254af2a20f3144926b03f9c9d54a3f Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 11:56:08 -0800
Subject: [PATCH 09/16] phylo: Move rules to prepare_sequences.smk

Part of work to update this repo to match the pathogen-repo-guide.
---
 phylogenetic/rules/core.smk              | 159 --------------------
 phylogenetic/rules/prepare_sequences.smk | 176 +++++++++++++++++++++++
 2 files changed, 176 insertions(+), 159 deletions(-)

diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk
index 75fc17f0..af817f4a 100644
--- a/phylogenetic/rules/core.smk
+++ b/phylogenetic/rules/core.smk
@@ -13,165 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
 """
 
 
-rule filter:
-    """
-    Removing strains that do not satisfy certain requirements.
-    """
-    input:
-        sequences="data/sequences.fasta",
-        metadata="data/metadata.tsv",
-    output:
-        sequences=build_dir + "/{build_name}/good_sequences.fasta",
-        metadata=build_dir + "/{build_name}/good_metadata.tsv",
-        log=build_dir + "/{build_name}/good_filter.log",
-    params:
-        exclude=config["filter"]["exclude"],
-        min_date=config["filter"]["min_date"],
-        min_length=config["filter"]["min_length"],
-        strain_id=config["strain_id_field"],
-    shell:
-        """
-        augur filter \
-            --sequences {input.sequences} \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --output-sequences {output.sequences} \
-            --output-metadata {output.metadata} \
-            --exclude {params.exclude} \
-            --min-date {params.min_date} \
-            --min-length {params.min_length} \
-            --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \
-            --output-log {output.log}
-        """
-
-
-rule subsample:
-    input:
-        metadata=rules.filter.output.metadata,
-    output:
-        strains=build_dir + "/{build_name}/{sample}_strains.txt",
-        log=build_dir + "/{build_name}/{sample}_filter.log",
-    params:
-        group_by=lambda w: config["subsample"][w.sample]["group_by"],
-        sequences_per_group=lambda w: config["subsample"][w.sample][
-            "sequences_per_group"
-        ],
-        other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""),
-        exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
-        if "exclude_lineages" in config["subsample"][w.sample]
-        else "",
-        strain_id=config["strain_id_field"],
-    shell:
-        """
-        augur filter \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --output-strains {output.strains} \
-            {params.group_by} \
-            {params.sequences_per_group} \
-            {params.exclude} \
-            {params.other_filters} \
-            --output-log {output.log}
-        """
-
-
-rule combine_samples:
-    input:
-        strains=lambda w: [
-            f"{build_dir}/{w.build_name}/{sample}_strains.txt"
-            for sample in config["subsample"]
-        ],
-        sequences=rules.filter.output.sequences,
-        metadata=rules.filter.output.metadata,
-        include=config["include"],
-    output:
-        sequences=build_dir + "/{build_name}/filtered.fasta",
-        metadata=build_dir + "/{build_name}/metadata.tsv",
-    params:
-        strain_id=config["strain_id_field"],
-    shell:
-        """
-        augur filter \
-            --metadata-id-columns {params.strain_id} \
-            --sequences {input.sequences} \
-            --metadata {input.metadata} \
-            --exclude-all \
-            --include {input.strains} {input.include}\
-            --output-sequences {output.sequences} \
-            --output-metadata {output.metadata}
-        """
-
-
-rule reverse_reverse_complements:
-    input:
-        metadata=build_dir + "/{build_name}/metadata.tsv",
-        sequences=build_dir + "/{build_name}/filtered.fasta",
-    output:
-        build_dir + "/{build_name}/reversed.fasta",
-    shell:
-        """
-        python3 scripts/reverse_reversed_sequences.py \
-            --metadata {input.metadata} \
-            --sequences {input.sequences} \
-            --output {output}
-        """
-
-
-rule align:
-    """
-    Aligning sequences to {input.reference}
-      - filling gaps with N
-    """
-    input:
-        sequences=rules.reverse_reverse_complements.output,
-        reference=config["reference"],
-        genemap=config["genemap"],
-    output:
-        alignment=build_dir + "/{build_name}/aligned.fasta",
-        insertions=build_dir + "/{build_name}/insertions.fasta",
-    params:
-        max_indel=config["max_indel"],
-        seed_spacing=config["seed_spacing"],
-    threads: workflow.cores
-    shell:
-        """
-        nextalign run \
-            --jobs {threads} \
-            --reference {input.reference} \
-            --genemap {input.genemap} \
-            --max-indel {params.max_indel} \
-            --seed-spacing {params.seed_spacing} \
-            --retry-reverse-complement \
-            --output-fasta - \
-            --output-insertions {output.insertions} \
-            {input.sequences} | seqkit seq -i > {output.alignment}
-        """
-
-
-rule mask:
-    """
-    Mask ends of the alignment:
-      - from start: {params.from_start}
-      - from end: {params.from_end}
-    """
-    input:
-        sequences=build_dir + "/{build_name}/aligned.fasta",
-        mask=config["mask"]["maskfile"],
-    output:
-        build_dir + "/{build_name}/masked.fasta",
-    params:
-        from_start=config["mask"]["from_beginning"],
-        from_end=config["mask"]["from_end"],
-    shell:
-        """
-        augur mask \
-            --sequences {input.sequences} \
-            --mask {input.mask} \
-            --mask-from-beginning {params.from_start} \
-            --mask-from-end {params.from_end} --output {output}
-        """
-
-
 rule tree:
     """
     Building tree
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index 60f65d69..b87820c3 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -1,3 +1,20 @@
+"""
+This part of the workflow prepares sequences for constructing the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    include     = path to file of sequences to in force include
+    reference   = path to reference sequence FASTA for Nextclade alignment
+    genemap     = path to genemap GFF for Nextclade alignment
+    maskfile    = path to maskfile of sites to be masked
+
+OUTPUTS:
+
+    prepared_sequences = {build_dir}/{build_name}/masked.fasta
+
+"""
+
+
 rule download:
     """
     Downloading sequences and metadata from data.nextstrain.org
@@ -30,3 +47,162 @@ rule decompress:
         gzip --decompress --keep {input.metadata}
         xz --decompress --keep {input.sequences}
         """
+
+
+rule filter:
+    """
+    Removing strains that do not satisfy certain requirements.
+    """
+    input:
+        sequences="data/sequences.fasta",
+        metadata="data/metadata.tsv",
+    output:
+        sequences=build_dir + "/{build_name}/good_sequences.fasta",
+        metadata=build_dir + "/{build_name}/good_metadata.tsv",
+        log=build_dir + "/{build_name}/good_filter.log",
+    params:
+        exclude=config["filter"]["exclude"],
+        min_date=config["filter"]["min_date"],
+        min_length=config["filter"]["min_length"],
+        strain_id=config["strain_id_field"],
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output-sequences {output.sequences} \
+            --output-metadata {output.metadata} \
+            --exclude {params.exclude} \
+            --min-date {params.min_date} \
+            --min-length {params.min_length} \
+            --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \
+            --output-log {output.log}
+        """
+
+
+rule subsample:
+    input:
+        metadata=rules.filter.output.metadata,
+    output:
+        strains=build_dir + "/{build_name}/{sample}_strains.txt",
+        log=build_dir + "/{build_name}/{sample}_filter.log",
+    params:
+        group_by=lambda w: config["subsample"][w.sample]["group_by"],
+        sequences_per_group=lambda w: config["subsample"][w.sample][
+            "sequences_per_group"
+        ],
+        other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""),
+        exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
+        if "exclude_lineages" in config["subsample"][w.sample]
+        else "",
+        strain_id=config["strain_id_field"],
+    shell:
+        """
+        augur filter \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output-strains {output.strains} \
+            {params.group_by} \
+            {params.sequences_per_group} \
+            {params.exclude} \
+            {params.other_filters} \
+            --output-log {output.log}
+        """
+
+
+rule combine_samples:
+    input:
+        strains=lambda w: [
+            f"{build_dir}/{w.build_name}/{sample}_strains.txt"
+            for sample in config["subsample"]
+        ],
+        sequences=rules.filter.output.sequences,
+        metadata=rules.filter.output.metadata,
+        include=config["include"],
+    output:
+        sequences=build_dir + "/{build_name}/filtered.fasta",
+        metadata=build_dir + "/{build_name}/metadata.tsv",
+    params:
+        strain_id=config["strain_id_field"],
+    shell:
+        """
+        augur filter \
+            --metadata-id-columns {params.strain_id} \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --exclude-all \
+            --include {input.strains} {input.include}\
+            --output-sequences {output.sequences} \
+            --output-metadata {output.metadata}
+        """
+
+
+rule reverse_reverse_complements:
+    input:
+        metadata=build_dir + "/{build_name}/metadata.tsv",
+        sequences=build_dir + "/{build_name}/filtered.fasta",
+    output:
+        build_dir + "/{build_name}/reversed.fasta",
+    shell:
+        """
+        python3 scripts/reverse_reversed_sequences.py \
+            --metadata {input.metadata} \
+            --sequences {input.sequences} \
+            --output {output}
+        """
+
+
+rule align:
+    """
+    Aligning sequences to {input.reference}
+      - filling gaps with N
+    """
+    input:
+        sequences=rules.reverse_reverse_complements.output,
+        reference=config["reference"],
+        genemap=config["genemap"],
+    output:
+        alignment=build_dir + "/{build_name}/aligned.fasta",
+        insertions=build_dir + "/{build_name}/insertions.fasta",
+    params:
+        max_indel=config["max_indel"],
+        seed_spacing=config["seed_spacing"],
+    threads: workflow.cores
+    shell:
+        """
+        nextalign run \
+            --jobs {threads} \
+            --reference {input.reference} \
+            --genemap {input.genemap} \
+            --max-indel {params.max_indel} \
+            --seed-spacing {params.seed_spacing} \
+            --retry-reverse-complement \
+            --output-fasta - \
+            --output-insertions {output.insertions} \
+            {input.sequences} | seqkit seq -i > {output.alignment}
+        """
+
+
+rule mask:
+    """
+    Mask ends of the alignment:
+      - from start: {params.from_start}
+      - from end: {params.from_end}
+    """
+    input:
+        sequences=build_dir + "/{build_name}/aligned.fasta",
+        mask=config["mask"]["maskfile"],
+    output:
+        build_dir + "/{build_name}/masked.fasta",
+    params:
+        from_start=config["mask"]["from_beginning"],
+        from_end=config["mask"]["from_end"],
+    shell:
+        """
+        augur mask \
+            --sequences {input.sequences} \
+            --mask {input.mask} \
+            --mask-from-beginning {params.from_start} \
+            --mask-from-end {params.from_end} --output {output}
+        """

From 11dcb7b731c7d5955e89d3273db123f1bb2b55be Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 12:28:29 -0800
Subject: [PATCH 10/16] phylo: Move rules to construct_phylogeny.smk

Part of work to update this repo to match the pathogen-repo-guide.
---
 phylogenetic/Snakefile                     |   1 +
 phylogenetic/rules/construct_phylogeny.smk | 111 +++++++++++++++++++++
 phylogenetic/rules/core.smk                |  96 ------------------
 3 files changed, 112 insertions(+), 96 deletions(-)
 create mode 100644 phylogenetic/rules/construct_phylogeny.smk

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 0e2753ef..d6add2bd 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -40,6 +40,7 @@ rule all:
 
 
 include: "rules/prepare_sequences.smk"
+include: "rules/construct_phylogeny.smk"
 include: "rules/core.smk"
 
 
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
new file mode 100644
index 00000000..215c6a9a
--- /dev/null
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -0,0 +1,111 @@
+"""
+This part of the workflow constructs the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    sequences   = {build_dir}/{build_name}/masked.fasta
+    metadata    = {build_dir}/{build_name}/metadata.tsv
+    tree_mask   = path to maskfile of sites to exclude for tree building
+
+OUTPUTS:
+
+    tree            = {build_dir}/{build_name}/tree.nwk
+    branch_lengths  = {build_dir}/{build_name}/branch_lengths.json
+
+"""
+
+
+rule tree:
+    """
+    Building tree
+    """
+    input:
+        alignment=build_dir + "/{build_name}/masked.fasta",
+        tree_mask=config["tree_mask"],
+    output:
+        tree=build_dir + "/{build_name}/tree_raw.nwk",
+    threads: workflow.cores
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --exclude-sites {input.tree_mask} \
+            --tree-builder-args="-redo" \
+            --output {output.tree} \
+            --nthreads {threads}
+        """
+
+
+rule fix_tree:
+    """
+    Fixing tree
+    """
+    input:
+        tree=rules.tree.output.tree,
+        alignment=build_dir + "/{build_name}/masked.fasta",
+    output:
+        tree=build_dir + "/{build_name}/tree_fixed.nwk",
+    params:
+        root=lambda w: config.get("treefix_root", ""),
+    shell:
+        """
+        python3 scripts/fix_tree.py \
+            --alignment {input.alignment} \
+            --input-tree {input.tree} \
+            {params.root} \
+            --output {output.tree}
+        """
+
+
+rule refine:
+    """
+    Refining tree
+        - estimate timetree
+        - use {params.coalescent} coalescent timescale
+        - estimate {params.date_inference} node dates
+        - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+    """
+    input:
+        tree=rules.fix_tree.output.tree
+        if config["fix_tree"]
+        else rules.tree.output.tree,
+        alignment=build_dir + "/{build_name}/masked.fasta",
+        metadata=build_dir + "/{build_name}/metadata.tsv",
+    output:
+        tree=build_dir + "/{build_name}/tree.nwk",
+        node_data=build_dir + "/{build_name}/branch_lengths.json",
+    params:
+        coalescent="opt",
+        date_inference="marginal",
+        clock_filter_iqd=0,
+        root=config["root"],
+        clock_rate=f"--clock-rate {config['clock_rate']}"
+        if "clock_rate" in config
+        else "",
+        clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}"
+        if "clock_std_dev" in config
+        else "",
+        strain_id=config["strain_id_field"],
+        divergence_units=config["divergence_units"],
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output-tree {output.tree} \
+            --timetree \
+            --root {params.root} \
+            --precision 3 \
+            --keep-polytomies \
+            --use-fft \
+            {params.clock_rate} \
+            {params.clock_std_dev} \
+            --output-node-data {output.node_data} \
+            --coalescent {params.coalescent} \
+            --date-inference {params.date_inference} \
+            --date-confidence \
+            --divergence-units {params.divergence_units} \
+            --clock-filter-iqd {params.clock_filter_iqd}
+        """
diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk
index af817f4a..feed9fc0 100644
--- a/phylogenetic/rules/core.smk
+++ b/phylogenetic/rules/core.smk
@@ -13,102 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
 """
 
 
-rule tree:
-    """
-    Building tree
-    """
-    input:
-        alignment=build_dir + "/{build_name}/masked.fasta",
-        tree_mask=config["tree_mask"],
-    output:
-        tree=build_dir + "/{build_name}/tree_raw.nwk",
-    threads: workflow.cores
-    shell:
-        """
-        augur tree \
-            --alignment {input.alignment} \
-            --exclude-sites {input.tree_mask} \
-            --tree-builder-args="-redo" \
-            --output {output.tree} \
-            --nthreads {threads}
-        """
-
-
-rule fix_tree:
-    """
-    Fixing tree
-    """
-    input:
-        tree=rules.tree.output.tree,
-        alignment=build_dir + "/{build_name}/masked.fasta",
-    output:
-        tree=build_dir + "/{build_name}/tree_fixed.nwk",
-    params:
-        root=lambda w: config.get("treefix_root", ""),
-    shell:
-        """
-        python3 scripts/fix_tree.py \
-            --alignment {input.alignment} \
-            --input-tree {input.tree} \
-            {params.root} \
-            --output {output.tree}
-        """
-
-
-rule refine:
-    """
-    Refining tree
-        - estimate timetree
-        - use {params.coalescent} coalescent timescale
-        - estimate {params.date_inference} node dates
-        - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
-    """
-    input:
-        tree=rules.fix_tree.output.tree
-        if config["fix_tree"]
-        else rules.tree.output.tree,
-        alignment=build_dir + "/{build_name}/masked.fasta",
-        metadata=build_dir + "/{build_name}/metadata.tsv",
-    output:
-        tree=build_dir + "/{build_name}/tree.nwk",
-        node_data=build_dir + "/{build_name}/branch_lengths.json",
-    params:
-        coalescent="opt",
-        date_inference="marginal",
-        clock_filter_iqd=0,
-        root=config["root"],
-        clock_rate=f"--clock-rate {config['clock_rate']}"
-        if "clock_rate" in config
-        else "",
-        clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}"
-        if "clock_std_dev" in config
-        else "",
-        strain_id=config["strain_id_field"],
-        divergence_units=config["divergence_units"],
-    shell:
-        """
-        augur refine \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --output-tree {output.tree} \
-            --timetree \
-            --root {params.root} \
-            --precision 3 \
-            --keep-polytomies \
-            --use-fft \
-            {params.clock_rate} \
-            {params.clock_std_dev} \
-            --output-node-data {output.node_data} \
-            --coalescent {params.coalescent} \
-            --date-inference {params.date_inference} \
-            --date-confidence \
-            --divergence-units {params.divergence_units} \
-            --clock-filter-iqd {params.clock_filter_iqd}
-        """
-
-
 rule ancestral:
     """
     Reconstructing ancestral sequences and mutations

From eb393bd6595f452dbde1e3edc1b757a58c87bfc2 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 12:43:29 -0800
Subject: [PATCH 11/16] phylo: Move rules to annotate_phylogeny.smk

Part of work to update this repo to match the pathogen-repo-guide.
---
 phylogenetic/Snakefile                    |   1 +
 phylogenetic/rules/annotate_phylogeny.smk | 158 ++++++++++++++++++++++
 phylogenetic/rules/core.smk               | 138 -------------------
 3 files changed, 159 insertions(+), 138 deletions(-)
 create mode 100644 phylogenetic/rules/annotate_phylogeny.smk

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index d6add2bd..9324171e 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -41,6 +41,7 @@ rule all:
 
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
+include: "rules/annotate_phylogeny.smk"
 include: "rules/core.smk"
 
 
diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
new file mode 100644
index 00000000..564f8c28
--- /dev/null
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -0,0 +1,158 @@
+"""
+This part of the workflow creates additional annotations for the phylogenetic tree.
+
+REQUIRED INPUTS:
+
+    sequences  = {build_dir}/{build_name}/masked.fasta
+    metadata   = {build_dir}/{build_name}/metadata.tsv
+    tree       = {build_dir}/{build_name}/tree.nwk
+    clades     = path to clades definition TSV
+
+OUTPUTS:
+
+    nt_muts             = {build_dir}/{build_name}/nt_muts.json
+    aa_muts             = {build_dir}/{build_name}/aa_muts.json
+    traits              = {build_dir}/{build_name}/traits.json
+    clades              = {build_dir}/{build_name}/clades.json
+    mutation_context    = {build_dir}/{build_name}/mutation_context.json
+    recency             = {build_dir}/{build_name}/recency.json
+
+"""
+
+
+rule ancestral:
+    """
+    Reconstructing ancestral sequences and mutations
+    """
+    input:
+        tree=rules.refine.output.tree,
+        alignment=build_dir + "/{build_name}/masked.fasta",
+    output:
+        node_data=build_dir + "/{build_name}/nt_muts.json",
+    params:
+        inference="joint",
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --output-node-data {output.node_data} \
+            --inference {params.inference}
+        """
+
+
+rule translate:
+    """
+    Translating amino acid sequences
+    """
+    input:
+        tree=rules.refine.output.tree,
+        node_data=rules.ancestral.output.node_data,
+        genemap=config["genemap"],
+    output:
+        node_data=build_dir + "/{build_name}/aa_muts.json",
+    shell:
+        """
+        augur translate \
+            --tree {input.tree} \
+            --ancestral-sequences {input.node_data} \
+            --reference-sequence {input.genemap} \
+            --output {output.node_data}
+        """
+
+
+rule traits:
+    """
+    Inferring ancestral traits for {params.columns!s}
+      - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias
+    """
+    input:
+        tree=rules.refine.output.tree,
+        metadata=build_dir + "/{build_name}/metadata.tsv",
+    output:
+        node_data=build_dir + "/{build_name}/traits.json",
+    params:
+        columns="country",
+        sampling_bias_correction=3,
+        strain_id=config["strain_id_field"],
+    shell:
+        """
+        augur traits \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output {output.node_data} \
+            --columns {params.columns} \
+            --confidence \
+            --sampling-bias-correction {params.sampling_bias_correction}
+        """
+
+
+rule clades:
+    """
+    Adding internal clade labels
+    """
+    input:
+        tree=rules.refine.output.tree,
+        aa_muts=rules.translate.output.node_data,
+        nuc_muts=rules.ancestral.output.node_data,
+        clades=config["clades"],
+    output:
+        node_data=build_dir + "/{build_name}/clades_raw.json",
+    log:
+        "logs/clades_{build_name}.txt",
+    shell:
+        """
+        augur clades \
+            --tree {input.tree} \
+            --mutations {input.nuc_muts} {input.aa_muts} \
+            --clades {input.clades} \
+            --output-node-data {output.node_data} 2>&1 | tee {log}
+        """
+
+
+rule rename_clades:
+    input:
+        rules.clades.output.node_data,
+    output:
+        node_data=build_dir + "/{build_name}/clades.json",
+    shell:
+        """
+        python scripts/clades_renaming.py \
+        --input-node-data {input} \
+        --output-node-data {output.node_data}
+        """
+
+
+rule mutation_context:
+    input:
+        tree=rules.refine.output.tree,
+        node_data=build_dir + "/{build_name}/nt_muts.json",
+    output:
+        node_data=build_dir + "/{build_name}/mutation_context.json",
+    shell:
+        """
+        python3 scripts/mutation_context.py \
+            --tree {input.tree} \
+            --mutations {input.node_data} \
+            --output {output.node_data}
+        """
+
+
+rule recency:
+    """
+    Use metadata on submission date to construct submission recency field
+    """
+    input:
+        metadata=build_dir + "/{build_name}/metadata.tsv",
+    output:
+        node_data=build_dir + "/{build_name}/recency.json",
+    params:
+        strain_id=config["strain_id_field"],
+    shell:
+        """
+        python3 scripts/construct-recency-from-submission-date.py \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output {output} 2>&1
+        """
diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk
index feed9fc0..3ec78c53 100644
--- a/phylogenetic/rules/core.smk
+++ b/phylogenetic/rules/core.smk
@@ -13,125 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
 """
 
 
-rule ancestral:
-    """
-    Reconstructing ancestral sequences and mutations
-    """
-    input:
-        tree=rules.refine.output.tree,
-        alignment=build_dir + "/{build_name}/masked.fasta",
-    output:
-        node_data=build_dir + "/{build_name}/nt_muts.json",
-    params:
-        inference="joint",
-    shell:
-        """
-        augur ancestral \
-            --tree {input.tree} \
-            --alignment {input.alignment} \
-            --output-node-data {output.node_data} \
-            --inference {params.inference}
-        """
-
-
-rule translate:
-    """
-    Translating amino acid sequences
-    """
-    input:
-        tree=rules.refine.output.tree,
-        node_data=rules.ancestral.output.node_data,
-        genemap=config["genemap"],
-    output:
-        node_data=build_dir + "/{build_name}/aa_muts.json",
-    shell:
-        """
-        augur translate \
-            --tree {input.tree} \
-            --ancestral-sequences {input.node_data} \
-            --reference-sequence {input.genemap} \
-            --output {output.node_data}
-        """
-
-
-rule traits:
-    """
-    Inferring ancestral traits for {params.columns!s}
-      - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias
-    """
-    input:
-        tree=rules.refine.output.tree,
-        metadata=build_dir + "/{build_name}/metadata.tsv",
-    output:
-        node_data=build_dir + "/{build_name}/traits.json",
-    params:
-        columns="country",
-        sampling_bias_correction=3,
-        strain_id=config["strain_id_field"],
-    shell:
-        """
-        augur traits \
-            --tree {input.tree} \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --output {output.node_data} \
-            --columns {params.columns} \
-            --confidence \
-            --sampling-bias-correction {params.sampling_bias_correction}
-        """
-
-
-rule clades:
-    """
-    Adding internal clade labels
-    """
-    input:
-        tree=rules.refine.output.tree,
-        aa_muts=rules.translate.output.node_data,
-        nuc_muts=rules.ancestral.output.node_data,
-        clades=config["clades"],
-    output:
-        node_data=build_dir + "/{build_name}/clades_raw.json",
-    log:
-        "logs/clades_{build_name}.txt",
-    shell:
-        """
-        augur clades \
-            --tree {input.tree} \
-            --mutations {input.nuc_muts} {input.aa_muts} \
-            --clades {input.clades} \
-            --output-node-data {output.node_data} 2>&1 | tee {log}
-        """
-
-
-rule rename_clades:
-    input:
-        rules.clades.output.node_data,
-    output:
-        node_data=build_dir + "/{build_name}/clades.json",
-    shell:
-        """
-        python scripts/clades_renaming.py \
-        --input-node-data {input} \
-        --output-node-data {output.node_data}
-        """
-
-
-rule mutation_context:
-    input:
-        tree=rules.refine.output.tree,
-        node_data=build_dir + "/{build_name}/nt_muts.json",
-    output:
-        node_data=build_dir + "/{build_name}/mutation_context.json",
-    shell:
-        """
-        python3 scripts/mutation_context.py \
-            --tree {input.tree} \
-            --mutations {input.node_data} \
-            --output {output.node_data}
-        """
-
-
 rule remove_time:
     input:
         "results/{build_name}/branch_lengths.json",
@@ -143,25 +24,6 @@ rule remove_time:
         """
 
 
-rule recency:
-    """
-    Use metadata on submission date to construct submission recency field
-    """
-    input:
-        metadata=build_dir + "/{build_name}/metadata.tsv",
-    output:
-        node_data=build_dir + "/{build_name}/recency.json",
-    params:
-        strain_id=config["strain_id_field"],
-    shell:
-        """
-        python3 scripts/construct-recency-from-submission-date.py \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id} \
-            --output {output} 2>&1
-        """
-
-
 rule colors:
     input:
         ordering="defaults/color_ordering.tsv",

From 9afc4799951070bc6b9f06b86eed8c21fe905770 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 13:21:31 -0800
Subject: [PATCH 12/16] phylo: Rename core.smk to export.smk

Part of work to update this repo to match the pathogen-repo-guide.

The remaining rules in core.smk are all related to the final export
of the Auspice JSONs.
---
 phylogenetic/Snakefile                      |  2 +-
 phylogenetic/rules/{core.smk => export.smk} | 31 ++++++++++++++++-----
 2 files changed, 25 insertions(+), 8 deletions(-)
 rename phylogenetic/rules/{core.smk => export.smk} (75%)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index 9324171e..d5c3e2d7 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -42,7 +42,7 @@ rule all:
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"
-include: "rules/core.smk"
+include: "rules/export.smk"
 
 
 # Include custom rules defined in the config.
diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/export.smk
similarity index 75%
rename from phylogenetic/rules/core.smk
rename to phylogenetic/rules/export.smk
index 3ec78c53..f4e7ee69 100644
--- a/phylogenetic/rules/core.smk
+++ b/phylogenetic/rules/export.smk
@@ -1,15 +1,32 @@
 """
-This part of the workflow expects input files
+This part of the workflow collects the phylogenetic tree and annotations to
+export a Nextstrain dataset.
 
-        sequences = "data/sequences.fasta",
-        metadata =  "data/metadata.tsv",
+REQUIRED INPUTS:
 
-and will produce output files as
+    metadata            = {build_dir}/{build_name}/metadata.tsv
+    tree                = {build_dir}/{build_name}/tree.nwk
+    branch_lengths      = {build_dir}/{build_name}/branch_lengths.json
+    nt_muts             = {build_dir}/{build_name}/nt_muts.json
+    aa_muts             = {build_dir}/{build_name}/aa_muts.json
+    traits              = {build_dir}/{build_name}/traits.json
+    clades              = {build_dir}/{build_name}/clades.json
+    mutation_context    = {build_dir}/{build_name}/mutation_context.json
+    color_ordering      = defaults/color_ordering.tsv
+    color_schemes       = defaults/color_schemes.tsv
+    lat_longs           = path to lat/long TSV
+    description         = path to description Markdown
+    auspice_config      = path to Auspice config JSON
 
-        auspice_json = auspice_dir + "/mpox_{build_name}.json"
+OPTIONAL INPUTS:
+
+    recency             = {build_dir}/{build_name}/recency.json
+
+OUTPUTS:
+
+    auspice_json        = {build_dir}/{build_name}/tree.json
+    root_sequence       = {build_dir}/{build_name}/tree_root-sequence.json
 
-Parameter are expected to sit in the `config` data structure.
-In addition, `build_dir` and `auspice_dir` need to be defined upstream.
 """
 
 

From 08d158899b74638522894dac44deec9b3efc4b2a Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 13:42:19 -0800
Subject: [PATCH 13/16] Fix README link

---
 phylogenetic/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 1a17a42f..52032aa5 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -11,7 +11,7 @@ for Nextstrain's suite of software tools.
 ## Usage
 
 If you're unfamiliar with Nextstrain builds, you may want to follow our
-[Running a Pathogen Workflow guide][] first and then come back here.
+[Running a Pathogen Workflow guide](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html) first and then come back here.
 
 The easiest way to run this pathogen build is using the Nextstrain
 command-line tool from within the `phylogenetic/` directory:

From 40715f997d1cdd070a15b6da4a18de6b58acca38 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 13:53:03 -0800
Subject: [PATCH 14/16] phylo: Remove scripts/deploy.py and related docs

Automated builds have been deployed directly to production since
https://github.com/nextstrain/mpox/commit/4017f513d7b74de039153ffa3551d131458b2785.
---
 phylogenetic/README.md         | 22 --------
 phylogenetic/scripts/deploy.py | 95 ----------------------------------
 2 files changed, 117 deletions(-)
 delete mode 100644 phylogenetic/scripts/deploy.py

diff --git a/phylogenetic/README.md b/phylogenetic/README.md
index 52032aa5..6c29f4cc 100644
--- a/phylogenetic/README.md
+++ b/phylogenetic/README.md
@@ -76,28 +76,6 @@ Run pipeline to produce the "lineage B.1" tree for `/mpox/lineage-B.1` with:
 nextstrain build . --configfile defaults/hmpxv1_big/config.yaml
 ```
 
-### Deploy
-
-⚠️ The below is outdated and needs to be adjusted for the new build names (mpox instead of monkeypox, etc.)
-
-<details>
-
-Run the python script [`scripts/deploy.py`](scripts/deploy.py) to deploy the staging build to production.
-
-This will also automatically create a dated build where each node has a unique (random) ID so it can be targeted in shared links/narratives.
-
-```bash
-python scripts/deploy.py --build-names hmpxv1 mpxv
-```
-
-If a dated build already exists it is not overwritten by default. To overwrite, pass `-f`.
-
-To deploy a locally built build to staging, use the `--staging` flag.
-
-To not deploy a dated build to production, add the `--no-dated` flag.
-
-</details>
-
 ### Visualize results
 
 View results with:
diff --git a/phylogenetic/scripts/deploy.py b/phylogenetic/scripts/deploy.py
deleted file mode 100644
index 17cd9d36..00000000
--- a/phylogenetic/scripts/deploy.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""
-- Generate dated builds where each node has random id
-- Deploy builds from staging to production
-
-Builds are downloaded from staging
-This script is to be run independently from snakemake
-"""
-
-import argparse
-import datetime
-import gzip
-import json
-import os
-import uuid
-
-
-def add_branch_id_recursive(node):
-    """
-    Recursively add randomly generated id to each node in auspice json tree
-    """
-    node["branch_attrs"]["labels"] = {}
-    node["branch_attrs"]["labels"]["id"] = str(uuid.uuid4())[:8]
-    if "children" in node:
-        for child in node["children"]:
-            add_branch_id_recursive(child)
-
-
-
-if __name__=="__main__":
-    parser = argparse.ArgumentParser(
-        description="Deploy builds from staging to production, generate dated builds where each node has random id",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-
-    parser.add_argument('--build-names', nargs='+', type=str, required=True, help="build names to upload")
-    parser.add_argument('-f','--force', action='store_true', help="force overwrite of existing dated builds")
-    parser.add_argument('--no-dated' , action='store_true', help="do not deploy dated build")
-    parser.add_argument('--staging' , action='store_true', help="deploy to staging")
-    parser.set_defaults(feature=True)
-    args = parser.parse_args()
-
-    print(f"> Deploying builds {args.build_names} to {'staging' if args.staging else 'production'}")
-    print("----------------------------------------")
-
-    if not os.path.isdir('staging'):
-        os.mkdir('staging')
-    for build_name in args.build_names:
-        if not args.staging:
-            print(f">> Deploying build {build_name} to production")
-
-            # Upload basic builds to staging
-            for auspice_file in ['', '_root-sequence']:
-                os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json s3://nextstrain-data/mpox_{build_name}{auspice_file}.json")
-            print(f">> Uploaded {build_name} to production: https://nextstrain.org/staging/mpox/{build_name.replace('_', '/')}/")
-
-
-            if not args.no_dated:
-                # Check how many today dated builds exist
-                today = datetime.date.today().strftime("%Y-%m-%d")
-                os.system(f"aws s3 ls nextstrain-data/mpox_{build_name}_{today}.json > dated_builds.txt")
-
-                with open('dated_builds.txt') as fh:
-                    today_dated_builds_count = len(fh.readlines())
-                os.remove('dated_builds.txt')
-                if today_dated_builds_count == 0 or args.force:
-                    if today_dated_builds_count > 0:
-                        print(f">> Overwriting existing dated build due to --force flag being present")
-
-                    for auspice_file in ['', '_root-sequence']:
-                        os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json staging/")
-
-                    # Load auspice json
-                    with gzip.open(f"staging/mpox_{build_name}.json") as fh:
-                        auspice_json = json.load(fh)
-
-                    add_branch_id_recursive(auspice_json['tree'])
-
-                    with open(f"staging/mpox_{build_name}_{today}.json", 'wt') as fh:
-                        json.dump(auspice_json, fh)
-
-                    os.system(f"aws s3 cp staging/mpox_{build_name}_{today}.json s3://nextstrain-data")
-                    os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}_root-sequence.json s3://nextstrain-data/mpox_{build_name}_{today}_root-sequence.json")
-                    print(f">> Uploaded dated {build_name} to production: https://nextstrain.org/mpox/{build_name.replace('_', '/')}/{today}/")
-
-                else:
-                    print(f">> Warning: Dated {build_name} with date today already exists, skipping upload: https://nextstrain.org/mpox/{build_name.replace('_', '/')}/{today}/")
-                    print(f">> Hint: Use the --f/--force flag to overwrite existing dated builds")
-        if args.staging:
-            print(f">> Deploying build {build_name} to staging")
-            for auspice_file in ['', '_root-sequence']:
-                os.system(f"aws s3 cp auspice/mpox_{build_name}{auspice_file}.json s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json")
-            print(f">> Uploaded {build_name} to staging: https://nextstrain.org/staging/mpox/{build_name.replace('_', '/')}/")
-
-        print("----------------------------------------")

From b2942626119828c8935f4dd14607dbf52fd1d431 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 16:23:38 -0800
Subject: [PATCH 15/16] phylo: Make `auspice_name` config optional
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If `auspice_name` param is not provided, then the workflow will use
the required `build_name` as the default Auspice filename.

Ensures that we are following the Nextstrain Snakemake style guide for
configs¹ by not accessing configs with `config.get(key)`.

¹ https://docs.nextstrain.org/en/latest/reference/snakemake-style-guide.html#access-config-values-appropriately
---
 phylogenetic/Snakefile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index d5c3e2d7..19cce102 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -16,14 +16,12 @@ if not config:
 
 
 build_dir = "results"
-
-
 auspice_dir = "auspice"
 
 prefix = config.get("auspice_prefix", None)
 AUSPICE_PREFIX = ("trial_" + prefix + "_") if prefix is not None else ""
-AUSPICE_FILENAME = AUSPICE_PREFIX + config.get("auspice_name")
-
+# Defaults to the `build_name` if no `auspice_name` is provided in the config
+AUSPICE_FILENAME = AUSPICE_PREFIX + config.get("auspice_name", config["build_name"])
 
 rule all:
     input:

From ce5c2d53a8b5b1866fd2c17aae008e599fd48c87 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 13 Feb 2024 17:17:40 -0800
Subject: [PATCH 16/16] phylo: update rule inputs to literal paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow the Nextstrain Snakemake style guide¹ to use literal path strings
as inputs for the core workflow rules.

This will make it possible for users to override the default workflow
with `custom_rules`.

¹ https://docs.nextstrain.org/en/latest/reference/snakemake-style-guide.html#define-input-paths-with-literal-path-strings
---
 phylogenetic/rules/annotate_phylogeny.smk  | 18 +++++++++---------
 phylogenetic/rules/construct_phylogeny.smk |  6 +++---
 phylogenetic/rules/export.smk              | 14 +++++++-------
 phylogenetic/rules/prepare_sequences.smk   |  8 ++++----
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk
index 564f8c28..9f45b296 100644
--- a/phylogenetic/rules/annotate_phylogeny.smk
+++ b/phylogenetic/rules/annotate_phylogeny.smk
@@ -25,7 +25,7 @@ rule ancestral:
     Reconstructing ancestral sequences and mutations
     """
     input:
-        tree=rules.refine.output.tree,
+        tree=build_dir + "/{build_name}/tree.nwk",
         alignment=build_dir + "/{build_name}/masked.fasta",
     output:
         node_data=build_dir + "/{build_name}/nt_muts.json",
@@ -46,8 +46,8 @@ rule translate:
     Translating amino acid sequences
     """
     input:
-        tree=rules.refine.output.tree,
-        node_data=rules.ancestral.output.node_data,
+        tree=build_dir + "/{build_name}/tree.nwk",
+        node_data=build_dir + "/{build_name}/nt_muts.json",
         genemap=config["genemap"],
     output:
         node_data=build_dir + "/{build_name}/aa_muts.json",
@@ -67,7 +67,7 @@ rule traits:
       - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias
     """
     input:
-        tree=rules.refine.output.tree,
+        tree=build_dir + "/{build_name}/tree.nwk",
         metadata=build_dir + "/{build_name}/metadata.tsv",
     output:
         node_data=build_dir + "/{build_name}/traits.json",
@@ -93,9 +93,9 @@ rule clades:
     Adding internal clade labels
     """
     input:
-        tree=rules.refine.output.tree,
-        aa_muts=rules.translate.output.node_data,
-        nuc_muts=rules.ancestral.output.node_data,
+        tree=build_dir + "/{build_name}/tree.nwk",
+        aa_muts=build_dir + "/{build_name}/aa_muts.json",
+        nuc_muts=build_dir + "/{build_name}/nt_muts.json",
         clades=config["clades"],
     output:
         node_data=build_dir + "/{build_name}/clades_raw.json",
@@ -113,7 +113,7 @@ rule clades:
 
 rule rename_clades:
     input:
-        rules.clades.output.node_data,
+        build_dir + "/{build_name}/clades_raw.json",
     output:
         node_data=build_dir + "/{build_name}/clades.json",
     shell:
@@ -126,7 +126,7 @@ rule rename_clades:
 
 rule mutation_context:
     input:
-        tree=rules.refine.output.tree,
+        tree=build_dir + "/{build_name}/tree.nwk",
         node_data=build_dir + "/{build_name}/nt_muts.json",
     output:
         node_data=build_dir + "/{build_name}/mutation_context.json",
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index 215c6a9a..fe0a7dea 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -41,7 +41,7 @@ rule fix_tree:
     Fixing tree
     """
     input:
-        tree=rules.tree.output.tree,
+        tree=build_dir + "/{build_name}/tree_raw.nwk",
         alignment=build_dir + "/{build_name}/masked.fasta",
     output:
         tree=build_dir + "/{build_name}/tree_fixed.nwk",
@@ -66,9 +66,9 @@ rule refine:
         - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
     """
     input:
-        tree=rules.fix_tree.output.tree
+        tree=build_dir + "/{build_name}/tree_fixed.nwk"
         if config["fix_tree"]
-        else rules.tree.output.tree,
+        else build_dir + "/{build_name}/tree_raw.nwk",
         alignment=build_dir + "/{build_name}/masked.fasta",
         metadata=build_dir + "/{build_name}/metadata.tsv",
     output:
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index f4e7ee69..89499ed1 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -63,18 +63,18 @@ rule export:
     Exporting data files for auspice
     """
     input:
-        tree=rules.refine.output.tree,
+        tree=build_dir + "/{build_name}/tree.nwk",
         metadata=build_dir + "/{build_name}/metadata.tsv",
         branch_lengths="results/{build_name}/branch_lengths.json"
         if config.get("timetree", False)
         else "results/{build_name}/branch_lengths_no_time.json",
-        traits=rules.traits.output.node_data,
-        nt_muts=rules.ancestral.output.node_data,
-        aa_muts=rules.translate.output.node_data,
+        traits=build_dir + "/{build_name}/traits.json",
+        nt_muts=build_dir + "/{build_name}/nt_muts.json",
+        aa_muts=build_dir + "/{build_name}/aa_muts.json",
         clades=build_dir + "/{build_name}/clades.json",
-        mutation_context=rules.mutation_context.output.node_data,
-        recency=rules.recency.output.node_data if config.get("recency", False) else [],
-        colors=rules.colors.output.colors,
+        mutation_context=build_dir + "/{build_name}/mutation_context.json",
+        recency=build_dir + "/{build_name}/recency.json" if config.get("recency", False) else [],
+        colors=build_dir + "/{build_name}/colors.tsv",
         lat_longs=config["lat_longs"],
         description=config["description"],
         auspice_config=config["auspice_config"],
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index b87820c3..da4b501e 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -83,7 +83,7 @@ rule filter:
 
 rule subsample:
     input:
-        metadata=rules.filter.output.metadata,
+        metadata=build_dir + "/{build_name}/good_metadata.tsv",
     output:
         strains=build_dir + "/{build_name}/{sample}_strains.txt",
         log=build_dir + "/{build_name}/{sample}_filter.log",
@@ -117,8 +117,8 @@ rule combine_samples:
             f"{build_dir}/{w.build_name}/{sample}_strains.txt"
             for sample in config["subsample"]
         ],
-        sequences=rules.filter.output.sequences,
-        metadata=rules.filter.output.metadata,
+        sequences=build_dir + "/{build_name}/good_sequences.fasta",
+        metadata=build_dir + "/{build_name}/good_metadata.tsv",
         include=config["include"],
     output:
         sequences=build_dir + "/{build_name}/filtered.fasta",
@@ -159,7 +159,7 @@ rule align:
       - filling gaps with N
     """
     input:
-        sequences=rules.reverse_reverse_complements.output,
+        sequences=build_dir + "/{build_name}/reversed.fasta",
         reference=config["reference"],
         genemap=config["genemap"],
     output: