From 7dbb4baeb591c55ab874d7af9ca5d4c3990506f8 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 29 Nov 2023 15:31:45 -0800 Subject: [PATCH 01/16] phylo: Rename workflow/snakemake_rules/ to rules/ Part of work to update this repo to match the pathogen-repo-guide. Simplify directory structure by putting the core Snakemake rules in a "rules" directory. --- phylogenetic/README.md | 2 +- phylogenetic/Snakefile | 10 +++++----- .../{workflow/snakemake_rules => rules}/chores.smk | 0 .../{workflow/snakemake_rules => rules}/core.smk | 0 .../snakemake_rules => rules}/download_via_lapis.smk | 0 .../nextstrain_automation.smk | 0 .../{workflow/snakemake_rules => rules}/prepare.smk | 0 7 files changed, 6 insertions(+), 6 deletions(-) rename phylogenetic/{workflow/snakemake_rules => rules}/chores.smk (100%) rename phylogenetic/{workflow/snakemake_rules => rules}/core.smk (100%) rename phylogenetic/{workflow/snakemake_rules => rules}/download_via_lapis.smk (100%) rename phylogenetic/{workflow/snakemake_rules => rules}/nextstrain_automation.smk (100%) rename phylogenetic/{workflow/snakemake_rules => rules}/prepare.smk (100%) diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 99c5d7c1..47a659eb 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -109,7 +109,7 @@ nextstrain view . ## Configuration Configuration takes place in `config/*/config.yaml` files for each build. -The analysis pipeline is contained in `workflow/snakemake_rule/core.smk`. +The analysis pipeline is contained in `rules/core.smk`. This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`. There is little redirection and each rule should be able to be reasoned with on its own. 
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 285012c9..03a06e67 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -41,20 +41,20 @@ rule all: if config.get("data_source", None) == "lapis": - include: "workflow/snakemake_rules/download_via_lapis.smk" + include: "rules/download_via_lapis.smk" else: - include: "workflow/snakemake_rules/prepare.smk" + include: "rules/prepare.smk" -include: "workflow/snakemake_rules/chores.smk" -include: "workflow/snakemake_rules/core.smk" +include: "rules/chores.smk" +include: "rules/core.smk" if config.get("deploy_url", False): - include: "workflow/snakemake_rules/nextstrain_automation.smk" + include: "rules/nextstrain_automation.smk" # Include custom rules defined in the config. diff --git a/phylogenetic/workflow/snakemake_rules/chores.smk b/phylogenetic/rules/chores.smk similarity index 100% rename from phylogenetic/workflow/snakemake_rules/chores.smk rename to phylogenetic/rules/chores.smk diff --git a/phylogenetic/workflow/snakemake_rules/core.smk b/phylogenetic/rules/core.smk similarity index 100% rename from phylogenetic/workflow/snakemake_rules/core.smk rename to phylogenetic/rules/core.smk diff --git a/phylogenetic/workflow/snakemake_rules/download_via_lapis.smk b/phylogenetic/rules/download_via_lapis.smk similarity index 100% rename from phylogenetic/workflow/snakemake_rules/download_via_lapis.smk rename to phylogenetic/rules/download_via_lapis.smk diff --git a/phylogenetic/workflow/snakemake_rules/nextstrain_automation.smk b/phylogenetic/rules/nextstrain_automation.smk similarity index 100% rename from phylogenetic/workflow/snakemake_rules/nextstrain_automation.smk rename to phylogenetic/rules/nextstrain_automation.smk diff --git a/phylogenetic/workflow/snakemake_rules/prepare.smk b/phylogenetic/rules/prepare.smk similarity index 100% rename from phylogenetic/workflow/snakemake_rules/prepare.smk rename to phylogenetic/rules/prepare.smk From 5bb98f1a617b6e5f51d21aa29bedd09a6b9a0b4c Mon 
Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 12 Feb 2024 15:45:12 -0800 Subject: [PATCH 02/16] phylo: Rename config/ to defaults/ Part of work to update this repo to match the pathogen-repo-guide. Renamed in preparation for adding the "build-configs" directory in following commits to hold additional build configurations and customizations. --- .github/workflows/rebuild-hmpxv1-big.yaml | 2 +- .github/workflows/rebuild-hmpxv1.yaml | 2 +- .github/workflows/rebuild-mpxv.yaml | 2 +- phylogenetic/README.md | 8 +++---- phylogenetic/Snakefile | 2 +- phylogenetic/{config => defaults}/clades.tsv | 0 .../{config => defaults}/color_ordering.tsv | 0 .../{config => defaults}/color_schemes.tsv | 0 .../{config => defaults}/description.md | 2 +- .../exclude_accessions.txt | 0 phylogenetic/{config => defaults}/genemap.gff | 0 .../hmpxv1/auspice_config.json | 0 .../{config => defaults}/hmpxv1/config.yaml | 22 +++++++++---------- .../{config => defaults}/hmpxv1/include.txt | 0 .../hmpxv1_big/auspice_config.json | 0 .../hmpxv1_big/config.yaml | 22 +++++++++---------- .../hmpxv1_big/include.txt | 0 .../{config => defaults}/lat_longs.tsv | 0 phylogenetic/{config => defaults}/mask.bed | 0 .../{config => defaults}/mask_overview.bed | 0 .../mpxv/auspice_config.json | 0 .../{config => defaults}/mpxv/config.yaml | 22 +++++++++---------- .../{config => defaults}/mpxv/include.txt | 0 .../nextstrain_automation.yaml | 0 .../{config => defaults}/reference.fasta | 0 .../{config => defaults}/reference.gb | 0 .../{config => defaults}/tree_mask.tsv | 0 phylogenetic/profiles/ci/builds.yaml | 22 +++++++++---------- phylogenetic/rules/core.smk | 4 ++-- 29 files changed, 55 insertions(+), 55 deletions(-) rename phylogenetic/{config => defaults}/clades.tsv (100%) rename phylogenetic/{config => defaults}/color_ordering.tsv (100%) rename phylogenetic/{config => defaults}/color_schemes.tsv (100%) rename phylogenetic/{config => defaults}/description.md (96%) rename phylogenetic/{config => 
defaults}/exclude_accessions.txt (100%) rename phylogenetic/{config => defaults}/genemap.gff (100%) rename phylogenetic/{config => defaults}/hmpxv1/auspice_config.json (100%) rename phylogenetic/{config => defaults}/hmpxv1/config.yaml (76%) rename phylogenetic/{config => defaults}/hmpxv1/include.txt (100%) rename phylogenetic/{config => defaults}/hmpxv1_big/auspice_config.json (100%) rename phylogenetic/{config => defaults}/hmpxv1_big/config.yaml (67%) rename phylogenetic/{config => defaults}/hmpxv1_big/include.txt (100%) rename phylogenetic/{config => defaults}/lat_longs.tsv (100%) rename phylogenetic/{config => defaults}/mask.bed (100%) rename phylogenetic/{config => defaults}/mask_overview.bed (100%) rename phylogenetic/{config => defaults}/mpxv/auspice_config.json (100%) rename phylogenetic/{config => defaults}/mpxv/config.yaml (74%) rename phylogenetic/{config => defaults}/mpxv/include.txt (100%) rename phylogenetic/{config => defaults}/nextstrain_automation.yaml (100%) rename phylogenetic/{config => defaults}/reference.fasta (100%) rename phylogenetic/{config => defaults}/reference.gb (100%) rename phylogenetic/{config => defaults}/tree_mask.tsv (100%) diff --git a/.github/workflows/rebuild-hmpxv1-big.yaml b/.github/workflows/rebuild-hmpxv1-big.yaml index 717532ab..4b115254 100644 --- a/.github/workflows/rebuild-hmpxv1-big.yaml +++ b/.github/workflows/rebuild-hmpxv1-big.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . 
\ notify_on_deploy \ - --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/.github/workflows/rebuild-hmpxv1.yaml b/.github/workflows/rebuild-hmpxv1.yaml index 0802f522..1ed7a235 100644 --- a/.github/workflows/rebuild-hmpxv1.yaml +++ b/.github/workflows/rebuild-hmpxv1.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/.github/workflows/rebuild-mpxv.yaml b/.github/workflows/rebuild-mpxv.yaml index f07f77b7..fe3d614f 100644 --- a/.github/workflows/rebuild-mpxv.yaml +++ b/.github/workflows/rebuild-mpxv.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 47a659eb..e1cb1cab 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -61,19 +61,19 @@ nextstrain build . data/sequences.fasta data/metadata.tsv Run pipeline to produce the "overview" tree for `/mpox/all-clades` with: ```bash -nextstrain build . --configfile config/mpxv/config.yaml +nextstrain build . --configfile defaults/mpxv/config.yaml ``` Run pipeline to produce the "clade IIb" tree for `/mpox/clade-IIb` with: ```bash -nextstrain build . 
--configfile config/hmpxv1/config.yaml +nextstrain build . --configfile defaults/hmpxv1/config.yaml ``` Run pipeline to produce the "lineage B.1" tree for `/mpox/lineage-B.1` with: ```bash -nextstrain build . --configfile config/hmpxv1_big/config.yaml +nextstrain build . --configfile defaults/hmpxv1_big/config.yaml ``` ### Deploy @@ -108,7 +108,7 @@ nextstrain view . ## Configuration -Configuration takes place in `config/*/config.yaml` files for each build. +The default configuration takes place in `defaults/*/config.yaml` files for each build. The analysis pipeline is contained in `rules/core.smk`. This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`. There is little redirection and each rule should be able to be reasoned with on its own. diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 03a06e67..85296416 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -12,7 +12,7 @@ if version.parse(augur_version) < version.parse(min_augur_version): if not config: - configfile: "config/hmpxv1/config.yaml" + configfile: "defaults/hmpxv1/config.yaml" build_dir = "results" diff --git a/phylogenetic/config/clades.tsv b/phylogenetic/defaults/clades.tsv similarity index 100% rename from phylogenetic/config/clades.tsv rename to phylogenetic/defaults/clades.tsv diff --git a/phylogenetic/config/color_ordering.tsv b/phylogenetic/defaults/color_ordering.tsv similarity index 100% rename from phylogenetic/config/color_ordering.tsv rename to phylogenetic/defaults/color_ordering.tsv diff --git a/phylogenetic/config/color_schemes.tsv b/phylogenetic/defaults/color_schemes.tsv similarity index 100% rename from phylogenetic/config/color_schemes.tsv rename to phylogenetic/defaults/color_schemes.tsv diff --git a/phylogenetic/config/description.md b/phylogenetic/defaults/description.md similarity index 96% rename from phylogenetic/config/description.md rename to phylogenetic/defaults/description.md 
index a33eb1d3..624be722 100644 --- a/phylogenetic/config/description.md +++ b/phylogenetic/defaults/description.md @@ -14,7 +14,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpo - masking several regions of the genome, including the first 1350 and last 6422 base pairs and multiple repetitive regions of variable length - phylogenetic reconstruction using [IQTREE-2](http://www.iqtree.org/) - ancestral state reconstruction and temporal inference using [TreeTime](https://github.com/neherlab/treetime) -- clade assignment via [clade definitions defined here](https://github.com/nextstrain/mpox/blob/master/config/clades.tsv), to label broader MPXV clades I, IIa and IIb and to label hMPXV1 lineages A, A.1, A.1.1, etc... +- clade assignment via [clade definitions defined here](https://github.com/nextstrain/mpox/blob/master/defaults/clades.tsv), to label broader MPXV clades I, IIa and IIb and to label hMPXV1 lineages A, A.1, A.1.1, etc... #### Underlying data We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/), diff --git a/phylogenetic/config/exclude_accessions.txt b/phylogenetic/defaults/exclude_accessions.txt similarity index 100% rename from phylogenetic/config/exclude_accessions.txt rename to phylogenetic/defaults/exclude_accessions.txt diff --git a/phylogenetic/config/genemap.gff b/phylogenetic/defaults/genemap.gff similarity index 100% rename from phylogenetic/config/genemap.gff rename to phylogenetic/defaults/genemap.gff diff --git a/phylogenetic/config/hmpxv1/auspice_config.json b/phylogenetic/defaults/hmpxv1/auspice_config.json similarity index 100% rename from phylogenetic/config/hmpxv1/auspice_config.json rename to phylogenetic/defaults/hmpxv1/auspice_config.json diff --git a/phylogenetic/config/hmpxv1/config.yaml b/phylogenetic/defaults/hmpxv1/config.yaml similarity index 76% rename from phylogenetic/config/hmpxv1/config.yaml 
rename to phylogenetic/defaults/hmpxv1/config.yaml index a9b6739a..6e416e95 100644 --- a/phylogenetic/config/hmpxv1/config.yaml +++ b/phylogenetic/defaults/hmpxv1/config.yaml @@ -1,12 +1,12 @@ -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -include: "config/hmpxv1/include.txt" -clades: "config/clades.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/hmpxv1/auspice_config.json" -description: "config/description.md" -tree_mask: "config/tree_mask.tsv" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +include: "defaults/hmpxv1/include.txt" +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1/auspice_config.json" +description: "defaults/description.md" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/mpox/issues/33 @@ -17,7 +17,7 @@ build_name: "hmpxv1" auspice_name: "mpox_clade-IIb" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 2017 min_length: 100000 @@ -78,4 +78,4 @@ recency: true mask: from_beginning: 800 from_end: 6422 - maskfile: "config/mask.bed" + maskfile: "defaults/mask.bed" diff --git a/phylogenetic/config/hmpxv1/include.txt b/phylogenetic/defaults/hmpxv1/include.txt similarity index 100% rename from phylogenetic/config/hmpxv1/include.txt rename to phylogenetic/defaults/hmpxv1/include.txt diff --git a/phylogenetic/config/hmpxv1_big/auspice_config.json b/phylogenetic/defaults/hmpxv1_big/auspice_config.json similarity index 100% rename from phylogenetic/config/hmpxv1_big/auspice_config.json rename to phylogenetic/defaults/hmpxv1_big/auspice_config.json diff --git a/phylogenetic/config/hmpxv1_big/config.yaml b/phylogenetic/defaults/hmpxv1_big/config.yaml similarity index 67% rename from 
phylogenetic/config/hmpxv1_big/config.yaml rename to phylogenetic/defaults/hmpxv1_big/config.yaml index 513e530f..dd807541 100644 --- a/phylogenetic/config/hmpxv1_big/config.yaml +++ b/phylogenetic/defaults/hmpxv1_big/config.yaml @@ -1,12 +1,12 @@ -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -include: "config/hmpxv1_big/include.txt" -clades: "config/clades.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/hmpxv1_big/auspice_config.json" -description: "config/description.md" -tree_mask: "config/tree_mask.tsv" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +include: "defaults/hmpxv1_big/include.txt" +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1_big/auspice_config.json" +description: "defaults/description.md" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. 
# ¹ https://github.com/nextstrain/mpox/issues/33 @@ -17,7 +17,7 @@ build_name: "hmpxv1_big" auspice_name: "mpox_lineage-B.1" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 2022 min_length: 180000 @@ -57,4 +57,4 @@ recency: true mask: from_beginning: 800 from_end: 6422 - maskfile: "config/mask.bed" + maskfile: "defaults/mask.bed" diff --git a/phylogenetic/config/hmpxv1_big/include.txt b/phylogenetic/defaults/hmpxv1_big/include.txt similarity index 100% rename from phylogenetic/config/hmpxv1_big/include.txt rename to phylogenetic/defaults/hmpxv1_big/include.txt diff --git a/phylogenetic/config/lat_longs.tsv b/phylogenetic/defaults/lat_longs.tsv similarity index 100% rename from phylogenetic/config/lat_longs.tsv rename to phylogenetic/defaults/lat_longs.tsv diff --git a/phylogenetic/config/mask.bed b/phylogenetic/defaults/mask.bed similarity index 100% rename from phylogenetic/config/mask.bed rename to phylogenetic/defaults/mask.bed diff --git a/phylogenetic/config/mask_overview.bed b/phylogenetic/defaults/mask_overview.bed similarity index 100% rename from phylogenetic/config/mask_overview.bed rename to phylogenetic/defaults/mask_overview.bed diff --git a/phylogenetic/config/mpxv/auspice_config.json b/phylogenetic/defaults/mpxv/auspice_config.json similarity index 100% rename from phylogenetic/config/mpxv/auspice_config.json rename to phylogenetic/defaults/mpxv/auspice_config.json diff --git a/phylogenetic/config/mpxv/config.yaml b/phylogenetic/defaults/mpxv/config.yaml similarity index 74% rename from phylogenetic/config/mpxv/config.yaml rename to phylogenetic/defaults/mpxv/config.yaml index d569445a..5327d6f2 100644 --- a/phylogenetic/config/mpxv/config.yaml +++ b/phylogenetic/defaults/mpxv/config.yaml @@ -1,12 +1,12 @@ -auspice_config: "config/mpxv/auspice_config.json" -include: "config/mpxv/include.txt" -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: 
"config/reference.gb" -lat_longs: "config/lat_longs.tsv" -description: "config/description.md" -clades: "config/clades.tsv" -tree_mask: "config/tree_mask.tsv" +auspice_config: "defaults/mpxv/auspice_config.json" +include: "defaults/mpxv/include.txt" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +lat_longs: "defaults/lat_longs.tsv" +description: "defaults/description.md" +clades: "defaults/clades.tsv" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/mpox/issues/33 @@ -17,7 +17,7 @@ build_name: "mpxv" auspice_name: "mpox_all-clades" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 1950 min_length: 100000 @@ -74,4 +74,4 @@ recency: true mask: from_beginning: 1350 from_end: 6422 - maskfile: "config/mask_overview.bed" + maskfile: "defaults/mask_overview.bed" diff --git a/phylogenetic/config/mpxv/include.txt b/phylogenetic/defaults/mpxv/include.txt similarity index 100% rename from phylogenetic/config/mpxv/include.txt rename to phylogenetic/defaults/mpxv/include.txt diff --git a/phylogenetic/config/nextstrain_automation.yaml b/phylogenetic/defaults/nextstrain_automation.yaml similarity index 100% rename from phylogenetic/config/nextstrain_automation.yaml rename to phylogenetic/defaults/nextstrain_automation.yaml diff --git a/phylogenetic/config/reference.fasta b/phylogenetic/defaults/reference.fasta similarity index 100% rename from phylogenetic/config/reference.fasta rename to phylogenetic/defaults/reference.fasta diff --git a/phylogenetic/config/reference.gb b/phylogenetic/defaults/reference.gb similarity index 100% rename from phylogenetic/config/reference.gb rename to phylogenetic/defaults/reference.gb diff --git a/phylogenetic/config/tree_mask.tsv b/phylogenetic/defaults/tree_mask.tsv similarity index 100% rename from 
phylogenetic/config/tree_mask.tsv rename to phylogenetic/defaults/tree_mask.tsv diff --git a/phylogenetic/profiles/ci/builds.yaml b/phylogenetic/profiles/ci/builds.yaml index 08dd36c8..797154f4 100644 --- a/phylogenetic/profiles/ci/builds.yaml +++ b/phylogenetic/profiles/ci/builds.yaml @@ -1,15 +1,15 @@ custom_rules: - profiles/ci/copy_example_data.smk -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -include: "config/hmpxv1/include.txt" -clades: "config/clades.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/hmpxv1/auspice_config.json" -description: "config/description.md" -tree_mask: "config/tree_mask.tsv" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +include: "defaults/hmpxv1/include.txt" +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1/auspice_config.json" +description: "defaults/description.md" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. 
# ¹ https://github.com/nextstrain/monkeypox/issues/33 @@ -20,7 +20,7 @@ build_name: "hmpxv1" auspice_name: "mpox_clade-IIb" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 2017 min_length: 100000 @@ -81,4 +81,4 @@ recency: true mask: from_beginning: 800 from_end: 6422 - maskfile: "config/mask.bed" + maskfile: "defaults/mask.bed" diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk index 474f3a7b..75fc17f0 100644 --- a/phylogenetic/rules/core.smk +++ b/phylogenetic/rules/core.smk @@ -419,8 +419,8 @@ rule recency: rule colors: input: - ordering="config/color_ordering.tsv", - color_schemes="config/color_schemes.tsv", + ordering="defaults/color_ordering.tsv", + color_schemes="defaults/color_schemes.tsv", metadata=build_dir + "/{build_name}/metadata.tsv", output: colors=build_dir + "/{build_name}/colors.tsv", From 6eb2c38d5986ebea5599a2096fee04ec80e11c20 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 12 Feb 2024 17:04:09 -0800 Subject: [PATCH 03/16] phylo: Add build-configs directory Part of work to update this repo to match the pathogen-repo-guide. Moves the existing CI build config and customizations to the new "build-configs" directory. Includes renaming of the `builds.yaml` to `config.yaml` since the file no longer clashes with Snakemake Profiles. 
--- .github/workflows/ci.yaml | 2 +- phylogenetic/README.md | 8 +++++++- .../ci/builds.yaml => build-configs/ci/config.yaml} | 2 +- .../{profiles => build-configs}/ci/copy_example_data.smk | 0 4 files changed, 9 insertions(+), 3 deletions(-) rename phylogenetic/{profiles/ci/builds.yaml => build-configs/ci/config.yaml} (97%) rename phylogenetic/{profiles => build-configs}/ci/copy_example_data.smk (100%) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f4b3e601..91325533 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -21,7 +21,7 @@ jobs: run: | nextstrain build \ phylogenetic \ - --configfile profiles/ci/builds.yaml + --configfiles build-configs/ci/config.yaml artifact-name: output-${{ matrix.runtime }} artifact-paths: | phylogenetic/auspice/ diff --git a/phylogenetic/README.md b/phylogenetic/README.md index e1cb1cab..6f1166d6 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -28,7 +28,7 @@ Once you've run the build, you can view the results with: You can run an example build using the example data provided in this repository via: ``` -nextstrain build . --configfile profiles/ci/builds.yaml +nextstrain build . --configfile build-configs/ci/config.yaml ``` When the build has finished running, view the output Auspice trees via: @@ -113,6 +113,12 @@ The analysis pipeline is contained in `rules/core.smk`. This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`. There is little redirection and each rule should be able to be reasoned with on its own. +### Custom build configs + +The build-configs directory contains configs and customizations that override and/or extend the default workflow. + +- [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/). 
+ ## Update example data [Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). It can also be used as a small subset of real-world data. diff --git a/phylogenetic/profiles/ci/builds.yaml b/phylogenetic/build-configs/ci/config.yaml similarity index 97% rename from phylogenetic/profiles/ci/builds.yaml rename to phylogenetic/build-configs/ci/config.yaml index 797154f4..de6d5ed0 100644 --- a/phylogenetic/profiles/ci/builds.yaml +++ b/phylogenetic/build-configs/ci/config.yaml @@ -1,5 +1,5 @@ custom_rules: - - profiles/ci/copy_example_data.smk + - build-configs/ci/copy_example_data.smk reference: "defaults/reference.fasta" genemap: "defaults/genemap.gff" diff --git a/phylogenetic/profiles/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk similarity index 100% rename from phylogenetic/profiles/ci/copy_example_data.smk rename to phylogenetic/build-configs/ci/copy_example_data.smk From 2d8493c37b6e15980761897affc1eee2f4a78ba1 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 12 Feb 2024 17:30:05 -0800 Subject: [PATCH 04/16] phylo: move chores to build-configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of work to update this repo to match the pathogen-repo-guide. Since the chores rules are internal Nextstrain rules, they do not need to be part of the core workflow. This also resolves the DAG confusion that Snakemake occasionally runs into when running CI locally.¹ README.md includes the new instructions on how to invoke the workflow to update the example data. This requires two config files: 1. The CI config to provide all required config params and to ensure the example data uses correct `strain_id_field` for CI builds. 2. 
The chores config to include the custom rules ¹ https://github.com/nextstrain/mpox/issues/237 --- phylogenetic/README.md | 10 +++++++--- phylogenetic/Snakefile | 1 - .../{rules => build-configs/chores}/chores.smk | 3 +++ phylogenetic/build-configs/chores/config.yaml | 2 ++ 4 files changed, 12 insertions(+), 4 deletions(-) rename phylogenetic/{rules => build-configs/chores}/chores.smk (82%) create mode 100644 phylogenetic/build-configs/chores/config.yaml diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 6f1166d6..0920e720 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -117,16 +117,20 @@ There is little redirection and each rule should be able to be reasoned with on The build-configs directory contains configs and customizations that override and/or extend the default workflow. +- [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data). - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/). ## Update example data -[Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). It can also be used as a small subset of real-world data. +[Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). +It can also be used as a small subset of real-world data. -Example data should be updated every time metadata schema is changed or a new clade/lineage emerges. To update, run: +Example data should be updated every time metadata schema is changed or a new clade/lineage emerges. +To update, run: ```sh -nextstrain build . update_example_data -F +nextstrain build . 
update_example_data -F \ + --configfiles build-configs/ci/config.yaml build-configs/chores/config.yaml ``` ## Data use diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 85296416..1f6aaf17 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -48,7 +48,6 @@ else: include: "rules/prepare.smk" -include: "rules/chores.smk" include: "rules/core.smk" diff --git a/phylogenetic/rules/chores.smk b/phylogenetic/build-configs/chores/chores.smk similarity index 82% rename from phylogenetic/rules/chores.smk rename to phylogenetic/build-configs/chores/chores.smk index bad47d7a..2998701f 100644 --- a/phylogenetic/rules/chores.smk +++ b/phylogenetic/build-configs/chores/chores.smk @@ -1,3 +1,6 @@ +# I was hoping to use the Snakemake `default_target` directive to make this the +# default target when including this rule via `custom_rules`, but that is +# currently not possible: https://github.com/snakemake/snakemake/issues/2056 rule update_example_data: """This updates the files under example_data/ based on latest available data from data.nextstrain.org. diff --git a/phylogenetic/build-configs/chores/config.yaml b/phylogenetic/build-configs/chores/config.yaml new file mode 100644 index 00000000..cc316a83 --- /dev/null +++ b/phylogenetic/build-configs/chores/config.yaml @@ -0,0 +1,2 @@ +custom_rules: + - build-configs/chores/chores.smk From 63c5e5616ca5226386d170a7a194ffcfd63ea8f0 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 10:25:50 -0800 Subject: [PATCH 05/16] phylo: move nextstrain_automation to build-configs Part of work to update this repo to match the pathogen-repo-guide. 
--- .github/workflows/rebuild-hmpxv1-big.yaml | 2 +- .github/workflows/rebuild-hmpxv1.yaml | 2 +- .github/workflows/rebuild-mpxv.yaml | 2 +- phylogenetic/README.md | 1 + phylogenetic/Snakefile | 5 ----- .../nextstrain-automation/config.yaml} | 3 +++ .../nextstrain-automation/nextstrain-automation.smk} | 0 7 files changed, 7 insertions(+), 8 deletions(-) rename phylogenetic/{defaults/nextstrain_automation.yaml => build-configs/nextstrain-automation/config.yaml} (66%) rename phylogenetic/{rules/nextstrain_automation.smk => build-configs/nextstrain-automation/nextstrain-automation.smk} (100%) diff --git a/.github/workflows/rebuild-hmpxv1-big.yaml b/.github/workflows/rebuild-hmpxv1-big.yaml index 4b115254..5146f291 100644 --- a/.github/workflows/rebuild-hmpxv1-big.yaml +++ b/.github/workflows/rebuild-hmpxv1-big.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/.github/workflows/rebuild-hmpxv1.yaml b/.github/workflows/rebuild-hmpxv1.yaml index 1ed7a235..3db74076 100644 --- a/.github/workflows/rebuild-hmpxv1.yaml +++ b/.github/workflows/rebuild-hmpxv1.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . 
\ notify_on_deploy \ - --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/.github/workflows/rebuild-mpxv.yaml b/.github/workflows/rebuild-mpxv.yaml index fe3d614f..2c352227 100644 --- a/.github/workflows/rebuild-mpxv.yaml +++ b/.github/workflows/rebuild-mpxv.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/defaults/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 0920e720..1a17a42f 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -119,6 +119,7 @@ The build-configs directory contains configs and customizations that override an - [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data). - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/). +- [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds ## Update example data diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 1f6aaf17..a84bd5f2 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -51,11 +51,6 @@ else: include: "rules/core.smk" -if config.get("deploy_url", False): - - include: "rules/nextstrain_automation.smk" - - # Include custom rules defined in the config. 
if "custom_rules" in config: for rule_file in config["custom_rules"]: diff --git a/phylogenetic/defaults/nextstrain_automation.yaml b/phylogenetic/build-configs/nextstrain-automation/config.yaml similarity index 66% rename from phylogenetic/defaults/nextstrain_automation.yaml rename to phylogenetic/build-configs/nextstrain-automation/config.yaml index d0389c67..cdcda6de 100644 --- a/phylogenetic/defaults/nextstrain_automation.yaml +++ b/phylogenetic/build-configs/nextstrain-automation/config.yaml @@ -1,5 +1,8 @@ # Optional configs to include for automated Nextstrain builds # Intended to be used internally by the Nextstrain team +custom_rules: + - build-configs/nextstrain-automation/nextstrain-automation.smk + # deploy deploy_url: "s3://nextstrain-data" diff --git a/phylogenetic/rules/nextstrain_automation.smk b/phylogenetic/build-configs/nextstrain-automation/nextstrain-automation.smk similarity index 100% rename from phylogenetic/rules/nextstrain_automation.smk rename to phylogenetic/build-configs/nextstrain-automation/nextstrain-automation.smk From b0bbd39e89bc68a455fdbf4473dab8b8d4727500 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 10:57:11 -0800 Subject: [PATCH 06/16] phylo: Move lapis to build-configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of work to update this repo to match the pathogen-repo-guide. LAPIS was the default data source until we switched to using data.nextstrain.org.¹ This commit moves the LAPIS-related rules to a custom build-config and is able to successfully override the default download/decompress rules. However, there are downstream changes to the workflow that cause errors with the LAPIS data, e.g. hard-coded filters on columns that do not exist in the LAPIS data.² Rather than spending the time to make sure the workflow runs with the LAPIS data, I will remove the lapis build-configs in the next commit. 
I wanted to do so separately to be able to easily revert the removal if needed. ¹ https://github.com/nextstrain/mpox/commit/5d0791a06315eabb1eba43f6889996f0dcf824b2 ² https://github.com/nextstrain/mpox/blob/34feb4a4e6f4d8b7929cc701734b15b8a5e6a6fc/phylogenetic/workflow/snakemake_rules/core.smk#L43 --- phylogenetic/README.md | 1 + phylogenetic/Snakefile | 10 +--------- phylogenetic/build-configs/lapis/config.yaml | 2 ++ .../lapis}/download_via_lapis.smk | 4 ++++ 4 files changed, 8 insertions(+), 9 deletions(-) create mode 100644 phylogenetic/build-configs/lapis/config.yaml rename phylogenetic/{rules => build-configs/lapis}/download_via_lapis.smk (73%) diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 1a17a42f..00242afd 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -119,6 +119,7 @@ The build-configs directory contains configs and customizations that override an - [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data). - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/). +- [lapis](build-configs/lapis/) - builds that use LAPIS as the data source. 
- [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds ## Update example data diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index a84bd5f2..c851cf7a 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -39,15 +39,7 @@ rule all: """ -if config.get("data_source", None) == "lapis": - - include: "rules/download_via_lapis.smk" - -else: - - include: "rules/prepare.smk" - - +include: "rules/prepare.smk" include: "rules/core.smk" diff --git a/phylogenetic/build-configs/lapis/config.yaml b/phylogenetic/build-configs/lapis/config.yaml new file mode 100644 index 00000000..5e918251 --- /dev/null +++ b/phylogenetic/build-configs/lapis/config.yaml @@ -0,0 +1,2 @@ +custom_rules: + - build-configs/lapis/download_via_lapis.smk diff --git a/phylogenetic/rules/download_via_lapis.smk b/phylogenetic/build-configs/lapis/download_via_lapis.smk similarity index 73% rename from phylogenetic/rules/download_via_lapis.smk rename to phylogenetic/build-configs/lapis/download_via_lapis.smk index 55c7ae9f..57c9927b 100644 --- a/phylogenetic/rules/download_via_lapis.smk +++ b/phylogenetic/build-configs/lapis/download_via_lapis.smk @@ -16,3 +16,7 @@ rule download_metadata_via_lapis: tr -d "\r" | sed -E 's/("([^"]*)")?,/\\2\\t/g' > {output.metadata} """ + +# Override the default download/decompress rules to download data from LAPIS +ruleorder: download_sequences_via_lapis > decompress +ruleorder: download_metadata_via_lapis > decompress From 307c3cd6a61cbb22c74bad1f4d68ea93caf39ad6 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 11:19:19 -0800 Subject: [PATCH 07/16] phylo: Remove build-configs/lapis Removing build-configs/lapis instead of spending the time to make sure the workflow runs with the LAPIS data. If others need/want to use the LAPIS data, this custom build config can be easily restored. 
--- phylogenetic/README.md | 1 - phylogenetic/build-configs/lapis/config.yaml | 2 -- .../lapis/download_via_lapis.smk | 22 ------------------- 3 files changed, 25 deletions(-) delete mode 100644 phylogenetic/build-configs/lapis/config.yaml delete mode 100644 phylogenetic/build-configs/lapis/download_via_lapis.smk diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 00242afd..1a17a42f 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -119,7 +119,6 @@ The build-configs directory contains configs and customizations that override an - [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data). - [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/). -- [lapis](build-configs/lapis/) - builds that use LAPIS as the data source. - [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds ## Update example data diff --git a/phylogenetic/build-configs/lapis/config.yaml b/phylogenetic/build-configs/lapis/config.yaml deleted file mode 100644 index 5e918251..00000000 --- a/phylogenetic/build-configs/lapis/config.yaml +++ /dev/null @@ -1,2 +0,0 @@ -custom_rules: - - build-configs/lapis/download_via_lapis.smk diff --git a/phylogenetic/build-configs/lapis/download_via_lapis.smk b/phylogenetic/build-configs/lapis/download_via_lapis.smk deleted file mode 100644 index 57c9927b..00000000 --- a/phylogenetic/build-configs/lapis/download_via_lapis.smk +++ /dev/null @@ -1,22 +0,0 @@ -rule download_sequences_via_lapis: - output: - sequences="data/sequences.fasta", - shell: - """ - curl https://mpox-lapis.genspectrum.org/v1/sample/fasta --output {output.sequences} - """ - - -rule download_metadata_via_lapis: - output: - metadata="data/metadata.tsv", - shell: - """ - curl https://mpox-lapis.genspectrum.org/v1/sample/details?dataFormat=csv | \ - tr -d "\r" | - sed -E 's/("([^"]*)")?,/\\2\\t/g' > 
{output.metadata} - """ - -# Override the default download/decompress rules to download data from LAPIS -ruleorder: download_sequences_via_lapis > decompress -ruleorder: download_metadata_via_lapis > decompress From 4595120d001bdeafff9d7b01cafd6220470be7c9 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 11:51:59 -0800 Subject: [PATCH 08/16] phylo: Rename prepare.smk to prepare_sequences.smk Part of work to update this repo to match the pathogen-repo-guide. Following commit will move other rules for preparing sequences from core.smk to prepare_sequences.smk. --- phylogenetic/Snakefile | 2 +- phylogenetic/rules/{prepare.smk => prepare_sequences.smk} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename phylogenetic/rules/{prepare.smk => prepare_sequences.smk} (100%) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index c851cf7a..0e2753ef 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -39,7 +39,7 @@ rule all: """ -include: "rules/prepare.smk" +include: "rules/prepare_sequences.smk" include: "rules/core.smk" diff --git a/phylogenetic/rules/prepare.smk b/phylogenetic/rules/prepare_sequences.smk similarity index 100% rename from phylogenetic/rules/prepare.smk rename to phylogenetic/rules/prepare_sequences.smk From 730bf494cb254af2a20f3144926b03f9c9d54a3f Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 11:56:08 -0800 Subject: [PATCH 09/16] phylo: Move rules to prepare_sequences.smk Part of work to update this repo to match the pathogen-repo-guide. --- phylogenetic/rules/core.smk | 159 -------------------- phylogenetic/rules/prepare_sequences.smk | 176 +++++++++++++++++++++++ 2 files changed, 176 insertions(+), 159 deletions(-) diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk index 75fc17f0..af817f4a 100644 --- a/phylogenetic/rules/core.smk +++ b/phylogenetic/rules/core.smk @@ -13,165 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. 
""" -rule filter: - """ - Removing strains that do not satisfy certain requirements. - """ - input: - sequences="data/sequences.fasta", - metadata="data/metadata.tsv", - output: - sequences=build_dir + "/{build_name}/good_sequences.fasta", - metadata=build_dir + "/{build_name}/good_metadata.tsv", - log=build_dir + "/{build_name}/good_filter.log", - params: - exclude=config["filter"]["exclude"], - min_date=config["filter"]["min_date"], - min_length=config["filter"]["min_length"], - strain_id=config["strain_id_field"], - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} \ - --exclude {params.exclude} \ - --min-date {params.min_date} \ - --min-length {params.min_length} \ - --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \ - --output-log {output.log} - """ - - -rule subsample: - input: - metadata=rules.filter.output.metadata, - output: - strains=build_dir + "/{build_name}/{sample}_strains.txt", - log=build_dir + "/{build_name}/{sample}_filter.log", - params: - group_by=lambda w: config["subsample"][w.sample]["group_by"], - sequences_per_group=lambda w: config["subsample"][w.sample][ - "sequences_per_group" - ], - other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""), - exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}" - if "exclude_lineages" in config["subsample"][w.sample] - else "", - strain_id=config["strain_id_field"], - shell: - """ - augur filter \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output-strains {output.strains} \ - {params.group_by} \ - {params.sequences_per_group} \ - {params.exclude} \ - {params.other_filters} \ - --output-log {output.log} - """ - - -rule combine_samples: - input: - strains=lambda w: [ - 
f"{build_dir}/{w.build_name}/{sample}_strains.txt" - for sample in config["subsample"] - ], - sequences=rules.filter.output.sequences, - metadata=rules.filter.output.metadata, - include=config["include"], - output: - sequences=build_dir + "/{build_name}/filtered.fasta", - metadata=build_dir + "/{build_name}/metadata.tsv", - params: - strain_id=config["strain_id_field"], - shell: - """ - augur filter \ - --metadata-id-columns {params.strain_id} \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --exclude-all \ - --include {input.strains} {input.include}\ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} - """ - - -rule reverse_reverse_complements: - input: - metadata=build_dir + "/{build_name}/metadata.tsv", - sequences=build_dir + "/{build_name}/filtered.fasta", - output: - build_dir + "/{build_name}/reversed.fasta", - shell: - """ - python3 scripts/reverse_reversed_sequences.py \ - --metadata {input.metadata} \ - --sequences {input.sequences} \ - --output {output} - """ - - -rule align: - """ - Aligning sequences to {input.reference} - - filling gaps with N - """ - input: - sequences=rules.reverse_reverse_complements.output, - reference=config["reference"], - genemap=config["genemap"], - output: - alignment=build_dir + "/{build_name}/aligned.fasta", - insertions=build_dir + "/{build_name}/insertions.fasta", - params: - max_indel=config["max_indel"], - seed_spacing=config["seed_spacing"], - threads: workflow.cores - shell: - """ - nextalign run \ - --jobs {threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --max-indel {params.max_indel} \ - --seed-spacing {params.seed_spacing} \ - --retry-reverse-complement \ - --output-fasta - \ - --output-insertions {output.insertions} \ - {input.sequences} | seqkit seq -i > {output.alignment} - """ - - -rule mask: - """ - Mask ends of the alignment: - - from start: {params.from_start} - - from end: {params.from_end} - """ - input: - sequences=build_dir + 
"/{build_name}/aligned.fasta", - mask=config["mask"]["maskfile"], - output: - build_dir + "/{build_name}/masked.fasta", - params: - from_start=config["mask"]["from_beginning"], - from_end=config["mask"]["from_end"], - shell: - """ - augur mask \ - --sequences {input.sequences} \ - --mask {input.mask} \ - --mask-from-beginning {params.from_start} \ - --mask-from-end {params.from_end} --output {output} - """ - - rule tree: """ Building tree diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index 60f65d69..b87820c3 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -1,3 +1,20 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree. + +REQUIRED INPUTS: + + include = path to file of sequences to in force include + reference = path to reference sequence FASTA for Nextclade alignment + genemap = path to genemap GFF for Nextclade alignment + maskfile = path to maskfile of sites to be masked + +OUTPUTS: + + prepared_sequences = {build_dir}/{build_name}/masked.fasta + +""" + + rule download: """ Downloading sequences and metadata from data.nextstrain.org @@ -30,3 +47,162 @@ rule decompress: gzip --decompress --keep {input.metadata} xz --decompress --keep {input.sequences} """ + + +rule filter: + """ + Removing strains that do not satisfy certain requirements. 
+ """ + input: + sequences="data/sequences.fasta", + metadata="data/metadata.tsv", + output: + sequences=build_dir + "/{build_name}/good_sequences.fasta", + metadata=build_dir + "/{build_name}/good_metadata.tsv", + log=build_dir + "/{build_name}/good_filter.log", + params: + exclude=config["filter"]["exclude"], + min_date=config["filter"]["min_date"], + min_length=config["filter"]["min_length"], + strain_id=config["strain_id_field"], + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-sequences {output.sequences} \ + --output-metadata {output.metadata} \ + --exclude {params.exclude} \ + --min-date {params.min_date} \ + --min-length {params.min_length} \ + --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \ + --output-log {output.log} + """ + + +rule subsample: + input: + metadata=rules.filter.output.metadata, + output: + strains=build_dir + "/{build_name}/{sample}_strains.txt", + log=build_dir + "/{build_name}/{sample}_filter.log", + params: + group_by=lambda w: config["subsample"][w.sample]["group_by"], + sequences_per_group=lambda w: config["subsample"][w.sample][ + "sequences_per_group" + ], + other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""), + exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}" + if "exclude_lineages" in config["subsample"][w.sample] + else "", + strain_id=config["strain_id_field"], + shell: + """ + augur filter \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-strains {output.strains} \ + {params.group_by} \ + {params.sequences_per_group} \ + {params.exclude} \ + {params.other_filters} \ + --output-log {output.log} + """ + + +rule combine_samples: + input: + strains=lambda w: [ + f"{build_dir}/{w.build_name}/{sample}_strains.txt" + for sample in config["subsample"] + ], + 
sequences=rules.filter.output.sequences, + metadata=rules.filter.output.metadata, + include=config["include"], + output: + sequences=build_dir + "/{build_name}/filtered.fasta", + metadata=build_dir + "/{build_name}/metadata.tsv", + params: + strain_id=config["strain_id_field"], + shell: + """ + augur filter \ + --metadata-id-columns {params.strain_id} \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --exclude-all \ + --include {input.strains} {input.include}\ + --output-sequences {output.sequences} \ + --output-metadata {output.metadata} + """ + + +rule reverse_reverse_complements: + input: + metadata=build_dir + "/{build_name}/metadata.tsv", + sequences=build_dir + "/{build_name}/filtered.fasta", + output: + build_dir + "/{build_name}/reversed.fasta", + shell: + """ + python3 scripts/reverse_reversed_sequences.py \ + --metadata {input.metadata} \ + --sequences {input.sequences} \ + --output {output} + """ + + +rule align: + """ + Aligning sequences to {input.reference} + - filling gaps with N + """ + input: + sequences=rules.reverse_reverse_complements.output, + reference=config["reference"], + genemap=config["genemap"], + output: + alignment=build_dir + "/{build_name}/aligned.fasta", + insertions=build_dir + "/{build_name}/insertions.fasta", + params: + max_indel=config["max_indel"], + seed_spacing=config["seed_spacing"], + threads: workflow.cores + shell: + """ + nextalign run \ + --jobs {threads} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --max-indel {params.max_indel} \ + --seed-spacing {params.seed_spacing} \ + --retry-reverse-complement \ + --output-fasta - \ + --output-insertions {output.insertions} \ + {input.sequences} | seqkit seq -i > {output.alignment} + """ + + +rule mask: + """ + Mask ends of the alignment: + - from start: {params.from_start} + - from end: {params.from_end} + """ + input: + sequences=build_dir + "/{build_name}/aligned.fasta", + mask=config["mask"]["maskfile"], + output: + build_dir + 
"/{build_name}/masked.fasta", + params: + from_start=config["mask"]["from_beginning"], + from_end=config["mask"]["from_end"], + shell: + """ + augur mask \ + --sequences {input.sequences} \ + --mask {input.mask} \ + --mask-from-beginning {params.from_start} \ + --mask-from-end {params.from_end} --output {output} + """ From 11dcb7b731c7d5955e89d3273db123f1bb2b55be Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 12:28:29 -0800 Subject: [PATCH 10/16] phylo: Move rules to construct_phylogeny.smk Part of work to update this repo to match the pathogen-repo-guide. --- phylogenetic/Snakefile | 1 + phylogenetic/rules/construct_phylogeny.smk | 111 +++++++++++++++++++++ phylogenetic/rules/core.smk | 96 ------------------ 3 files changed, 112 insertions(+), 96 deletions(-) create mode 100644 phylogenetic/rules/construct_phylogeny.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 0e2753ef..d6add2bd 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -40,6 +40,7 @@ rule all: include: "rules/prepare_sequences.smk" +include: "rules/construct_phylogeny.smk" include: "rules/core.smk" diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk new file mode 100644 index 00000000..215c6a9a --- /dev/null +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -0,0 +1,111 @@ +""" +This part of the workflow constructs the phylogenetic tree. 
+ +REQUIRED INPUTS: + + sequences = {build_dir}/{build_name}/masked.fasta + metadata = {build_dir}/{build_name}/metadata.tsv + tree_mask = path to maskfile of sites to exclude for tree building + +OUTPUTS: + + tree = {build_dir}/{build_name}/tree.nwk + branch_lengths = {build_dir}/{build_name}/branch_lengths.json + +""" + + +rule tree: + """ + Building tree + """ + input: + alignment=build_dir + "/{build_name}/masked.fasta", + tree_mask=config["tree_mask"], + output: + tree=build_dir + "/{build_name}/tree_raw.nwk", + threads: workflow.cores + shell: + """ + augur tree \ + --alignment {input.alignment} \ + --exclude-sites {input.tree_mask} \ + --tree-builder-args="-redo" \ + --output {output.tree} \ + --nthreads {threads} + """ + + +rule fix_tree: + """ + Fixing tree + """ + input: + tree=rules.tree.output.tree, + alignment=build_dir + "/{build_name}/masked.fasta", + output: + tree=build_dir + "/{build_name}/tree_fixed.nwk", + params: + root=lambda w: config.get("treefix_root", ""), + shell: + """ + python3 scripts/fix_tree.py \ + --alignment {input.alignment} \ + --input-tree {input.tree} \ + {params.root} \ + --output {output.tree} + """ + + +rule refine: + """ + Refining tree + - estimate timetree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation + """ + input: + tree=rules.fix_tree.output.tree + if config["fix_tree"] + else rules.tree.output.tree, + alignment=build_dir + "/{build_name}/masked.fasta", + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + tree=build_dir + "/{build_name}/tree.nwk", + node_data=build_dir + "/{build_name}/branch_lengths.json", + params: + coalescent="opt", + date_inference="marginal", + clock_filter_iqd=0, + root=config["root"], + clock_rate=f"--clock-rate {config['clock_rate']}" + if "clock_rate" in config + else "", + clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}" + if "clock_std_dev" 
in config + else "", + strain_id=config["strain_id_field"], + divergence_units=config["divergence_units"], + shell: + """ + augur refine \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-tree {output.tree} \ + --timetree \ + --root {params.root} \ + --precision 3 \ + --keep-polytomies \ + --use-fft \ + {params.clock_rate} \ + {params.clock_std_dev} \ + --output-node-data {output.node_data} \ + --coalescent {params.coalescent} \ + --date-inference {params.date_inference} \ + --date-confidence \ + --divergence-units {params.divergence_units} \ + --clock-filter-iqd {params.clock_filter_iqd} + """ diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk index af817f4a..feed9fc0 100644 --- a/phylogenetic/rules/core.smk +++ b/phylogenetic/rules/core.smk @@ -13,102 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ -rule tree: - """ - Building tree - """ - input: - alignment=build_dir + "/{build_name}/masked.fasta", - tree_mask=config["tree_mask"], - output: - tree=build_dir + "/{build_name}/tree_raw.nwk", - threads: workflow.cores - shell: - """ - augur tree \ - --alignment {input.alignment} \ - --exclude-sites {input.tree_mask} \ - --tree-builder-args="-redo" \ - --output {output.tree} \ - --nthreads {threads} - """ - - -rule fix_tree: - """ - Fixing tree - """ - input: - tree=rules.tree.output.tree, - alignment=build_dir + "/{build_name}/masked.fasta", - output: - tree=build_dir + "/{build_name}/tree_fixed.nwk", - params: - root=lambda w: config.get("treefix_root", ""), - shell: - """ - python3 scripts/fix_tree.py \ - --alignment {input.alignment} \ - --input-tree {input.tree} \ - {params.root} \ - --output {output.tree} - """ - - -rule refine: - """ - Refining tree - - estimate timetree - - use {params.coalescent} coalescent timescale - - estimate {params.date_inference} node dates - - filter tips more than 
{params.clock_filter_iqd} IQDs from clock expectation - """ - input: - tree=rules.fix_tree.output.tree - if config["fix_tree"] - else rules.tree.output.tree, - alignment=build_dir + "/{build_name}/masked.fasta", - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - tree=build_dir + "/{build_name}/tree.nwk", - node_data=build_dir + "/{build_name}/branch_lengths.json", - params: - coalescent="opt", - date_inference="marginal", - clock_filter_iqd=0, - root=config["root"], - clock_rate=f"--clock-rate {config['clock_rate']}" - if "clock_rate" in config - else "", - clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}" - if "clock_std_dev" in config - else "", - strain_id=config["strain_id_field"], - divergence_units=config["divergence_units"], - shell: - """ - augur refine \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output-tree {output.tree} \ - --timetree \ - --root {params.root} \ - --precision 3 \ - --keep-polytomies \ - --use-fft \ - {params.clock_rate} \ - {params.clock_std_dev} \ - --output-node-data {output.node_data} \ - --coalescent {params.coalescent} \ - --date-inference {params.date_inference} \ - --date-confidence \ - --divergence-units {params.divergence_units} \ - --clock-filter-iqd {params.clock_filter_iqd} - """ - - rule ancestral: """ Reconstructing ancestral sequences and mutations From eb393bd6595f452dbde1e3edc1b757a58c87bfc2 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 12:43:29 -0800 Subject: [PATCH 11/16] phylo: Move rules to annotate_phylogeny.smk Part of work to update this repo to match the pathogen-repo-guide. 
--- phylogenetic/Snakefile | 1 + phylogenetic/rules/annotate_phylogeny.smk | 158 ++++++++++++++++++++++ phylogenetic/rules/core.smk | 138 ------------------- 3 files changed, 159 insertions(+), 138 deletions(-) create mode 100644 phylogenetic/rules/annotate_phylogeny.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index d6add2bd..9324171e 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -41,6 +41,7 @@ rule all: include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" include: "rules/core.smk" diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk new file mode 100644 index 00000000..564f8c28 --- /dev/null +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -0,0 +1,158 @@ +""" +This part of the workflow creates additional annotations for the phylogenetic tree. + +REQUIRED INPUTS: + + sequences = {build_dir}/{build_name}/masked.fasta + metadata = {build_dir}/{build_name}/metadata.tsv + tree = {build_dir}/{build_name}/tree.nwk + clades = path to clades definition TSV + +OUTPUTS: + + nt_muts = {build_dir}/{build_name}/nt_muts.json + aa_muts = {build_dir}/{build_name}/aa_muts.json + traits = {build_dir}/{build_name}/traits.json + clades = {build_dir}/{build_name}/clades.json + mutation_context = {build_dir}/{build_name}/mutation_context.json + recency = {build_dir}/{build_name}/recency.json + +""" + + +rule ancestral: + """ + Reconstructing ancestral sequences and mutations + """ + input: + tree=rules.refine.output.tree, + alignment=build_dir + "/{build_name}/masked.fasta", + output: + node_data=build_dir + "/{build_name}/nt_muts.json", + params: + inference="joint", + shell: + """ + augur ancestral \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --output-node-data {output.node_data} \ + --inference {params.inference} + """ + + +rule translate: + """ + Translating amino acid sequences + """ + input: + 
tree=rules.refine.output.tree, + node_data=rules.ancestral.output.node_data, + genemap=config["genemap"], + output: + node_data=build_dir + "/{build_name}/aa_muts.json", + shell: + """ + augur translate \ + --tree {input.tree} \ + --ancestral-sequences {input.node_data} \ + --reference-sequence {input.genemap} \ + --output {output.node_data} + """ + + +rule traits: + """ + Inferring ancestral traits for {params.columns!s} + - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias + """ + input: + tree=rules.refine.output.tree, + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + node_data=build_dir + "/{build_name}/traits.json", + params: + columns="country", + sampling_bias_correction=3, + strain_id=config["strain_id_field"], + shell: + """ + augur traits \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output {output.node_data} \ + --columns {params.columns} \ + --confidence \ + --sampling-bias-correction {params.sampling_bias_correction} + """ + + +rule clades: + """ + Adding internal clade labels + """ + input: + tree=rules.refine.output.tree, + aa_muts=rules.translate.output.node_data, + nuc_muts=rules.ancestral.output.node_data, + clades=config["clades"], + output: + node_data=build_dir + "/{build_name}/clades_raw.json", + log: + "logs/clades_{build_name}.txt", + shell: + """ + augur clades \ + --tree {input.tree} \ + --mutations {input.nuc_muts} {input.aa_muts} \ + --clades {input.clades} \ + --output-node-data {output.node_data} 2>&1 | tee {log} + """ + + +rule rename_clades: + input: + rules.clades.output.node_data, + output: + node_data=build_dir + "/{build_name}/clades.json", + shell: + """ + python scripts/clades_renaming.py \ + --input-node-data {input} \ + --output-node-data {output.node_data} + """ + + +rule mutation_context: + input: + tree=rules.refine.output.tree, + node_data=build_dir + "/{build_name}/nt_muts.json", + 
output: + node_data=build_dir + "/{build_name}/mutation_context.json", + shell: + """ + python3 scripts/mutation_context.py \ + --tree {input.tree} \ + --mutations {input.node_data} \ + --output {output.node_data} + """ + + +rule recency: + """ + Use metadata on submission date to construct submission recency field + """ + input: + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + node_data=build_dir + "/{build_name}/recency.json", + params: + strain_id=config["strain_id_field"], + shell: + """ + python3 scripts/construct-recency-from-submission-date.py \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output {output} 2>&1 + """ diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/core.smk index feed9fc0..3ec78c53 100644 --- a/phylogenetic/rules/core.smk +++ b/phylogenetic/rules/core.smk @@ -13,125 +13,6 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ -rule ancestral: - """ - Reconstructing ancestral sequences and mutations - """ - input: - tree=rules.refine.output.tree, - alignment=build_dir + "/{build_name}/masked.fasta", - output: - node_data=build_dir + "/{build_name}/nt_muts.json", - params: - inference="joint", - shell: - """ - augur ancestral \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --output-node-data {output.node_data} \ - --inference {params.inference} - """ - - -rule translate: - """ - Translating amino acid sequences - """ - input: - tree=rules.refine.output.tree, - node_data=rules.ancestral.output.node_data, - genemap=config["genemap"], - output: - node_data=build_dir + "/{build_name}/aa_muts.json", - shell: - """ - augur translate \ - --tree {input.tree} \ - --ancestral-sequences {input.node_data} \ - --reference-sequence {input.genemap} \ - --output {output.node_data} - """ - - -rule traits: - """ - Inferring ancestral traits for {params.columns!s} - - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially 
account for sampling bias - """ - input: - tree=rules.refine.output.tree, - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - node_data=build_dir + "/{build_name}/traits.json", - params: - columns="country", - sampling_bias_correction=3, - strain_id=config["strain_id_field"], - shell: - """ - augur traits \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output {output.node_data} \ - --columns {params.columns} \ - --confidence \ - --sampling-bias-correction {params.sampling_bias_correction} - """ - - -rule clades: - """ - Adding internal clade labels - """ - input: - tree=rules.refine.output.tree, - aa_muts=rules.translate.output.node_data, - nuc_muts=rules.ancestral.output.node_data, - clades=config["clades"], - output: - node_data=build_dir + "/{build_name}/clades_raw.json", - log: - "logs/clades_{build_name}.txt", - shell: - """ - augur clades \ - --tree {input.tree} \ - --mutations {input.nuc_muts} {input.aa_muts} \ - --clades {input.clades} \ - --output-node-data {output.node_data} 2>&1 | tee {log} - """ - - -rule rename_clades: - input: - rules.clades.output.node_data, - output: - node_data=build_dir + "/{build_name}/clades.json", - shell: - """ - python scripts/clades_renaming.py \ - --input-node-data {input} \ - --output-node-data {output.node_data} - """ - - -rule mutation_context: - input: - tree=rules.refine.output.tree, - node_data=build_dir + "/{build_name}/nt_muts.json", - output: - node_data=build_dir + "/{build_name}/mutation_context.json", - shell: - """ - python3 scripts/mutation_context.py \ - --tree {input.tree} \ - --mutations {input.node_data} \ - --output {output.node_data} - """ - - rule remove_time: input: "results/{build_name}/branch_lengths.json", @@ -143,25 +24,6 @@ rule remove_time: """ -rule recency: - """ - Use metadata on submission date to construct submission recency field - """ - input: - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - 
node_data=build_dir + "/{build_name}/recency.json", - params: - strain_id=config["strain_id_field"], - shell: - """ - python3 scripts/construct-recency-from-submission-date.py \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output {output} 2>&1 - """ - - rule colors: input: ordering="defaults/color_ordering.tsv", From 9afc4799951070bc6b9f06b86eed8c21fe905770 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 13:21:31 -0800 Subject: [PATCH 12/16] phylo: Rename core.smk to export.smk Part of work to update this repo to match the pathogen-repo-guide. The remaining rules in core.smk are all related to the final export of the Auspice JSONs. --- phylogenetic/Snakefile | 2 +- phylogenetic/rules/{core.smk => export.smk} | 31 ++++++++++++++++----- 2 files changed, 25 insertions(+), 8 deletions(-) rename phylogenetic/rules/{core.smk => export.smk} (75%) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 9324171e..d5c3e2d7 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -42,7 +42,7 @@ rule all: include: "rules/prepare_sequences.smk" include: "rules/construct_phylogeny.smk" include: "rules/annotate_phylogeny.smk" -include: "rules/core.smk" +include: "rules/export.smk" # Include custom rules defined in the config. diff --git a/phylogenetic/rules/core.smk b/phylogenetic/rules/export.smk similarity index 75% rename from phylogenetic/rules/core.smk rename to phylogenetic/rules/export.smk index 3ec78c53..f4e7ee69 100644 --- a/phylogenetic/rules/core.smk +++ b/phylogenetic/rules/export.smk @@ -1,15 +1,32 @@ """ -This part of the workflow expects input files +This part of the workflow collects the phylogenetic tree and annotations to +export a Nextstrain dataset. 
- sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv", +REQUIRED INPUTS: -and will produce output files as + metadata = {build_dir}/{build_name}/metadata.tsv + tree = {build_dir}/{build_name}/tree.nwk + branch_lengths = {build_dir}/{build_name}/branch_lengths.json + nt_muts = {build_dir}/{build_name}/nt_muts.json + aa_muts = {build_dir}/{build_name}/aa_muts.json + traits = {build_dir}/{build_name}/traits.json + clades = {build_dir}/{build_name}/clades.json + mutation_context = {build_dir}/{build_name}/mutation_context.json + color_ordering = defaults/color_ordering.tsv + color_schemes = defaults/color_schemes.tsv + lat_longs = path to lat/long TSV + description = path to description Markdown + auspice_config = path to Auspice config JSON - auspice_json = auspice_dir + "/mpox_{build_name}.json" +OPTIONAL INPUTS: + + recency = {build_dir}/{build_name}/recency.json + +OUTPUTS: + + auspice_json = {build_dir}/{build_name}/tree.json + root_sequence = {build_dir}/{build_name}/tree_root-sequence.json -Parameter are expected to sit in the `config` data structure. -In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ From 08d158899b74638522894dac44deec9b3efc4b2a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 13:42:19 -0800 Subject: [PATCH 13/16] Fix README link --- phylogenetic/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 1a17a42f..52032aa5 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -11,7 +11,7 @@ for Nextstrain's suite of software tools. ## Usage If you're unfamiliar with Nextstrain builds, you may want to follow our -[Running a Pathogen Workflow guide][] first and then come back here. +[Running a Pathogen Workflow guide](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html) first and then come back here. 
The easiest way to run this pathogen build is using the Nextstrain command-line tool from within the `phylogenetic/` directory: From 40715f997d1cdd070a15b6da4a18de6b58acca38 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 13:53:03 -0800 Subject: [PATCH 14/16] phylo: Remove scripts/deploy.py and related docs Automated builds have been deployed directly to production since https://github.com/nextstrain/mpox/commit/4017f513d7b74de039153ffa3551d131458b2785. --- phylogenetic/README.md | 22 -------- phylogenetic/scripts/deploy.py | 95 ---------------------------------- 2 files changed, 117 deletions(-) delete mode 100644 phylogenetic/scripts/deploy.py diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 52032aa5..6c29f4cc 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -76,28 +76,6 @@ Run pipeline to produce the "lineage B.1" tree for `/mpox/lineage-B.1` with: nextstrain build . --configfile defaults/hmpxv1_big/config.yaml ``` -### Deploy - -⚠️ The below is outdated and needs to be adjusted for the new build names (mpox instead of monkeypox, etc.) - -
- -Run the python script [`scripts/deploy.py`](scripts/deploy.py) to deploy the staging build to production. - -This will also automatically create a dated build where each node has a unique (random) ID so it can be targeted in shared links/narratives. - -```bash -python scripts/deploy.py --build-names hmpxv1 mpxv -``` - -If a dated build already exists it is not overwritten by default. To overwrite, pass `-f`. - -To deploy a locally built build to staging, use the `--staging` flag. - -To not deploy a dated build to production, add the `--no-dated` flag. - -
- ### Visualize results View results with: diff --git a/phylogenetic/scripts/deploy.py b/phylogenetic/scripts/deploy.py deleted file mode 100644 index 17cd9d36..00000000 --- a/phylogenetic/scripts/deploy.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -- Generate dated builds where each node has random id -- Deploy builds from staging to production - -Builds are downloaded from staging -This script is to be run independently from snakemake -""" - -import argparse -import datetime -import gzip -import json -import os -import uuid - - -def add_branch_id_recursive(node): - """ - Recursively add randomly generated id to each node in auspice json tree - """ - node["branch_attrs"]["labels"] = {} - node["branch_attrs"]["labels"]["id"] = str(uuid.uuid4())[:8] - if "children" in node: - for child in node["children"]: - add_branch_id_recursive(child) - - - -if __name__=="__main__": - parser = argparse.ArgumentParser( - description="Deploy builds from staging to production, generate dated builds where each node has random id", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - - parser.add_argument('--build-names', nargs='+', type=str, required=True, help="build names to upload") - parser.add_argument('-f','--force', action='store_true', help="force overwrite of existing dated builds") - parser.add_argument('--no-dated' , action='store_true', help="do not deploy dated build") - parser.add_argument('--staging' , action='store_true', help="deploy to staging") - parser.set_defaults(feature=True) - args = parser.parse_args() - - print(f"> Deploying builds {args.build_names} to {'staging' if args.staging else 'production'}") - print("----------------------------------------") - - if not os.path.isdir('staging'): - os.mkdir('staging') - for build_name in args.build_names: - if not args.staging: - print(f">> Deploying build {build_name} to production") - - # Upload basic builds to staging - for auspice_file in ['', '_root-sequence']: - os.system(f"aws s3 cp 
s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json s3://nextstrain-data/mpox_{build_name}{auspice_file}.json") - print(f">> Uploaded {build_name} to production: https://nextstrain.org/staging/mpox/{build_name.replace('_', '/')}/") - - - if not args.no_dated: - # Check how many today dated builds exist - today = datetime.date.today().strftime("%Y-%m-%d") - os.system(f"aws s3 ls nextstrain-data/mpox_{build_name}_{today}.json > dated_builds.txt") - - with open('dated_builds.txt') as fh: - today_dated_builds_count = len(fh.readlines()) - os.remove('dated_builds.txt') - if today_dated_builds_count == 0 or args.force: - if today_dated_builds_count > 0: - print(f">> Overwriting existing dated build due to --force flag being present") - - for auspice_file in ['', '_root-sequence']: - os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json staging/") - - # Load auspice json - with gzip.open(f"staging/mpox_{build_name}.json") as fh: - auspice_json = json.load(fh) - - add_branch_id_recursive(auspice_json['tree']) - - with open(f"staging/mpox_{build_name}_{today}.json", 'wt') as fh: - json.dump(auspice_json, fh) - - os.system(f"aws s3 cp staging/mpox_{build_name}_{today}.json s3://nextstrain-data") - os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}_root-sequence.json s3://nextstrain-data/mpox_{build_name}_{today}_root-sequence.json") - print(f">> Uploaded dated {build_name} to production: https://nextstrain.org/mpox/{build_name.replace('_', '/')}/{today}/") - - else: - print(f">> Warning: Dated {build_name} with date today already exists, skipping upload: https://nextstrain.org/mpox/{build_name.replace('_', '/')}/{today}/") - print(f">> Hint: Use the --f/--force flag to overwrite existing dated builds") - if args.staging: - print(f">> Deploying build {build_name} to staging") - for auspice_file in ['', '_root-sequence']: - os.system(f"aws s3 cp auspice/mpox_{build_name}{auspice_file}.json 
s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json") - print(f">> Uploaded {build_name} to staging: https://nextstrain.org/staging/mpox/{build_name.replace('_', '/')}/") - - print("----------------------------------------") From b2942626119828c8935f4dd14607dbf52fd1d431 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 16:23:38 -0800 Subject: [PATCH 15/16] phylo: Make `auspice_name` config optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If `auspice_name` param is not provided, then the workflow will use the required `build_name` as the default Auspice filename. Ensures that we are following the Nextstrain Snakemake style guide for configs¹ by not accessing configs with `config.get(key)`. ¹ https://docs.nextstrain.org/en/latest/reference/snakemake-style-guide.html#access-config-values-appropriately --- phylogenetic/Snakefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index d5c3e2d7..19cce102 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -16,14 +16,12 @@ if not config: build_dir = "results" - - auspice_dir = "auspice" prefix = config.get("auspice_prefix", None) AUSPICE_PREFIX = ("trial_" + prefix + "_") if prefix is not None else "" -AUSPICE_FILENAME = AUSPICE_PREFIX + config.get("auspice_name") - +# Defaults to the `build_name` if no `auspice_name` is provided in the config +AUSPICE_FILENAME = AUSPICE_PREFIX + config.get("auspice_name", config["build_name"]) rule all: input: From ce5c2d53a8b5b1866fd2c17aae008e599fd48c87 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 13 Feb 2024 17:17:40 -0800 Subject: [PATCH 16/16] phylo: update rule inputs to literal paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow the Nextstrain Snakemake style guide¹ to use literal path strings as inputs for the core workflow rules. 
This will make it possible for users to override the default workflow with `custom_rules`. ¹ https://docs.nextstrain.org/en/latest/reference/snakemake-style-guide.html#define-input-paths-with-literal-path-strings --- phylogenetic/rules/annotate_phylogeny.smk | 18 +++++++++--------- phylogenetic/rules/construct_phylogeny.smk | 6 +++--- phylogenetic/rules/export.smk | 14 +++++++------- phylogenetic/rules/prepare_sequences.smk | 8 ++++---- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk index 564f8c28..9f45b296 100644 --- a/phylogenetic/rules/annotate_phylogeny.smk +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -25,7 +25,7 @@ rule ancestral: Reconstructing ancestral sequences and mutations """ input: - tree=rules.refine.output.tree, + tree=build_dir + "/{build_name}/tree.nwk", alignment=build_dir + "/{build_name}/masked.fasta", output: node_data=build_dir + "/{build_name}/nt_muts.json", @@ -46,8 +46,8 @@ rule translate: Translating amino acid sequences """ input: - tree=rules.refine.output.tree, - node_data=rules.ancestral.output.node_data, + tree=build_dir + "/{build_name}/tree.nwk", + node_data=build_dir + "/{build_name}/nt_muts.json", genemap=config["genemap"], output: node_data=build_dir + "/{build_name}/aa_muts.json", @@ -67,7 +67,7 @@ rule traits: - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias """ input: - tree=rules.refine.output.tree, + tree=build_dir + "/{build_name}/tree.nwk", metadata=build_dir + "/{build_name}/metadata.tsv", output: node_data=build_dir + "/{build_name}/traits.json", @@ -93,9 +93,9 @@ rule clades: Adding internal clade labels """ input: - tree=rules.refine.output.tree, - aa_muts=rules.translate.output.node_data, - nuc_muts=rules.ancestral.output.node_data, + tree=build_dir + "/{build_name}/tree.nwk", + aa_muts=build_dir + "/{build_name}/aa_muts.json", + 
nuc_muts=build_dir + "/{build_name}/nt_muts.json", clades=config["clades"], output: node_data=build_dir + "/{build_name}/clades_raw.json", @@ -113,7 +113,7 @@ rule clades: rule rename_clades: input: - rules.clades.output.node_data, + build_dir + "/{build_name}/clades_raw.json", output: node_data=build_dir + "/{build_name}/clades.json", shell: @@ -126,7 +126,7 @@ rule rename_clades: rule mutation_context: input: - tree=rules.refine.output.tree, + tree=build_dir + "/{build_name}/tree.nwk", node_data=build_dir + "/{build_name}/nt_muts.json", output: node_data=build_dir + "/{build_name}/mutation_context.json", diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index 215c6a9a..fe0a7dea 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -41,7 +41,7 @@ rule fix_tree: Fixing tree """ input: - tree=rules.tree.output.tree, + tree=build_dir + "/{build_name}/tree_raw.nwk", alignment=build_dir + "/{build_name}/masked.fasta", output: tree=build_dir + "/{build_name}/tree_fixed.nwk", @@ -66,9 +66,9 @@ rule refine: - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation """ input: - tree=rules.fix_tree.output.tree + tree=build_dir + "/{build_name}/tree_fixed.nwk" if config["fix_tree"] - else rules.tree.output.tree, + else build_dir + "/{build_name}/tree_raw.nwk", alignment=build_dir + "/{build_name}/masked.fasta", metadata=build_dir + "/{build_name}/metadata.tsv", output: diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index f4e7ee69..89499ed1 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -63,18 +63,18 @@ rule export: Exporting data files for auspice """ input: - tree=rules.refine.output.tree, + tree=build_dir + "/{build_name}/tree.nwk", metadata=build_dir + "/{build_name}/metadata.tsv", branch_lengths="results/{build_name}/branch_lengths.json" if config.get("timetree", False) else 
"results/{build_name}/branch_lengths_no_time.json", - traits=rules.traits.output.node_data, - nt_muts=rules.ancestral.output.node_data, - aa_muts=rules.translate.output.node_data, + traits=build_dir + "/{build_name}/traits.json", + nt_muts=build_dir + "/{build_name}/nt_muts.json", + aa_muts=build_dir + "/{build_name}/aa_muts.json", clades=build_dir + "/{build_name}/clades.json", - mutation_context=rules.mutation_context.output.node_data, - recency=rules.recency.output.node_data if config.get("recency", False) else [], - colors=rules.colors.output.colors, + mutation_context=build_dir + "/{build_name}/mutation_context.json", + recency=build_dir + "/{build_name}/recency.json" if config.get("recency", False) else [], + colors=build_dir + "/{build_name}/colors.tsv", lat_longs=config["lat_longs"], description=config["description"], auspice_config=config["auspice_config"], diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk index b87820c3..da4b501e 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -83,7 +83,7 @@ rule filter: rule subsample: input: - metadata=rules.filter.output.metadata, + metadata=build_dir + "/{build_name}/good_metadata.tsv", output: strains=build_dir + "/{build_name}/{sample}_strains.txt", log=build_dir + "/{build_name}/{sample}_filter.log", @@ -117,8 +117,8 @@ rule combine_samples: f"{build_dir}/{w.build_name}/{sample}_strains.txt" for sample in config["subsample"] ], - sequences=rules.filter.output.sequences, - metadata=rules.filter.output.metadata, + sequences=build_dir + "/{build_name}/good_sequences.fasta", + metadata=build_dir + "/{build_name}/good_metadata.tsv", include=config["include"], output: sequences=build_dir + "/{build_name}/filtered.fasta", @@ -159,7 +159,7 @@ rule align: - filling gaps with N """ input: - sequences=rules.reverse_reverse_complements.output, + sequences=build_dir + "/{build_name}/reversed.fasta", 
reference=config["reference"], genemap=config["genemap"], output: