diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index f4b3e601..91325533 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -21,7 +21,7 @@ jobs: run: | nextstrain build \ phylogenetic \ - --configfile profiles/ci/builds.yaml + --configfiles build-configs/ci/config.yaml artifact-name: output-${{ matrix.runtime }} artifact-paths: | phylogenetic/auspice/ diff --git a/.github/workflows/rebuild-hmpxv1-big.yaml b/.github/workflows/rebuild-hmpxv1-big.yaml index 717532ab..5146f291 100644 --- a/.github/workflows/rebuild-hmpxv1-big.yaml +++ b/.github/workflows/rebuild-hmpxv1-big.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/.github/workflows/rebuild-hmpxv1.yaml b/.github/workflows/rebuild-hmpxv1.yaml index 0802f522..3db74076 100644 --- a/.github/workflows/rebuild-hmpxv1.yaml +++ b/.github/workflows/rebuild-hmpxv1.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/.github/workflows/rebuild-mpxv.yaml b/.github/workflows/rebuild-mpxv.yaml index f07f77b7..2c352227 100644 --- a/.github/workflows/rebuild-mpxv.yaml +++ b/.github/workflows/rebuild-mpxv.yaml @@ -63,5 +63,5 @@ jobs: --env SLACK_CHANNELS \ . \ notify_on_deploy \ - --configfiles $BUILD_DIR/config/$BUILD_NAME/config.yaml $BUILD_DIR/config/nextstrain_automation.yaml \ + --configfiles $BUILD_DIR/defaults/$BUILD_NAME/config.yaml $BUILD_DIR/build-configs/nextstrain-automation/config.yaml \ $CONFIG_OVERRIDES --directory $BUILD_DIR --snakefile $BUILD_DIR/Snakefile diff --git a/phylogenetic/README.md b/phylogenetic/README.md index 99c5d7c1..6c29f4cc 100644 --- a/phylogenetic/README.md +++ b/phylogenetic/README.md @@ -11,7 +11,7 @@ for Nextstrain's suite of software tools. ## Usage If you're unfamiliar with Nextstrain builds, you may want to follow our -[Running a Pathogen Workflow guide][] first and then come back here. +[Running a Pathogen Workflow guide](https://docs.nextstrain.org/en/latest/tutorials/running-a-workflow.html) first and then come back here. The easiest way to run this pathogen build is using the Nextstrain command-line tool from within the `phylogenetic/` directory: @@ -28,7 +28,7 @@ Once you've run the build, you can view the results with: You can run an example build using the example data provided in this repository via: ``` -nextstrain build . --configfile profiles/ci/builds.yaml +nextstrain build . --configfile build-configs/ci/config.yaml ``` When the build has finished running, view the output Auspice trees via: @@ -61,43 +61,21 @@ nextstrain build . data/sequences.fasta data/metadata.tsv Run pipeline to produce the "overview" tree for `/mpox/all-clades` with: ```bash -nextstrain build . --configfile config/mpxv/config.yaml +nextstrain build . --configfile defaults/mpxv/config.yaml ``` Run pipeline to produce the "clade IIb" tree for `/mpox/clade-IIb` with: ```bash -nextstrain build . --configfile config/hmpxv1/config.yaml +nextstrain build . --configfile defaults/hmpxv1/config.yaml ``` Run pipeline to produce the "lineage B.1" tree for `/mpox/lineage-B.1` with: ```bash -nextstrain build . --configfile config/hmpxv1_big/config.yaml +nextstrain build . --configfile defaults/hmpxv1_big/config.yaml ``` -### Deploy - -⚠️ The below is outdated and needs to be adjusted for the new build names (mpox instead of monkeypox, etc.) - -
- -Run the python script [`scripts/deploy.py`](scripts/deploy.py) to deploy the staging build to production. - -This will also automatically create a dated build where each node has a unique (random) ID so it can be targeted in shared links/narratives. - -```bash -python scripts/deploy.py --build-names hmpxv1 mpxv -``` - -If a dated build already exists it is not overwritten by default. To overwrite, pass `-f`. - -To deploy a locally built build to staging, use the `--staging` flag. - -To not deploy a dated build to production, add the `--no-dated` flag. - -
- ### Visualize results View results with: @@ -108,19 +86,30 @@ nextstrain view . ## Configuration -Configuration takes place in `config/*/config.yaml` files for each build. -The analysis pipeline is contained in `workflow/snakemake_rule/core.smk`. +The default configuration takes place in `defaults/*/config.yaml` files for each build. +The analysis pipeline is contained in `rules/core.smk`. This can be read top-to-bottom, each rule specifies its file inputs and output and pulls its parameters from `config`. There is little redirection and each rule should be able to be reasoned with on its own. +### Custom build configs + +The build-configs directory contains configs and customizations that override and/or extend the default workflow. + +- [chores](build-configs/chores/) - internal Nextstrain chores such as [updating the example data](#update-example-data). +- [ci](build-configs/ci/) - CI build that run the [example build](#example-build) with the [example data](example_data/). +- [nextstrain-automation](build-configs/nextstrain-automation/) - internal Nextstrain automated builds + ## Update example data -[Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). It can also be used as a small subset of real-world data. +[Example data](./example_data/) is used by [CI](https://github.com/nextstrain/mpox/actions/workflows/ci.yaml). +It can also be used as a small subset of real-world data. -Example data should be updated every time metadata schema is changed or a new clade/lineage emerges. To update, run: +Example data should be updated every time metadata schema is changed or a new clade/lineage emerges. +To update, run: ```sh -nextstrain build . update_example_data -F +nextstrain build . update_example_data -F \ + --configfiles build-configs/ci/config.yaml build-configs/chores/config.yaml ``` ## Data use diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 285012c9..19cce102 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -12,18 +12,16 @@ if version.parse(augur_version) < version.parse(min_augur_version): if not config: - configfile: "config/hmpxv1/config.yaml" + configfile: "defaults/hmpxv1/config.yaml" build_dir = "results" - - auspice_dir = "auspice" prefix = config.get("auspice_prefix", None) AUSPICE_PREFIX = ("trial_" + prefix + "_") if prefix is not None else "" -AUSPICE_FILENAME = AUSPICE_PREFIX + config.get("auspice_name") - +# Defaults to the `build_name` if no `auspice_name` is provided in the config +AUSPICE_FILENAME = AUSPICE_PREFIX + config.get("auspice_name", config["build_name"]) rule all: input: @@ -39,22 +37,10 @@ rule all: """ -if config.get("data_source", None) == "lapis": - - include: "workflow/snakemake_rules/download_via_lapis.smk" - -else: - - include: "workflow/snakemake_rules/prepare.smk" - - -include: "workflow/snakemake_rules/chores.smk" -include: "workflow/snakemake_rules/core.smk" - - -if config.get("deploy_url", False): - - include: "workflow/snakemake_rules/nextstrain_automation.smk" +include: "rules/prepare_sequences.smk" +include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" +include: "rules/export.smk" # Include custom rules defined in the config. diff --git a/phylogenetic/workflow/snakemake_rules/chores.smk b/phylogenetic/build-configs/chores/chores.smk similarity index 82% rename from phylogenetic/workflow/snakemake_rules/chores.smk rename to phylogenetic/build-configs/chores/chores.smk index bad47d7a..2998701f 100644 --- a/phylogenetic/workflow/snakemake_rules/chores.smk +++ b/phylogenetic/build-configs/chores/chores.smk @@ -1,3 +1,6 @@ +# I was hoping to use the Snakemake `default_target` directive to make this the +# default target when including this rule via `custom_rules`, but that is +# currently not possible: https://github.com/snakemake/snakemake/issues/2056 rule update_example_data: """This updates the files under example_data/ based on latest available data from data.nextstrain.org. diff --git a/phylogenetic/build-configs/chores/config.yaml b/phylogenetic/build-configs/chores/config.yaml new file mode 100644 index 00000000..cc316a83 --- /dev/null +++ b/phylogenetic/build-configs/chores/config.yaml @@ -0,0 +1,2 @@ +custom_rules: + - build-configs/chores/chores.smk diff --git a/phylogenetic/profiles/ci/builds.yaml b/phylogenetic/build-configs/ci/config.yaml similarity index 75% rename from phylogenetic/profiles/ci/builds.yaml rename to phylogenetic/build-configs/ci/config.yaml index 08dd36c8..de6d5ed0 100644 --- a/phylogenetic/profiles/ci/builds.yaml +++ b/phylogenetic/build-configs/ci/config.yaml @@ -1,15 +1,15 @@ custom_rules: - - profiles/ci/copy_example_data.smk + - build-configs/ci/copy_example_data.smk -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -include: "config/hmpxv1/include.txt" -clades: "config/clades.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/hmpxv1/auspice_config.json" -description: "config/description.md" -tree_mask: "config/tree_mask.tsv" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +include: "defaults/hmpxv1/include.txt" +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1/auspice_config.json" +description: "defaults/description.md" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/monkeypox/issues/33 @@ -20,7 +20,7 @@ build_name: "hmpxv1" auspice_name: "mpox_clade-IIb" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 2017 min_length: 100000 @@ -81,4 +81,4 @@ recency: true mask: from_beginning: 800 from_end: 6422 - maskfile: "config/mask.bed" + maskfile: "defaults/mask.bed" diff --git a/phylogenetic/profiles/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk similarity index 100% rename from phylogenetic/profiles/ci/copy_example_data.smk rename to phylogenetic/build-configs/ci/copy_example_data.smk diff --git a/phylogenetic/config/nextstrain_automation.yaml b/phylogenetic/build-configs/nextstrain-automation/config.yaml similarity index 66% rename from phylogenetic/config/nextstrain_automation.yaml rename to phylogenetic/build-configs/nextstrain-automation/config.yaml index d0389c67..cdcda6de 100644 --- a/phylogenetic/config/nextstrain_automation.yaml +++ b/phylogenetic/build-configs/nextstrain-automation/config.yaml @@ -1,5 +1,8 @@ # Optional configs to include for automated Nextstrain builds # Intended to be used internally by the Nextstrain team +custom_rules: + - build-configs/nextstrain-automation/nextstrain-automation.smk + # deploy deploy_url: "s3://nextstrain-data" diff --git a/phylogenetic/workflow/snakemake_rules/nextstrain_automation.smk b/phylogenetic/build-configs/nextstrain-automation/nextstrain-automation.smk similarity index 100% rename from phylogenetic/workflow/snakemake_rules/nextstrain_automation.smk rename to phylogenetic/build-configs/nextstrain-automation/nextstrain-automation.smk diff --git a/phylogenetic/config/clades.tsv b/phylogenetic/defaults/clades.tsv similarity index 100% rename from phylogenetic/config/clades.tsv rename to phylogenetic/defaults/clades.tsv diff --git a/phylogenetic/config/color_ordering.tsv b/phylogenetic/defaults/color_ordering.tsv similarity index 100% rename from phylogenetic/config/color_ordering.tsv rename to phylogenetic/defaults/color_ordering.tsv diff --git a/phylogenetic/config/color_schemes.tsv b/phylogenetic/defaults/color_schemes.tsv similarity index 100% rename from phylogenetic/config/color_schemes.tsv rename to phylogenetic/defaults/color_schemes.tsv diff --git a/phylogenetic/config/description.md b/phylogenetic/defaults/description.md similarity index 96% rename from phylogenetic/config/description.md rename to phylogenetic/defaults/description.md index a33eb1d3..624be722 100644 --- a/phylogenetic/config/description.md +++ b/phylogenetic/defaults/description.md @@ -14,7 +14,7 @@ Our bioinformatic processing workflow can be found at [github.com/nextstrain/mpo - masking several regions of the genome, including the first 1350 and last 6422 base pairs and multiple repetitive regions of variable length - phylogenetic reconstruction using [IQTREE-2](http://www.iqtree.org/) - ancestral state reconstruction and temporal inference using [TreeTime](https://github.com/neherlab/treetime) -- clade assignment via [clade definitions defined here](https://github.com/nextstrain/mpox/blob/master/config/clades.tsv), to label broader MPXV clades I, IIa and IIb and to label hMPXV1 lineages A, A.1, A.1.1, etc... +- clade assignment via [clade definitions defined here](https://github.com/nextstrain/mpox/blob/master/defaults/clades.tsv), to label broader MPXV clades I, IIa and IIb and to label hMPXV1 lineages A, A.1, A.1.1, etc... #### Underlying data We curate sequence data and metadata from the [NCBI Datasets command line tools](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/), diff --git a/phylogenetic/config/exclude_accessions.txt b/phylogenetic/defaults/exclude_accessions.txt similarity index 100% rename from phylogenetic/config/exclude_accessions.txt rename to phylogenetic/defaults/exclude_accessions.txt diff --git a/phylogenetic/config/genemap.gff b/phylogenetic/defaults/genemap.gff similarity index 100% rename from phylogenetic/config/genemap.gff rename to phylogenetic/defaults/genemap.gff diff --git a/phylogenetic/config/hmpxv1/auspice_config.json b/phylogenetic/defaults/hmpxv1/auspice_config.json similarity index 100% rename from phylogenetic/config/hmpxv1/auspice_config.json rename to phylogenetic/defaults/hmpxv1/auspice_config.json diff --git a/phylogenetic/config/hmpxv1/config.yaml b/phylogenetic/defaults/hmpxv1/config.yaml similarity index 76% rename from phylogenetic/config/hmpxv1/config.yaml rename to phylogenetic/defaults/hmpxv1/config.yaml index a9b6739a..6e416e95 100644 --- a/phylogenetic/config/hmpxv1/config.yaml +++ b/phylogenetic/defaults/hmpxv1/config.yaml @@ -1,12 +1,12 @@ -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -include: "config/hmpxv1/include.txt" -clades: "config/clades.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/hmpxv1/auspice_config.json" -description: "config/description.md" -tree_mask: "config/tree_mask.tsv" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +include: "defaults/hmpxv1/include.txt" +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1/auspice_config.json" +description: "defaults/description.md" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/mpox/issues/33 @@ -17,7 +17,7 @@ build_name: "hmpxv1" auspice_name: "mpox_clade-IIb" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 2017 min_length: 100000 @@ -78,4 +78,4 @@ recency: true mask: from_beginning: 800 from_end: 6422 - maskfile: "config/mask.bed" + maskfile: "defaults/mask.bed" diff --git a/phylogenetic/config/hmpxv1/include.txt b/phylogenetic/defaults/hmpxv1/include.txt similarity index 100% rename from phylogenetic/config/hmpxv1/include.txt rename to phylogenetic/defaults/hmpxv1/include.txt diff --git a/phylogenetic/config/hmpxv1_big/auspice_config.json b/phylogenetic/defaults/hmpxv1_big/auspice_config.json similarity index 100% rename from phylogenetic/config/hmpxv1_big/auspice_config.json rename to phylogenetic/defaults/hmpxv1_big/auspice_config.json diff --git a/phylogenetic/config/hmpxv1_big/config.yaml b/phylogenetic/defaults/hmpxv1_big/config.yaml similarity index 67% rename from phylogenetic/config/hmpxv1_big/config.yaml rename to phylogenetic/defaults/hmpxv1_big/config.yaml index 513e530f..dd807541 100644 --- a/phylogenetic/config/hmpxv1_big/config.yaml +++ b/phylogenetic/defaults/hmpxv1_big/config.yaml @@ -1,12 +1,12 @@ -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -include: "config/hmpxv1_big/include.txt" -clades: "config/clades.tsv" -lat_longs: "config/lat_longs.tsv" -auspice_config: "config/hmpxv1_big/auspice_config.json" -description: "config/description.md" -tree_mask: "config/tree_mask.tsv" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +include: "defaults/hmpxv1_big/include.txt" +clades: "defaults/clades.tsv" +lat_longs: "defaults/lat_longs.tsv" +auspice_config: "defaults/hmpxv1_big/auspice_config.json" +description: "defaults/description.md" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/mpox/issues/33 @@ -17,7 +17,7 @@ build_name: "hmpxv1_big" auspice_name: "mpox_lineage-B.1" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 2022 min_length: 180000 @@ -57,4 +57,4 @@ recency: true mask: from_beginning: 800 from_end: 6422 - maskfile: "config/mask.bed" + maskfile: "defaults/mask.bed" diff --git a/phylogenetic/config/hmpxv1_big/include.txt b/phylogenetic/defaults/hmpxv1_big/include.txt similarity index 100% rename from phylogenetic/config/hmpxv1_big/include.txt rename to phylogenetic/defaults/hmpxv1_big/include.txt diff --git a/phylogenetic/config/lat_longs.tsv b/phylogenetic/defaults/lat_longs.tsv similarity index 100% rename from phylogenetic/config/lat_longs.tsv rename to phylogenetic/defaults/lat_longs.tsv diff --git a/phylogenetic/config/mask.bed b/phylogenetic/defaults/mask.bed similarity index 100% rename from phylogenetic/config/mask.bed rename to phylogenetic/defaults/mask.bed diff --git a/phylogenetic/config/mask_overview.bed b/phylogenetic/defaults/mask_overview.bed similarity index 100% rename from phylogenetic/config/mask_overview.bed rename to phylogenetic/defaults/mask_overview.bed diff --git a/phylogenetic/config/mpxv/auspice_config.json b/phylogenetic/defaults/mpxv/auspice_config.json similarity index 100% rename from phylogenetic/config/mpxv/auspice_config.json rename to phylogenetic/defaults/mpxv/auspice_config.json diff --git a/phylogenetic/config/mpxv/config.yaml b/phylogenetic/defaults/mpxv/config.yaml similarity index 74% rename from phylogenetic/config/mpxv/config.yaml rename to phylogenetic/defaults/mpxv/config.yaml index d569445a..5327d6f2 100644 --- a/phylogenetic/config/mpxv/config.yaml +++ b/phylogenetic/defaults/mpxv/config.yaml @@ -1,12 +1,12 @@ -auspice_config: "config/mpxv/auspice_config.json" -include: "config/mpxv/include.txt" -reference: "config/reference.fasta" -genemap: "config/genemap.gff" -genbank_reference: "config/reference.gb" -lat_longs: "config/lat_longs.tsv" -description: "config/description.md" -clades: "config/clades.tsv" -tree_mask: "config/tree_mask.tsv" +auspice_config: "defaults/mpxv/auspice_config.json" +include: "defaults/mpxv/include.txt" +reference: "defaults/reference.fasta" +genemap: "defaults/genemap.gff" +genbank_reference: "defaults/reference.gb" +lat_longs: "defaults/lat_longs.tsv" +description: "defaults/description.md" +clades: "defaults/clades.tsv" +tree_mask: "defaults/tree_mask.tsv" # Use `accession` as the ID column since `strain` currently contains duplicates¹. # ¹ https://github.com/nextstrain/mpox/issues/33 @@ -17,7 +17,7 @@ build_name: "mpxv" auspice_name: "mpox_all-clades" filter: - exclude: "config/exclude_accessions.txt" + exclude: "defaults/exclude_accessions.txt" min_date: 1950 min_length: 100000 @@ -74,4 +74,4 @@ recency: true mask: from_beginning: 1350 from_end: 6422 - maskfile: "config/mask_overview.bed" + maskfile: "defaults/mask_overview.bed" diff --git a/phylogenetic/config/mpxv/include.txt b/phylogenetic/defaults/mpxv/include.txt similarity index 100% rename from phylogenetic/config/mpxv/include.txt rename to phylogenetic/defaults/mpxv/include.txt diff --git a/phylogenetic/config/reference.fasta b/phylogenetic/defaults/reference.fasta similarity index 100% rename from phylogenetic/config/reference.fasta rename to phylogenetic/defaults/reference.fasta diff --git a/phylogenetic/config/reference.gb b/phylogenetic/defaults/reference.gb similarity index 100% rename from phylogenetic/config/reference.gb rename to phylogenetic/defaults/reference.gb diff --git a/phylogenetic/config/tree_mask.tsv b/phylogenetic/defaults/tree_mask.tsv similarity index 100% rename from phylogenetic/config/tree_mask.tsv rename to phylogenetic/defaults/tree_mask.tsv diff --git a/phylogenetic/rules/annotate_phylogeny.smk b/phylogenetic/rules/annotate_phylogeny.smk new file mode 100644 index 00000000..9f45b296 --- /dev/null +++ b/phylogenetic/rules/annotate_phylogeny.smk @@ -0,0 +1,158 @@ +""" +This part of the workflow creates additional annotations for the phylogenetic tree. + +REQUIRED INPUTS: + + sequences = {build_dir}/{build_name}/masked.fasta + metadata = {build_dir}/{build_name}/metadata.tsv + tree = {build_dir}/{build_name}/tree.nwk + clades = path to clades definition TSV + +OUTPUTS: + + nt_muts = {build_dir}/{build_name}/nt_muts.json + aa_muts = {build_dir}/{build_name}/aa_muts.json + traits = {build_dir}/{build_name}/traits.json + clades = {build_dir}/{build_name}/clades.json + mutation_context = {build_dir}/{build_name}/mutation_context.json + recency = {build_dir}/{build_name}/recency.json + +""" + + +rule ancestral: + """ + Reconstructing ancestral sequences and mutations + """ + input: + tree=build_dir + "/{build_name}/tree.nwk", + alignment=build_dir + "/{build_name}/masked.fasta", + output: + node_data=build_dir + "/{build_name}/nt_muts.json", + params: + inference="joint", + shell: + """ + augur ancestral \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --output-node-data {output.node_data} \ + --inference {params.inference} + """ + + +rule translate: + """ + Translating amino acid sequences + """ + input: + tree=build_dir + "/{build_name}/tree.nwk", + node_data=build_dir + "/{build_name}/nt_muts.json", + genemap=config["genemap"], + output: + node_data=build_dir + "/{build_name}/aa_muts.json", + shell: + """ + augur translate \ + --tree {input.tree} \ + --ancestral-sequences {input.node_data} \ + --reference-sequence {input.genemap} \ + --output {output.node_data} + """ + + +rule traits: + """ + Inferring ancestral traits for {params.columns!s} + - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias + """ + input: + tree=build_dir + "/{build_name}/tree.nwk", + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + node_data=build_dir + "/{build_name}/traits.json", + params: + columns="country", + sampling_bias_correction=3, + strain_id=config["strain_id_field"], + shell: + """ + augur traits \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output {output.node_data} \ + --columns {params.columns} \ + --confidence \ + --sampling-bias-correction {params.sampling_bias_correction} + """ + + +rule clades: + """ + Adding internal clade labels + """ + input: + tree=build_dir + "/{build_name}/tree.nwk", + aa_muts=build_dir + "/{build_name}/aa_muts.json", + nuc_muts=build_dir + "/{build_name}/nt_muts.json", + clades=config["clades"], + output: + node_data=build_dir + "/{build_name}/clades_raw.json", + log: + "logs/clades_{build_name}.txt", + shell: + """ + augur clades \ + --tree {input.tree} \ + --mutations {input.nuc_muts} {input.aa_muts} \ + --clades {input.clades} \ + --output-node-data {output.node_data} 2>&1 | tee {log} + """ + + +rule rename_clades: + input: + build_dir + "/{build_name}/clades_raw.json", + output: + node_data=build_dir + "/{build_name}/clades.json", + shell: + """ + python scripts/clades_renaming.py \ + --input-node-data {input} \ + --output-node-data {output.node_data} + """ + + +rule mutation_context: + input: + tree=build_dir + "/{build_name}/tree.nwk", + node_data=build_dir + "/{build_name}/nt_muts.json", + output: + node_data=build_dir + "/{build_name}/mutation_context.json", + shell: + """ + python3 scripts/mutation_context.py \ + --tree {input.tree} \ + --mutations {input.node_data} \ + --output {output.node_data} + """ + + +rule recency: + """ + Use metadata on submission date to construct submission recency field + """ + input: + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + node_data=build_dir + "/{build_name}/recency.json", + params: + strain_id=config["strain_id_field"], + shell: + """ + python3 scripts/construct-recency-from-submission-date.py \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output {output} 2>&1 + """ diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk new file mode 100644 index 00000000..fe0a7dea --- /dev/null +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -0,0 +1,111 @@ +""" +This part of the workflow constructs the phylogenetic tree. + +REQUIRED INPUTS: + + sequences = {build_dir}/{build_name}/masked.fasta + metadata = {build_dir}/{build_name}/metadata.tsv + tree_mask = path to maskfile of sites to exclude for tree building + +OUTPUTS: + + tree = {build_dir}/{build_name}/tree.nwk + branch_lengths = {build_dir}/{build_name}/branch_lengths.json + +""" + + +rule tree: + """ + Building tree + """ + input: + alignment=build_dir + "/{build_name}/masked.fasta", + tree_mask=config["tree_mask"], + output: + tree=build_dir + "/{build_name}/tree_raw.nwk", + threads: workflow.cores + shell: + """ + augur tree \ + --alignment {input.alignment} \ + --exclude-sites {input.tree_mask} \ + --tree-builder-args="-redo" \ + --output {output.tree} \ + --nthreads {threads} + """ + + +rule fix_tree: + """ + Fixing tree + """ + input: + tree=build_dir + "/{build_name}/tree_raw.nwk", + alignment=build_dir + "/{build_name}/masked.fasta", + output: + tree=build_dir + "/{build_name}/tree_fixed.nwk", + params: + root=lambda w: config.get("treefix_root", ""), + shell: + """ + python3 scripts/fix_tree.py \ + --alignment {input.alignment} \ + --input-tree {input.tree} \ + {params.root} \ + --output {output.tree} + """ + + +rule refine: + """ + Refining tree + - estimate timetree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation + """ + input: + tree=build_dir + "/{build_name}/tree_fixed.nwk" + if config["fix_tree"] + else build_dir + "/{build_name}/tree_raw.nwk", + alignment=build_dir + "/{build_name}/masked.fasta", + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + tree=build_dir + "/{build_name}/tree.nwk", + node_data=build_dir + "/{build_name}/branch_lengths.json", + params: + coalescent="opt", + date_inference="marginal", + clock_filter_iqd=0, + root=config["root"], + clock_rate=f"--clock-rate {config['clock_rate']}" + if "clock_rate" in config + else "", + clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}" + if "clock_std_dev" in config + else "", + strain_id=config["strain_id_field"], + divergence_units=config["divergence_units"], + shell: + """ + augur refine \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-tree {output.tree} \ + --timetree \ + --root {params.root} \ + --precision 3 \ + --keep-polytomies \ + --use-fft \ + {params.clock_rate} \ + {params.clock_std_dev} \ + --output-node-data {output.node_data} \ + --coalescent {params.coalescent} \ + --date-inference {params.date_inference} \ + --date-confidence \ + --divergence-units {params.divergence_units} \ + --clock-filter-iqd {params.clock_filter_iqd} + """ diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk new file mode 100644 index 00000000..89499ed1 --- /dev/null +++ b/phylogenetic/rules/export.smk @@ -0,0 +1,121 @@ +""" +This part of the workflow collects the phylogenetic tree and annotations to +export a Nextstrain dataset. + +REQUIRED INPUTS: + + metadata = {build_dir}/{build_name}/metadata.tsv + tree = {build_dir}/{build_name}/tree.nwk + branch_lengths = {build_dir}/{build_name}/branch_lengths.json + nt_muts = {build_dir}/{build_name}/nt_muts.json + aa_muts = {build_dir}/{build_name}/aa_muts.json + traits = {build_dir}/{build_name}/traits.json + clades = {build_dir}/{build_name}/clades.json + mutation_context = {build_dir}/{build_name}/mutation_context.json + color_ordering = defaults/color_ordering.tsv + color_schemes = defaults/color_schemes.tsv + lat_longs = path to lat/long TSV + description = path to description Markdown + auspice_config = path to Auspice config JSON + +OPTIONAL INPUTS: + + recency = {build_dir}/{build_name}/recency.json + +OUTPUTS: + + auspice_json = {build_dir}/{build_name}/tree.json + root_sequence = {build_dir}/{build_name}/tree_root-sequence.json + +""" + + +rule remove_time: + input: + "results/{build_name}/branch_lengths.json", + output: + "results/{build_name}/branch_lengths_no_time.json", + shell: + """ + python3 scripts/remove_timeinfo.py --input-node-data {input} --output-node-data {output} + """ + + +rule colors: + input: + ordering="defaults/color_ordering.tsv", + color_schemes="defaults/color_schemes.tsv", + metadata=build_dir + "/{build_name}/metadata.tsv", + output: + colors=build_dir + "/{build_name}/colors.tsv", + shell: + """ + python3 scripts/assign-colors.py \ + --ordering {input.ordering} \ + --color-schemes {input.color_schemes} \ + --output {output.colors} \ + --metadata {input.metadata} 2>&1 + """ + + +rule export: + """ + Exporting data files for auspice + """ + input: + tree=build_dir + "/{build_name}/tree.nwk", + metadata=build_dir + "/{build_name}/metadata.tsv", + branch_lengths="results/{build_name}/branch_lengths.json" + if config.get("timetree", False) + else "results/{build_name}/branch_lengths_no_time.json", + traits=build_dir + "/{build_name}/traits.json", + nt_muts=build_dir + "/{build_name}/nt_muts.json", + aa_muts=build_dir + "/{build_name}/aa_muts.json", + clades=build_dir + "/{build_name}/clades.json", + mutation_context=build_dir + "/{build_name}/mutation_context.json", + recency=build_dir + "/{build_name}/recency.json" if config.get("recency", False) else [], + colors=build_dir + "/{build_name}/colors.tsv", + lat_longs=config["lat_longs"], + description=config["description"], + auspice_config=config["auspice_config"], + output: + auspice_json=build_dir + "/{build_name}/raw_tree.json", + root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json", + params: + strain_id=config["strain_id_field"], + shell: + """ + augur export v2 \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\ + --colors {input.colors} \ + --lat-longs {input.lat_longs} \ + --description {input.description} \ + --auspice-config {input.auspice_config} \ + --include-root-sequence \ + --output {output.auspice_json} + """ + + +rule final_strain_name: + input: + auspice_json=build_dir + "/{build_name}/raw_tree.json", + metadata=build_dir + "/{build_name}/metadata.tsv", + root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json", + output: + auspice_json=build_dir + "/{build_name}/tree.json", + root_sequence=build_dir + "/{build_name}/tree_root-sequence.json", + params: + strain_id=config["strain_id_field"], + display_strain_field=config.get("display_strain_field", "strain"), + shell: + """ + python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --input-auspice-json {input.auspice_json} \ + --display-strain-name {params.display_strain_field} \ + --output {output.auspice_json} + cp {input.root_sequence} {output.root_sequence} + """ diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk new file mode 100644 index 00000000..da4b501e --- /dev/null +++ b/phylogenetic/rules/prepare_sequences.smk @@ -0,0 +1,208 @@ +""" +This part of the workflow prepares sequences for constructing the phylogenetic tree. + +REQUIRED INPUTS: + + include = path to file of sequences to in force include + reference = path to reference sequence FASTA for Nextclade alignment + genemap = path to genemap GFF for Nextclade alignment + maskfile = path to maskfile of sites to be masked + +OUTPUTS: + + prepared_sequences = {build_dir}/{build_name}/masked.fasta + +""" + + +rule download: + """ + Downloading sequences and metadata from data.nextstrain.org + """ + output: + sequences="data/sequences.fasta.xz", + metadata="data/metadata.tsv.gz", + params: + sequences_url="https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz", + metadata_url="https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz", + shell: + """ + curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} + curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} + """ + + +rule decompress: + """ + Decompressing sequences and metadata + """ + input: + sequences="data/sequences.fasta.xz", + metadata="data/metadata.tsv.gz", + output: + sequences="data/sequences.fasta", + metadata="data/metadata.tsv", + shell: + """ + gzip --decompress --keep {input.metadata} + xz --decompress --keep {input.sequences} + """ + + +rule filter: + """ + Removing strains that do not satisfy certain requirements. + """ + input: + sequences="data/sequences.fasta", + metadata="data/metadata.tsv", + output: + sequences=build_dir + "/{build_name}/good_sequences.fasta", + metadata=build_dir + "/{build_name}/good_metadata.tsv", + log=build_dir + "/{build_name}/good_filter.log", + params: + exclude=config["filter"]["exclude"], + min_date=config["filter"]["min_date"], + min_length=config["filter"]["min_length"], + strain_id=config["strain_id_field"], + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-sequences {output.sequences} \ + --output-metadata {output.metadata} \ + --exclude {params.exclude} \ + --min-date {params.min_date} \ + --min-length {params.min_length} \ + --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \ + --output-log {output.log} + """ + + +rule subsample: + input: + metadata=build_dir + "/{build_name}/good_metadata.tsv", + output: + strains=build_dir + "/{build_name}/{sample}_strains.txt", + log=build_dir + "/{build_name}/{sample}_filter.log", + params: + group_by=lambda w: config["subsample"][w.sample]["group_by"], + sequences_per_group=lambda w: config["subsample"][w.sample][ + "sequences_per_group" + ], + other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""), + exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}" + if "exclude_lineages" in config["subsample"][w.sample] + else "", + strain_id=config["strain_id_field"], + shell: + """ + augur filter \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --output-strains {output.strains} \ + {params.group_by} \ + {params.sequences_per_group} \ + {params.exclude} \ + {params.other_filters} \ + --output-log {output.log} + """ + + +rule combine_samples: + input: + strains=lambda w: [ + f"{build_dir}/{w.build_name}/{sample}_strains.txt" + for sample in config["subsample"] + ], + sequences=build_dir + "/{build_name}/good_sequences.fasta", + metadata=build_dir + "/{build_name}/good_metadata.tsv", + include=config["include"], + output: + sequences=build_dir + "/{build_name}/filtered.fasta", + metadata=build_dir + "/{build_name}/metadata.tsv", + params: + strain_id=config["strain_id_field"], + shell: + """ + augur filter \ + --metadata-id-columns {params.strain_id} \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --exclude-all \ + --include {input.strains} {input.include}\ + --output-sequences {output.sequences} \ + --output-metadata {output.metadata} + """ + + +rule reverse_reverse_complements: + input: + metadata=build_dir + "/{build_name}/metadata.tsv", + sequences=build_dir + "/{build_name}/filtered.fasta", + output: + build_dir + "/{build_name}/reversed.fasta", + shell: + """ + python3 scripts/reverse_reversed_sequences.py \ + --metadata {input.metadata} \ + --sequences {input.sequences} \ + --output {output} + """ + + +rule align: + """ + Aligning sequences to {input.reference} + - filling gaps with N + """ + input: + sequences=build_dir + "/{build_name}/reversed.fasta", + reference=config["reference"], + genemap=config["genemap"], + output: + alignment=build_dir + "/{build_name}/aligned.fasta", + insertions=build_dir + "/{build_name}/insertions.fasta", + params: + max_indel=config["max_indel"], + seed_spacing=config["seed_spacing"], + threads: workflow.cores + shell: + """ + nextalign run \ + --jobs {threads} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --max-indel {params.max_indel} \ + --seed-spacing {params.seed_spacing} \ + --retry-reverse-complement \ + --output-fasta - \ + --output-insertions {output.insertions} \ + {input.sequences} | seqkit seq -i > {output.alignment} + """ + + +rule mask: + """ + Mask ends of the alignment: + - from start: {params.from_start} + - from end: {params.from_end} + """ + input: + sequences=build_dir + "/{build_name}/aligned.fasta", + mask=config["mask"]["maskfile"], + output: + build_dir + "/{build_name}/masked.fasta", + params: + from_start=config["mask"]["from_beginning"], + from_end=config["mask"]["from_end"], + shell: + """ + augur mask \ + --sequences {input.sequences} \ + --mask {input.mask} \ + --mask-from-beginning {params.from_start} \ + --mask-from-end {params.from_end} --output {output} + """ diff --git a/phylogenetic/scripts/deploy.py b/phylogenetic/scripts/deploy.py deleted file mode 100644 index 17cd9d36..00000000 --- a/phylogenetic/scripts/deploy.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -- Generate dated builds where each node has random id -- Deploy builds from staging to production - -Builds are downloaded from staging -This script is to be run independently from snakemake -""" - -import argparse -import datetime -import gzip -import json -import os -import uuid - - -def add_branch_id_recursive(node): - """ - Recursively add randomly generated id to each node in auspice json tree - """ - node["branch_attrs"]["labels"] = {} - node["branch_attrs"]["labels"]["id"] = str(uuid.uuid4())[:8] - if "children" in node: - for child in node["children"]: - add_branch_id_recursive(child) - - - -if __name__=="__main__": - parser = argparse.ArgumentParser( - description="Deploy builds from staging to production, generate dated builds where each node has random id", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - - parser.add_argument('--build-names', nargs='+', type=str, required=True, help="build names to upload") - parser.add_argument('-f','--force', action='store_true', help="force overwrite of existing dated builds") - parser.add_argument('--no-dated' , action='store_true', help="do not deploy dated build") - parser.add_argument('--staging' , action='store_true', help="deploy to staging") - parser.set_defaults(feature=True) - args = parser.parse_args() - - print(f"> Deploying builds {args.build_names} to {'staging' if args.staging else 'production'}") - print("----------------------------------------") - - if not os.path.isdir('staging'): - os.mkdir('staging') - for build_name in args.build_names: - if not args.staging: - print(f">> Deploying build {build_name} to production") - - # Upload basic builds to staging - for auspice_file in ['', '_root-sequence']: - os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json s3://nextstrain-data/mpox_{build_name}{auspice_file}.json") - print(f">> Uploaded {build_name} to production: https://nextstrain.org/staging/mpox/{build_name.replace('_', '/')}/") - - - if not args.no_dated: - # Check how many today dated builds exist - today = datetime.date.today().strftime("%Y-%m-%d") - os.system(f"aws s3 ls nextstrain-data/mpox_{build_name}_{today}.json > dated_builds.txt") - - with open('dated_builds.txt') as fh: - today_dated_builds_count = len(fh.readlines()) - os.remove('dated_builds.txt') - if today_dated_builds_count == 0 or args.force: - if today_dated_builds_count > 0: - print(f">> Overwriting existing dated build due to --force flag being present") - - for auspice_file in ['', '_root-sequence']: - os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json staging/") - - # Load auspice json - with gzip.open(f"staging/mpox_{build_name}.json") as fh: - auspice_json = json.load(fh) - - add_branch_id_recursive(auspice_json['tree']) - - with open(f"staging/mpox_{build_name}_{today}.json", 'wt') as fh: - json.dump(auspice_json, fh) - - os.system(f"aws s3 cp staging/mpox_{build_name}_{today}.json s3://nextstrain-data") - os.system(f"aws s3 cp s3://nextstrain-staging/mpox_{build_name}_root-sequence.json s3://nextstrain-data/mpox_{build_name}_{today}_root-sequence.json") - print(f">> Uploaded dated {build_name} to production: https://nextstrain.org/mpox/{build_name.replace('_', '/')}/{today}/") - - else: - print(f">> Warning: Dated {build_name} with date today already exists, skipping upload: https://nextstrain.org/mpox/{build_name.replace('_', '/')}/{today}/") - print(f">> Hint: Use the --f/--force flag to overwrite existing dated builds") - if args.staging: - print(f">> Deploying build {build_name} to staging") - for auspice_file in ['', '_root-sequence']: - os.system(f"aws s3 cp auspice/mpox_{build_name}{auspice_file}.json s3://nextstrain-staging/mpox_{build_name}{auspice_file}.json") - print(f">> Uploaded {build_name} to staging: https://nextstrain.org/staging/mpox/{build_name.replace('_', '/')}/") - - print("----------------------------------------") diff --git a/phylogenetic/workflow/snakemake_rules/core.smk b/phylogenetic/workflow/snakemake_rules/core.smk deleted file mode 100644 index 474f3a7b..00000000 --- a/phylogenetic/workflow/snakemake_rules/core.smk +++ /dev/null @@ -1,497 +0,0 @@ -""" -This part of the workflow expects input files - - sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv", - -and will produce output files as - - auspice_json = auspice_dir + "/mpox_{build_name}.json" - -Parameter are expected to sit in the `config` data structure. -In addition, `build_dir` and `auspice_dir` need to be defined upstream. -""" - - -rule filter: - """ - Removing strains that do not satisfy certain requirements. - """ - input: - sequences="data/sequences.fasta", - metadata="data/metadata.tsv", - output: - sequences=build_dir + "/{build_name}/good_sequences.fasta", - metadata=build_dir + "/{build_name}/good_metadata.tsv", - log=build_dir + "/{build_name}/good_filter.log", - params: - exclude=config["filter"]["exclude"], - min_date=config["filter"]["min_date"], - min_length=config["filter"]["min_length"], - strain_id=config["strain_id_field"], - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} \ - --exclude {params.exclude} \ - --min-date {params.min_date} \ - --min-length {params.min_length} \ - --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \ - --output-log {output.log} - """ - - -rule subsample: - input: - metadata=rules.filter.output.metadata, - output: - strains=build_dir + "/{build_name}/{sample}_strains.txt", - log=build_dir + "/{build_name}/{sample}_filter.log", - params: - group_by=lambda w: config["subsample"][w.sample]["group_by"], - sequences_per_group=lambda w: config["subsample"][w.sample][ - "sequences_per_group" - ], - other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""), - exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}" - if "exclude_lineages" in config["subsample"][w.sample] - else "", - strain_id=config["strain_id_field"], - shell: - """ - augur filter \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output-strains {output.strains} \ - {params.group_by} \ - {params.sequences_per_group} \ - {params.exclude} \ - {params.other_filters} \ - --output-log {output.log} - """ - - -rule combine_samples: - input: - strains=lambda w: [ - f"{build_dir}/{w.build_name}/{sample}_strains.txt" - for sample in config["subsample"] - ], - sequences=rules.filter.output.sequences, - metadata=rules.filter.output.metadata, - include=config["include"], - output: - sequences=build_dir + "/{build_name}/filtered.fasta", - metadata=build_dir + "/{build_name}/metadata.tsv", - params: - strain_id=config["strain_id_field"], - shell: - """ - augur filter \ - --metadata-id-columns {params.strain_id} \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --exclude-all \ - --include {input.strains} {input.include}\ - --output-sequences {output.sequences} \ - --output-metadata {output.metadata} - """ - - -rule reverse_reverse_complements: - input: - metadata=build_dir + "/{build_name}/metadata.tsv", - sequences=build_dir + "/{build_name}/filtered.fasta", - output: - build_dir + "/{build_name}/reversed.fasta", - shell: - """ - python3 scripts/reverse_reversed_sequences.py \ - --metadata {input.metadata} \ - --sequences {input.sequences} \ - --output {output} - """ - - -rule align: - """ - Aligning sequences to {input.reference} - - filling gaps with N - """ - input: - sequences=rules.reverse_reverse_complements.output, - reference=config["reference"], - genemap=config["genemap"], - output: - alignment=build_dir + "/{build_name}/aligned.fasta", - insertions=build_dir + "/{build_name}/insertions.fasta", - params: - max_indel=config["max_indel"], - seed_spacing=config["seed_spacing"], - threads: workflow.cores - shell: - """ - nextalign run \ - --jobs {threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --max-indel {params.max_indel} \ - --seed-spacing {params.seed_spacing} \ - --retry-reverse-complement \ - --output-fasta - \ - --output-insertions {output.insertions} \ - {input.sequences} | seqkit seq -i > {output.alignment} - """ - - -rule mask: - """ - Mask ends of the alignment: - - from start: {params.from_start} - - from end: {params.from_end} - """ - input: - sequences=build_dir + "/{build_name}/aligned.fasta", - mask=config["mask"]["maskfile"], - output: - build_dir + "/{build_name}/masked.fasta", - params: - from_start=config["mask"]["from_beginning"], - from_end=config["mask"]["from_end"], - shell: - """ - augur mask \ - --sequences {input.sequences} \ - --mask {input.mask} \ - --mask-from-beginning {params.from_start} \ - --mask-from-end {params.from_end} --output {output} - """ - - -rule tree: - """ - Building tree - """ - input: - alignment=build_dir + "/{build_name}/masked.fasta", - tree_mask=config["tree_mask"], - output: - tree=build_dir + "/{build_name}/tree_raw.nwk", - threads: workflow.cores - shell: - """ - augur tree \ - --alignment {input.alignment} \ - --exclude-sites {input.tree_mask} \ - --tree-builder-args="-redo" \ - --output {output.tree} \ - --nthreads {threads} - """ - - -rule fix_tree: - """ - Fixing tree - """ - input: - tree=rules.tree.output.tree, - alignment=build_dir + "/{build_name}/masked.fasta", - output: - tree=build_dir + "/{build_name}/tree_fixed.nwk", - params: - root=lambda w: config.get("treefix_root", ""), - shell: - """ - python3 scripts/fix_tree.py \ - --alignment {input.alignment} \ - --input-tree {input.tree} \ - {params.root} \ - --output {output.tree} - """ - - -rule refine: - """ - Refining tree - - estimate timetree - - use {params.coalescent} coalescent timescale - - estimate {params.date_inference} node dates - - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation - """ - input: - tree=rules.fix_tree.output.tree - if config["fix_tree"] - else rules.tree.output.tree, - alignment=build_dir + "/{build_name}/masked.fasta", - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - tree=build_dir + "/{build_name}/tree.nwk", - node_data=build_dir + "/{build_name}/branch_lengths.json", - params: - coalescent="opt", - date_inference="marginal", - clock_filter_iqd=0, - root=config["root"], - clock_rate=f"--clock-rate {config['clock_rate']}" - if "clock_rate" in config - else "", - clock_std_dev=f"--clock-std-dev {config['clock_std_dev']}" - if "clock_std_dev" in config - else "", - strain_id=config["strain_id_field"], - divergence_units=config["divergence_units"], - shell: - """ - augur refine \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output-tree {output.tree} \ - --timetree \ - --root {params.root} \ - --precision 3 \ - --keep-polytomies \ - --use-fft \ - {params.clock_rate} \ - {params.clock_std_dev} \ - --output-node-data {output.node_data} \ - --coalescent {params.coalescent} \ - --date-inference {params.date_inference} \ - --date-confidence \ - --divergence-units {params.divergence_units} \ - --clock-filter-iqd {params.clock_filter_iqd} - """ - - -rule ancestral: - """ - Reconstructing ancestral sequences and mutations - """ - input: - tree=rules.refine.output.tree, - alignment=build_dir + "/{build_name}/masked.fasta", - output: - node_data=build_dir + "/{build_name}/nt_muts.json", - params: - inference="joint", - shell: - """ - augur ancestral \ - --tree {input.tree} \ - --alignment {input.alignment} \ - --output-node-data {output.node_data} \ - --inference {params.inference} - """ - - -rule translate: - """ - Translating amino acid sequences - """ - input: - tree=rules.refine.output.tree, - node_data=rules.ancestral.output.node_data, - genemap=config["genemap"], - output: - node_data=build_dir + "/{build_name}/aa_muts.json", - shell: - """ - augur translate \ - --tree {input.tree} \ - --ancestral-sequences {input.node_data} \ - --reference-sequence {input.genemap} \ - --output {output.node_data} - """ - - -rule traits: - """ - Inferring ancestral traits for {params.columns!s} - - increase uncertainty of reconstruction by {params.sampling_bias_correction} to partially account for sampling bias - """ - input: - tree=rules.refine.output.tree, - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - node_data=build_dir + "/{build_name}/traits.json", - params: - columns="country", - sampling_bias_correction=3, - strain_id=config["strain_id_field"], - shell: - """ - augur traits \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output {output.node_data} \ - --columns {params.columns} \ - --confidence \ - --sampling-bias-correction {params.sampling_bias_correction} - """ - - -rule clades: - """ - Adding internal clade labels - """ - input: - tree=rules.refine.output.tree, - aa_muts=rules.translate.output.node_data, - nuc_muts=rules.ancestral.output.node_data, - clades=config["clades"], - output: - node_data=build_dir + "/{build_name}/clades_raw.json", - log: - "logs/clades_{build_name}.txt", - shell: - """ - augur clades \ - --tree {input.tree} \ - --mutations {input.nuc_muts} {input.aa_muts} \ - --clades {input.clades} \ - --output-node-data {output.node_data} 2>&1 | tee {log} - """ - - -rule rename_clades: - input: - rules.clades.output.node_data, - output: - node_data=build_dir + "/{build_name}/clades.json", - shell: - """ - python scripts/clades_renaming.py \ - --input-node-data {input} \ - --output-node-data {output.node_data} - """ - - -rule mutation_context: - input: - tree=rules.refine.output.tree, - node_data=build_dir + "/{build_name}/nt_muts.json", - output: - node_data=build_dir + "/{build_name}/mutation_context.json", - shell: - """ - python3 scripts/mutation_context.py \ - --tree {input.tree} \ - --mutations {input.node_data} \ - --output {output.node_data} - """ - - -rule remove_time: - input: - "results/{build_name}/branch_lengths.json", - output: - "results/{build_name}/branch_lengths_no_time.json", - shell: - """ - python3 scripts/remove_timeinfo.py --input-node-data {input} --output-node-data {output} - """ - - -rule recency: - """ - Use metadata on submission date to construct submission recency field - """ - input: - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - node_data=build_dir + "/{build_name}/recency.json", - params: - strain_id=config["strain_id_field"], - shell: - """ - python3 scripts/construct-recency-from-submission-date.py \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --output {output} 2>&1 - """ - - -rule colors: - input: - ordering="config/color_ordering.tsv", - color_schemes="config/color_schemes.tsv", - metadata=build_dir + "/{build_name}/metadata.tsv", - output: - colors=build_dir + "/{build_name}/colors.tsv", - shell: - """ - python3 scripts/assign-colors.py \ - --ordering {input.ordering} \ - --color-schemes {input.color_schemes} \ - --output {output.colors} \ - --metadata {input.metadata} 2>&1 - """ - - -rule export: - """ - Exporting data files for auspice - """ - input: - tree=rules.refine.output.tree, - metadata=build_dir + "/{build_name}/metadata.tsv", - branch_lengths="results/{build_name}/branch_lengths.json" - if config.get("timetree", False) - else "results/{build_name}/branch_lengths_no_time.json", - traits=rules.traits.output.node_data, - nt_muts=rules.ancestral.output.node_data, - aa_muts=rules.translate.output.node_data, - clades=build_dir + "/{build_name}/clades.json", - mutation_context=rules.mutation_context.output.node_data, - recency=rules.recency.output.node_data if config.get("recency", False) else [], - colors=rules.colors.output.colors, - lat_longs=config["lat_longs"], - description=config["description"], - auspice_config=config["auspice_config"], - output: - auspice_json=build_dir + "/{build_name}/raw_tree.json", - root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json", - params: - strain_id=config["strain_id_field"], - shell: - """ - augur export v2 \ - --tree {input.tree} \ - --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.mutation_context} {input.clades} {input.recency}\ - --colors {input.colors} \ - --lat-longs {input.lat_longs} \ - --description {input.description} \ - --auspice-config {input.auspice_config} \ - --include-root-sequence \ - --output {output.auspice_json} - """ - - -rule final_strain_name: - input: - auspice_json=build_dir + "/{build_name}/raw_tree.json", - metadata=build_dir + "/{build_name}/metadata.tsv", - root_sequence=build_dir + "/{build_name}/raw_tree_root-sequence.json", - output: - auspice_json=build_dir + "/{build_name}/tree.json", - root_sequence=build_dir + "/{build_name}/tree_root-sequence.json", - params: - strain_id=config["strain_id_field"], - display_strain_field=config.get("display_strain_field", "strain"), - shell: - """ - python3 scripts/set_final_strain_name.py --metadata {input.metadata} \ - --metadata-id-columns {params.strain_id} \ - --input-auspice-json {input.auspice_json} \ - --display-strain-name {params.display_strain_field} \ - --output {output.auspice_json} - cp {input.root_sequence} {output.root_sequence} - """ diff --git a/phylogenetic/workflow/snakemake_rules/download_via_lapis.smk b/phylogenetic/workflow/snakemake_rules/download_via_lapis.smk deleted file mode 100644 index 55c7ae9f..00000000 --- a/phylogenetic/workflow/snakemake_rules/download_via_lapis.smk +++ /dev/null @@ -1,18 +0,0 @@ -rule download_sequences_via_lapis: - output: - sequences="data/sequences.fasta", - shell: - """ - curl https://mpox-lapis.genspectrum.org/v1/sample/fasta --output {output.sequences} - """ - - -rule download_metadata_via_lapis: - output: - metadata="data/metadata.tsv", - shell: - """ - curl https://mpox-lapis.genspectrum.org/v1/sample/details?dataFormat=csv | \ - tr -d "\r" | - sed -E 's/("([^"]*)")?,/\\2\\t/g' > {output.metadata} - """ diff --git a/phylogenetic/workflow/snakemake_rules/prepare.smk b/phylogenetic/workflow/snakemake_rules/prepare.smk deleted file mode 100644 index 60f65d69..00000000 --- a/phylogenetic/workflow/snakemake_rules/prepare.smk +++ /dev/null @@ -1,32 +0,0 @@ -rule download: - """ - Downloading sequences and metadata from data.nextstrain.org - """ - output: - sequences="data/sequences.fasta.xz", - metadata="data/metadata.tsv.gz", - params: - sequences_url="https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.xz", - metadata_url="https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.gz", - shell: - """ - curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} - curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} - """ - - -rule decompress: - """ - Decompressing sequences and metadata - """ - input: - sequences="data/sequences.fasta.xz", - metadata="data/metadata.tsv.gz", - output: - sequences="data/sequences.fasta", - metadata="data/metadata.tsv", - shell: - """ - gzip --decompress --keep {input.metadata} - xz --decompress --keep {input.sequences} - """