From 46acdd78fef714ca6b6a05bd76f2d073852b0645 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Wed, 31 Jul 2024 20:48:06 -0700 Subject: [PATCH] Add `nextclade` workflow [#2] --- .gitignore | 7 + .markdownlintrc | 9 ++ nextclade/README.md | 29 ++++ nextclade/Snakefile | 30 ++++ nextclade/defaults/auspice_config.json | 56 ++++++++ nextclade/defaults/clades.tsv | 40 ++++++ nextclade/defaults/colors.tsv | 8 ++ nextclade/defaults/config.yaml | 20 +++ nextclade/defaults/genome_annotation.gff3 | 5 + nextclade/defaults/include_strains.txt | 136 ++++++++++++++++++ .../defaults/nextclade-dataset/CHANGELOG.md | 3 + .../defaults/nextclade-dataset/README.md | 47 ++++++ .../defaults/nextclade-dataset/pathogen.json | 52 +++++++ nextclade/defaults/reference.fasta | 13 ++ nextclade/rules/annotate_phylogeny.smk | 82 +++++++++++ nextclade/rules/assemble_dataset.smk | 71 +++++++++ nextclade/rules/construct_phylogeny.smk | 67 +++++++++ nextclade/rules/export.smk | 44 ++++++ nextclade/rules/prepare_sequences.smk | 64 +++++++++ 19 files changed, 783 insertions(+) create mode 100644 .markdownlintrc create mode 100644 nextclade/README.md create mode 100644 nextclade/Snakefile create mode 100644 nextclade/defaults/auspice_config.json create mode 100644 nextclade/defaults/clades.tsv create mode 100644 nextclade/defaults/colors.tsv create mode 100644 nextclade/defaults/config.yaml create mode 100644 nextclade/defaults/genome_annotation.gff3 create mode 100644 nextclade/defaults/include_strains.txt create mode 100644 nextclade/defaults/nextclade-dataset/CHANGELOG.md create mode 100644 nextclade/defaults/nextclade-dataset/README.md create mode 100644 nextclade/defaults/nextclade-dataset/pathogen.json create mode 100644 nextclade/defaults/reference.fasta create mode 100644 nextclade/rules/annotate_phylogeny.smk create mode 100644 nextclade/rules/assemble_dataset.smk create mode 100644 nextclade/rules/construct_phylogeny.smk create mode 100644 nextclade/rules/export.smk create mode 100644 nextclade/rules/prepare_sequences.smk diff --git a/.gitignore b/.gitignore index b943590..e8a7bfe 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,13 @@ ingest/benchmarks ingest/data ingest/logs ingest/results +nextclade/auspice +nextclade/benchmarks +nextclade/data +nextclade/dataset +nextclade/logs +nextclade/results +nextclade/test_output phylogenetic/auspice phylogenetic/benchmarks phylogenetic/logs diff --git a/.markdownlintrc b/.markdownlintrc new file mode 100644 index 0000000..17fb483 --- /dev/null +++ b/.markdownlintrc @@ -0,0 +1,9 @@ +{ + // long lines are okay + "MD013":{ + "line_length": 100, + "tables": false + }, + // don't require top-level heading on L1 + "MD041": false +} diff --git a/nextclade/README.md b/nextclade/README.md new file mode 100644 index 0000000..0fe50bf --- /dev/null +++ b/nextclade/README.md @@ -0,0 +1,29 @@ +# Yellow Fever Virus Nextclade Dataset Tree + +This workflow creates a phylogenetic tree that can be used as part of +a Nextclade dataset to assign genotypes to yellow fever virus samples +based on [Mutebi et al.][] (J Virol. 2001 Aug;75(15):6999-7008) and +[Bryant et al.][] (PLoS Pathog. 2007 May 18;3(5):e75). + +* Build a tree using samples from the `ingest` output, with the following + sampling criteria: + * Force-include the following samples: + * genotype reference strains from the 2 papers cited above +* Assign genotypes to each sample and internal nodes of the tree with + `augur clades`, using clade-defining mutations in `defaults/clades.tsv` +* Provide the following coloring options on the tree: + * Genotype assignment from `augur clades` + +## How to create a new tree + +* Run the workflow: `nextstrain build .` +* Inspect the output tree by comparing genotype assignments from the following sources: + * `augur clades` output +* If unwanted samples are present in the tree, add them to + `defaults/dropped_strains.tsv` and re-run the workflow +* If any changes are needed to the clade-defining mutations, add + changes to `defaults/clades.tsv` and re-run the workflow +* Repeat as needed + +[Mutebi et al.]: https://pubmed.ncbi.nlm.nih.gov/11435580/ +[Bryant et al.]: https://pubmed.ncbi.nlm.nih.gov/17511518/ diff --git a/nextclade/Snakefile b/nextclade/Snakefile new file mode 100644 index 0000000..a6618eb --- /dev/null +++ b/nextclade/Snakefile @@ -0,0 +1,30 @@ +configfile: "defaults/config.yaml" + +rule all: + input: + auspice_json = config["files"]["auspice_json"], + nextclade_dataset = "dataset/tree.json", + test_dataset = "test_output", + +include: "rules/prepare_sequences.smk" +include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" +include: "rules/export.smk" +include: "rules/assemble_dataset.smk" + +rule clean: + params: + targets = [ + ".snakemake", + "auspice", + "benchmarks", + "data", + "dataset", + "logs", + "results", + "test_output", + ] + shell: + """ + rm -rfv {params.targets} + """ diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json new file mode 100644 index 0000000..142d5ea --- /dev/null +++ b/nextclade/defaults/auspice_config.json @@ -0,0 +1,56 @@ +{ + "title": "Real-time tracking of yellow fever virus full genome virus evolution", + "maintainers": [ + {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"}, + {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"} + ], + "data_provenance": [ + { + "name": "GenBank", + "url": "https://www.ncbi.nlm.nih.gov/genbank/" + } + ], + "build_url": "https://github.com/nextstrain/yellow-fever", + "colorings": [ + { + "key": "gt", + "title": "Genotype", + "type": "categorical" + }, + { + "key": "num_date", + "title": "Date", + "type": "continuous" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + } + ], + "geo_resolutions": [ + "country", + "region" + ], + "display_defaults": { + "map_triplicate": true, + "color_by": "region" + }, + "filters": [ + "clade", + "region", + "country", + "author", + "host" + ] +} diff --git a/nextclade/defaults/clades.tsv b/nextclade/defaults/clades.tsv new file mode 100644 index 0000000..44f98c1 --- /dev/null +++ b/nextclade/defaults/clades.tsv @@ -0,0 +1,40 @@ +clade gene site alt +Angola nuc 72 A +Angola nuc 81 G +Angola nuc 88 C +Angola nuc 90 A +Angola nuc 99 T +Angola nuc 111 G +Angola nuc 219 T +Angola nuc 240 C +Angola nuc 246 A +Angola nuc 252 A +Angola nuc 255 A +Angola nuc 291 G +Angola nuc 294 A +Angola nuc 300 A +Angola nuc 315 G +Angola nuc 327 G +Angola nuc 372 A +Angola nuc 420 A +Angola nuc 432 A +Angola nuc 453 T +Angola nuc 492 G +Angola nuc 651 T +East Africa nuc 45 A +East Africa nuc 171 G +East Africa nuc 438 G +East Africa nuc 468 T +East/Central Africa nuc 228 G +West Africa I nuc 183 G +West Africa I nuc 255 C +West Africa II nuc 93 T +West Africa II nuc 270 A +West Africa II nuc 321 T +West Africa II nuc 477 A +South America I nuc 219 A +South America I nuc 532 A +South America II nuc 114 C +South America II nuc 193 T +South America II nuc 249 A +South America II nuc 639 G diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv new file mode 100644 index 0000000..e0b687b --- /dev/null +++ b/nextclade/defaults/colors.tsv @@ -0,0 +1,8 @@ +# genotypes assigned by augur clades +clade_membership Angola #FCF007 +clade_membership East Africa #4B26B1 +clade_membership East/Central Africa #E307FC +clade_membership West Africa I #2CFC07 +clade_membership West Africa II #9EFC07 +clade_membership South America I #996633 +clade_membership South America II #FC0740 diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml new file mode 100644 index 0000000..a412aa3 --- /dev/null +++ b/nextclade/defaults/config.yaml @@ -0,0 +1,20 @@ +files: + auspice_config: "defaults/auspice_config.json" + auspice_json: "auspice/tree.json" + clades: "defaults/clades.tsv" + colors: "defaults/colors.tsv" + include: "defaults/include_strains.txt" + reference_prM-E_fasta: "defaults/reference.fasta" + reference_prM-E_gff: "defaults/genome_annotation.gff3" +strain_id_field: "accession" +align_and_extract_prM-E: + min_length: 500 + min_seed_cover: 0.01 +refine: + coalescent: "opt" + date_inference: "marginal" + clock_filter_iqd: 4 +ancestral: + inference: "joint" +export: + metadata_columns: "strain division location region year host" diff --git a/nextclade/defaults/genome_annotation.gff3 b/nextclade/defaults/genome_annotation.gff3 new file mode 100644 index 0000000..60c279d --- /dev/null +++ b/nextclade/defaults/genome_annotation.gff3 @@ -0,0 +1,5 @@ +##sequence-region prM-E 1 672 +NC_002031.1 feature source 1 672 . + . gene=nuc +NC_002031.1 feature gene 1 333 . + . gene_name=prM +NC_002031.1 feature gene 109 333 . + . gene_name=M +NC_002031.1 feature gene 334 672 . + . gene_name=E diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt new file mode 100644 index 0000000..52ef94f --- /dev/null +++ b/nextclade/defaults/include_strains.txt @@ -0,0 +1,136 @@ +# Extracted from tables and figures in Mutebi et al. (J Virol. 2001 +# Aug;75(15):6999-7008) and Bryant et al. (PLoS Pathog. 2007 May +# 18;3(5):e75) +AF369669 +AF369670 +AF369671 +AY540431 +AY540432 +AY540433 +AY540434 +AY540435 +U52390 +AY540437 +AY540438 +AY540439 +AY540440 +AY540441 +AY540442 +AY540443 +AY540444 +AY540445 +AY540446 +AY540447 +AY540448 +AY540449 +AY540450 +AY540451 +AY540452 +AY540453 +U23570 +AY540454 +AY540455 +AY540456 +AY540457 +AY540458 +AY540459 +AY540460 +AY540461 +AY540462 +AY540463 +AY540464 +AY540465 +AY540466 +AY540467 +AY540468 +AY540469 +AY540470 +AY540471 +AY540472 +AY540473 +AY540436 +U52392 +U52395 +AF369672 +AF369673 +AY540475 +AY540476 +AY540474 +U52399 +AY540477 +AY540478 +AF369674 +AF369675 +AY572535 +AY640589 +AF369686 +U54798 +AY603338 +AF369676 +U52403 +AF369677 +AF369678 +AF368679 +AF369680 +AF369681 +AF369682 +AF369683 +AF369684 +AF369685 +AY540479 +AY540480 +AY161927 +AY161928 +AY161929 +AY161930 +AY161931 +U52411 +AY161933 +AY161934 +AY161935 +U52405 +U52407 +AY161938 +AY161939 +AY161940 +AY161941 +AY161942 +AY161943 +AY161944 +AY161945 +AY161946 +AY161947 +AY161948 +AY161949 +AY161950 +AY161951 +GI694115 +U89338 +AF369687 +AF369688 +U52413 +AF369689 +AF369690 +AF369691 +AF369692 +AF369693 +AY690831 +AY690832 +AY690833 +DQ872411 +DQ872412 +AY540481 +AY540482 +AY540483 +AY540484 +AY540485 +AY540486 +AF369694 +U52422 +AF369695 +AF369696 +AY540487 +AY540488 +AY540489 +AY540490 +AF369697 diff --git a/nextclade/defaults/nextclade-dataset/CHANGELOG.md b/nextclade/defaults/nextclade-dataset/CHANGELOG.md new file mode 100644 index 0000000..3b67234 --- /dev/null +++ b/nextclade/defaults/nextclade-dataset/CHANGELOG.md @@ -0,0 +1,3 @@ +## Unreleased + +Initial release of yellow fever virus dataset. diff --git a/nextclade/defaults/nextclade-dataset/README.md b/nextclade/defaults/nextclade-dataset/README.md new file mode 100644 index 0000000..409611a --- /dev/null +++ b/nextclade/defaults/nextclade-dataset/README.md @@ -0,0 +1,47 @@ +# Yellow fever virus dataset + +| Key | Value | +| ----------------- | -----------------------------------------------------------------| +| name | Yellow fever virus (YFV) prM-E region | +| authors | [Nextstrain](https://nextstrain.org) | +| reference | AY640589.1 | +| workflow | | +| path | `nextstrain/yellow-fever/prM-E` | + +## Scope of this dataset + +This dataset assigns genotypes to yellow fever virus samples based on +strain and genotype information from [Mutebi et al.][] (J Virol. 2001 +Aug;75(15):6999-7008) and [Bryant et al.][] (PLoS Pathog. 2007 May 18;3(5):e75) + +These two papers, collectively, define 7 distinct yellow fever virus +genotypes based on a 670 nucleotide region of the yellow fever virus +genome, (bases 641-1310), called the prM-E region. This region +comprises the 3' end of the pre-membrane protein (prM) gene, the +entire membrane protein (M) gene, and the 5' end of the envelope +protein (E) gene. + +(N.b., the reference sequence used in this data set is actually 672nt +long, from bases 641-1312 of the genome reference. The 2 extra bases +make the reference an complete open reading frame.) + +This dataset can be used to assign genotypes to any sequence that +includes at least 500 bp of the prM-E region, including whole genome +sequences. Sequence data beyond the prM-E region will be reported as an +insertion in the Nextclade output. + +## Features + +This dataset supports: + +- Assignment of genotypes +- Phylogenetic placement +- Sequence quality control (QC) + +## What are Nextclade datasets + +Read more about Nextclade datasets in the Nextclade documentation: + + +[Mutebi et al.]: https://pubmed.ncbi.nlm.nih.gov/11435580/ +[Bryant et al.]: https://pubmed.ncbi.nlm.nih.gov/17511518/ diff --git a/nextclade/defaults/nextclade-dataset/pathogen.json b/nextclade/defaults/nextclade-dataset/pathogen.json new file mode 100644 index 0000000..92213e6 --- /dev/null +++ b/nextclade/defaults/nextclade-dataset/pathogen.json @@ -0,0 +1,52 @@ +{ + "files": { + "reference": "reference.fasta", + "pathogenJson": "pathogen.json", + "genomeAnnotation": "genome_annotation.gff3", + "treeJson": "tree.json", + "examples": "sequences.fasta", + "readme": "README.md", + "changelog": "CHANGELOG.md" + }, + "attributes": { + "name": "Yellow fever virus (YFV) prM-E region", + "reference name": "Asibi", + "reference accession": "AY640589.1" + }, + "schemaVersion": "3.0.0", + "alignmentParams": { + "minSeedCover": 0.01, + "minLength": 500 + }, + "qc": { + "missingData": { + "enabled": true, + "missingDataThreshold": 20, + "scoreBias": 4 + }, + "mixedSites": { + "enabled": true, + "mixedSitesThreshold": 4 + }, + "frameShifts": { + "enabled": true + }, + "stopCodons": { + "enabled": true + }, + "privateMutations": { + "enabled": true, + "cutoff": 8, + "typical": 2, + "weightLabeledSubstitutions": 1, + "weightReversionSubstitutions": 1, + "weightUnlabeledSubstitutions": 1 + }, + "snpClusters": { + "enabled": true, + "clusterCutOff": 3, + "scoreWeight": 50, + "windowSize": 50 + } + } +} diff --git a/nextclade/defaults/reference.fasta b/nextclade/defaults/reference.fasta new file mode 100644 index 0000000..1b51ee3 --- /dev/null +++ b/nextclade/defaults/reference.fasta @@ -0,0 +1,13 @@ +> prM-E region (genome 641-1312, 672 nt) +CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC +GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG +CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG +GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGCTCGTGAGGAACCCCTTTTTTGCAGTG +ACAGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC +CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG +GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG +TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC +ATTGATGGACCTGCTGAGGCGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG +ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC +AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG +AAAGGGAGCATT diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk new file mode 100644 index 0000000..a33745f --- /dev/null +++ b/nextclade/rules/annotate_phylogeny.smk @@ -0,0 +1,82 @@ +""" +This part of the workflow creates additonal annotations for the phylogenetic tree. +""" + +rule ancestral: + message: + """ + Reconstructing ancestral sequences and mutations + """ + input: + alignment = "results/aligned.fasta", + tree = "results/tree.nwk", + output: + node_data = "results/nt_muts.json", + params: + inference = config["ancestral"]["inference"], + reference_fasta = config["files"]["reference_prM-E_fasta"], + log: + "logs/ancestral.txt", + benchmark: + "benchmarks/ancestral.txt", + shell: + """ + augur ancestral \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --output-node-data {output.node_data} \ + --inference {params.inference} \ + --root-sequence {params.reference_fasta} \ + &> {log:q} + """ + +rule translate: + message: + """ + Translating amino acid sequences + """ + input: + tree = "results/tree.nwk", + node_data = "results/nt_muts.json", + reference = config["files"]["reference_prM-E_gff"], + output: + node_data = "results/aa_muts.json" + log: + "logs/translate.txt", + benchmark: + "benchmarks/translate.txt", + shell: + """ + augur translate \ + --tree {input.tree:q} \ + --ancestral-sequences {input.node_data:q} \ + --reference-sequence {input.reference:q} \ + --output {output.node_data:q} \ + &> {log:q} + """ + +rule clades: + message: + """ + Running augur clades + """ + input: + tree = "results/tree.nwk", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + clade_defs = config["files"]["clades"] + output: + clades = "results/clades.json" + log: + "logs/clades.txt", + benchmark: + "benchmarks/clades.txt", + shell: + """ + augur clades \ + --tree {input.tree:q} \ + --mutations {input.nt_muts} {input.aa_muts} \ + --clades {input.clade_defs:q} \ + --output {output.clades:q} \ + &> {log:q} + """ diff --git a/nextclade/rules/assemble_dataset.smk b/nextclade/rules/assemble_dataset.smk new file mode 100644 index 0000000..9f00848 --- /dev/null +++ b/nextclade/rules/assemble_dataset.smk @@ -0,0 +1,71 @@ +""" +This part of the workflow assembles a nextclade dataset. +""" +rule assemble_dataset: + message: + """ + Copying outputs into dataset/ + """ + input: + reference_fasta="defaults/reference.fasta", + tree=config["files"]["auspice_json"], + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + sequences = "../ingest/results/sequences.fasta", + annotation="defaults/genome_annotation.gff3", + pathogen_json="defaults/nextclade-dataset/pathogen.json", + readme="defaults/nextclade-dataset/README.md", + changelog="defaults/nextclade-dataset/CHANGELOG.md", + output: + reference_fasta="dataset/reference.fasta", + tree="dataset/tree.json", + pathogen_json="dataset/pathogen.json", + sequences="dataset/sequences.fasta", + annotation="dataset/genome_annotation.gff3", + readme="dataset/README.md", + changelog="dataset/CHANGELOG.md", + log: + "logs/assemble_dataset.txt", + benchmark: + "benchmarks/assemble_dataset.txt", + shell: + """ + ( + cp -v {input.reference_fasta:q} {output.reference_fasta:q} + cp -v {input.tree:q} {output.tree:q} + cp -v {input.pathogen_json:q} {output.pathogen_json:q} + cp -v {input.annotation:q} {output.annotation:q} + cp -v {input.readme:q} {output.readme:q} + cp -v {input.changelog:q} {output.changelog:q} + cp -v {input.sequences:q} {output.sequences:q} + ) &> {log:q} + """ + +rule test_dataset: + input: + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + sequences = "../ingest/results/sequences.fasta", + # this isn't used by the command below, but it is included as + # an input to force the preceding rule to finish running + # before this one starts + tree="dataset/tree.json", + output: + outdir=directory("test_output/"), + params: + dataset_dir="dataset", + log: + "logs/test_dataset.txt", + benchmark: + "benchmarks/test_dataset.txt", + shell: + """ + nextclade run \ + --input-dataset {params.dataset_dir:q} \ + --output-all {output.outdir:q} \ + --silent \ + {input.sequences:q} \ + >& {log:q} + """ diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk new file mode 100644 index 0000000..58341f2 --- /dev/null +++ b/nextclade/rules/construct_phylogeny.smk @@ -0,0 +1,67 @@ +""" +This part of the workflow constructs the phylogenetic tree. +""" +rule tree: + message: + """ + Building tree + """ + input: + alignment = "results/aligned.fasta", + output: + tree = "results/tree_raw.nwk", + log: + "logs/tree.txt", + benchmark: + "benchmarks/tree.txt", + shell: + """ + augur tree \ + --alignment {input.alignment} \ + --output {output.tree} + &> {log:q} + """ + +rule refine: + message: + """ + Refining tree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation + """ + input: + alignment = "results/aligned.fasta", + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + tree = "results/tree_raw.nwk", + output: + node_data = "results/branch_lengths.json", + tree = "results/tree.nwk", + params: + strain_id = config["strain_id_field"], + coalescent = config["refine"]["coalescent"], + date_inference = config["refine"]["date_inference"], + clock_filter_iqd = config["refine"]["clock_filter_iqd"], + log: + "logs/refine.txt", + benchmark: + "benchmarks/refine.txt", + shell: + """ + augur refine \ + --tree {input.tree:q} \ + --alignment {input.alignment:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --output-tree {output.tree:q} \ + --output-node-data {output.node_data:q} \ + --coalescent {params.coalescent:q} \ + --date-confidence \ + --date-inference {params.date_inference:q} \ + --clock-filter-iqd {params.clock_filter_iqd:q} \ + --stochastic-resolve + &> {log:q} + """ diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk new file mode 100644 index 0000000..ac254c8 --- /dev/null +++ b/nextclade/rules/export.smk @@ -0,0 +1,44 @@ +""" +This part of the workflow collects the phylogenetic tree and +annotations to export a Nextstrain dataset. +""" +rule export: + message: + """ + Exporting data files for auspice + """ + input: + tree = "results/tree.nwk", + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + branch_lengths = "results/branch_lengths.json", + clades = "results/clades.json", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + colors = config["files"]["colors"], + auspice_config = config["files"]["auspice_config"], + output: + auspice_json = config["files"]["auspice_json"], + params: + strain_id = config["strain_id_field"], + metadata_columns = config["export"]["metadata_columns"], + log: + "logs/export.txt", + benchmark: + "benchmarks/export.txt", + shell: + """ + augur export v2 \ + --tree {input.tree:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.clades} \ + --colors {input.colors:q} \ + --metadata-columns {params.metadata_columns} \ + --auspice-config {input.auspice_config:q} \ + --include-root-sequence-inline \ + --output {output.auspice_json:q} \ + &> {log:q} + """ diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk new file mode 100644 index 0000000..8beca2f --- /dev/null +++ b/nextclade/rules/prepare_sequences.smk @@ -0,0 +1,64 @@ +""" +This part of the workflow prepares sequences for constructing the +phylogenetic tree. +""" +rule align_and_extract_prM_E: + input: + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + sequences = "../ingest/results/sequences.fasta", + reference = config["files"]["reference_prM-E_fasta"], + output: + sequences = "results/sequences.fasta", + params: + min_length = config['align_and_extract_prM-E']['min_length'], + min_seed_cover = config['align_and_extract_prM-E']['min_seed_cover'], + log: + "logs/align_and_extract_prM_E.txt", + benchmark: + "benchmarks/align_and_extract_prM_E.txt", + threads: workflow.cores + shell: + """ + nextclade3 run \ + --jobs {threads:q} \ + --input-ref {input.reference:q} \ + --output-fasta {output.sequences:q} \ + --min-seed-cover {params.min_seed_cover:q} \ + --min-length {params.min_length:q} \ + --silent \ + {input.sequences:q} \ + &> {log:q} + """ + +rule filter: + message: """ + Filtering to defined set of strains with known genotypes. + """ + input: + include = config["files"]["include"], + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + sequences = "results/sequences.fasta", + output: + sequences = "results/aligned.fasta", + params: + strain_id = config["strain_id_field"], + log: + "logs/filter.txt", + benchmark: + "benchmarks/filter.txt" + shell: + """ + augur filter \ + --sequences {input.sequences:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --exclude-all \ + --include {input.include:q} \ + --output {output.sequences:q} \ + &> {log:q} + """