diff --git a/.gitignore b/.gitignore index b943590..09c6411 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,11 @@ ingest/benchmarks ingest/data ingest/logs ingest/results +nextclade/auspice +nextclade/benchmarks +nextclade/data +nextclade/logs +nextclade/results phylogenetic/auspice phylogenetic/benchmarks phylogenetic/logs diff --git a/nextclade/README.md b/nextclade/README.md new file mode 100644 index 0000000..ac2853a --- /dev/null +++ b/nextclade/README.md @@ -0,0 +1,26 @@ +# Yellow Fever Virus Nextclade Dataset Tree + +This workflow creates a phylogenetic tree that can be used as part of +a Nextclade dataset to assign genotypes to yellow fever virus samples based on +FIXME reference to those two papers goes here. + +* Build a tree using samples from the `ingest` output, with the following + sampling criteria: + * Force-include the following samples: + * genotype reference strains +* Assign genotypes to each sample and internal nodes of the tree with + `augur clades`, using clade-defining mutations in + `defaults/clades.tsv` +* Provide the following coloring options on the tree: + * Genotype assignment from `augur clades` + +## How to create a new tree + +* Run the workflow: `nextstrain build .` +* Inspect the output tree by comparing genotype assignments from the following sources: + * `augur clades` output +* If unwanted samples are present in the tree, add them to + `defaults/dropped_strains.tsv` and re-run the workflow +* If any changes are needed to the clade-defining mutations, add + changes to `defaults/clades.tsv` and re-run the workflow +* Repeat as needed diff --git a/nextclade/Snakefile b/nextclade/Snakefile new file mode 100644 index 0000000..760cccc --- /dev/null +++ b/nextclade/Snakefile @@ -0,0 +1,25 @@ +configfile: "defaults/config.yaml" + +rule all: + input: + auspice_json = config["files"]["auspice_json"], + +include: "rules/prepare_sequences.smk" +include: "rules/construct_phylogeny.smk" +include: "rules/annotate_phylogeny.smk" +include: "rules/export.smk" + +rule clean: + params: + targets = [ + ".snakemake", + "auspice", + "benchmarks", + "data", + "logs", + "results", + ] + shell: + """ + rm -rfv {params.targets} + """ diff --git a/nextclade/defaults/auspice_config.json b/nextclade/defaults/auspice_config.json new file mode 100644 index 0000000..3843edd --- /dev/null +++ b/nextclade/defaults/auspice_config.json @@ -0,0 +1,59 @@ +{ + "title": "Real-time tracking of yellow fever virus full genome virus evolution", + "maintainers": [ + {"name": "John SJ Anderson", "url": "https://bedford.io/team/john-sj-anderson/"}, + {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"} + ], + "data_provenance": [ + { + "name": "GenBank", + "url": "https://www.ncbi.nlm.nih.gov/genbank/" + } + ], + "build_url": "https://github.com/nextstrain/yellow-fever", + "colorings": [ + { + "key": "gt", + "title": "Genotype", + "type": "categorical" + }, + { + "key": "num_date", + "title": "Date", + "type": "continuous" + }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, + { + "key": "country", + "title": "Country", + "type": "categorical" + }, + { + "key": "host", + "title": "Host", + "type": "categorical" + } + ], + "geo_resolutions": [ + "country", + "region" + ], + "display_defaults": { + "map_triplicate": true, + "color_by": "region" + }, + "filters": [ + "clade", + "region", + "country", + "author", + "host" + ], + "metadata_columns": [ + "author" + ] +} diff --git a/nextclade/defaults/clades.tsv b/nextclade/defaults/clades.tsv new file mode 100644 index 0000000..44f98c1 --- /dev/null +++ b/nextclade/defaults/clades.tsv @@ -0,0 +1,40 @@ +clade gene site alt +Angola nuc 72 A +Angola nuc 81 G +Angola nuc 88 C +Angola nuc 90 A +Angola nuc 99 T +Angola nuc 111 G +Angola nuc 219 T +Angola nuc 240 C +Angola nuc 246 A +Angola nuc 252 A +Angola nuc 255 A +Angola nuc 291 G +Angola nuc 294 A +Angola nuc 300 A +Angola nuc 315 G +Angola nuc 327 G +Angola nuc 372 A +Angola nuc 420 A +Angola nuc 432 A +Angola nuc 453 T +Angola nuc 492 G +Angola nuc 651 T +East Africa nuc 45 A +East Africa nuc 171 G +East Africa nuc 438 G +East Africa nuc 468 T +East/Central Africa nuc 228 G +West Africa I nuc 183 G +West Africa I nuc 255 C +West Africa II nuc 93 T +West Africa II nuc 270 A +West Africa II nuc 321 T +West Africa II nuc 477 A +South America I nuc 219 A +South America I nuc 532 A +South America II nuc 114 C +South America II nuc 193 T +South America II nuc 249 A +South America II nuc 639 G diff --git a/nextclade/defaults/colors.tsv b/nextclade/defaults/colors.tsv new file mode 100644 index 0000000..e0b687b --- /dev/null +++ b/nextclade/defaults/colors.tsv @@ -0,0 +1,8 @@ +# genotypes assigned by augur clades +clade_membership Angola #FCF007 +clade_membership East Africa #4B26B1 +clade_membership East/Central Africa #E307FC +clade_membership West Africa I #2CFC07 +clade_membership West Africa II #9EFC07 +clade_membership South America I #996633 +clade_membership South America II #FC0740 diff --git a/nextclade/defaults/config.yaml b/nextclade/defaults/config.yaml new file mode 100644 index 0000000..0cccf83 --- /dev/null +++ b/nextclade/defaults/config.yaml @@ -0,0 +1,25 @@ +files: + auspice_config: "defaults/auspice_config.json" + auspice_json: "auspice/yellow-fever-virus_prM-E.json" + clades: "defaults/clades.tsv" + colors: "defaults/colors.tsv" + include: "defaults/include_strains.txt" + reference_prM-E_fasta: "defaults/yellow-fever-virus-reference_prM-E.fasta" + reference_prM-E_gff: "defaults/yellow-fever-virus-reference_prM-E.gff" +strain_id_field: "accession" +align_and_extract_prM-E: + min_length: 650 + min_seed_cover: 0.01 +filter: + group_by: "region year" + subsample_max_sequences: 500 + min_date: 1927 + min_length: 650 +refine: + coalescent: "opt" + date_inference: "marginal" + clock_filter_iqd: 4 +ancestral: + inference: "joint" +export: + metadata_columns: "strain division location region year host" diff --git a/nextclade/defaults/include_strains.txt b/nextclade/defaults/include_strains.txt new file mode 100644 index 0000000..1e6da32 --- /dev/null +++ b/nextclade/defaults/include_strains.txt @@ -0,0 +1,134 @@ +# FIXME provide reference +AF369669 +AF369670 +AF369671 +AY540431 +AY540432 +AY540433 +AY540434 +AY540435 +U52390 +AY540437 +AY540438 +AY540439 +AY540440 +AY540441 +AY540442 +AY540443 +AY540444 +AY540445 +AY540446 +AY540447 +AY540448 +AY540449 +AY540450 +AY540451 +AY540452 +AY540453 +U23570 +AY540454 +AY540455 +AY540456 +AY540457 +AY540458 +AY540459 +AY540460 +AY540461 +AY540462 +AY540463 +AY540464 +AY540465 +AY540466 +AY540467 +AY540468 +AY540469 +AY540470 +AY540471 +AY540472 +AY540473 +AY540436 +U52392 +U52395 +AF369672 +AF369673 +AY540475 +AY540476 +AY540474 +U52399 +AY540477 +AY540478 +AF369674 +AF369675 +AY572535 +AY640589 +AF369686 +U54798 +AY603338 +AF369676 +U52403 +AF369677 +AF369678 +AF368679 +AF369680 +AF369681 +AF369682 +AF369683 +AF369684 +AF369685 +AY540479 +AY540480 +AY161927 +AY161928 +AY161929 +AY161930 +AY161931 +U52411 +AY161933 +AY161934 +AY161935 +U52405 +U52407 +AY161938 +AY161939 +AY161940 +AY161941 +AY161942 +AY161943 +AY161944 +AY161945 +AY161946 +AY161947 +AY161948 +AY161949 +AY161950 +AY161951 +GI694115 +U89338 +AF369687 +AF369688 +U52413 +AF369689 +AF369690 +AF369691 +AF369692 +AF369693 +AY690831 +AY690832 +AY690833 +DQ872411 +DQ872412 +AY540481 +AY540482 +AY540483 +AY540484 +AY540485 +AY540486 +AF369694 +U52422 +AF369695 +AF369696 +AY540487 +AY540488 +AY540489 +AY540490 +AF369697 diff --git a/nextclade/defaults/yellow-fever-virus-reference_prM-E.fasta b/nextclade/defaults/yellow-fever-virus-reference_prM-E.fasta new file mode 100644 index 0000000..6e332fc --- /dev/null +++ b/nextclade/defaults/yellow-fever-virus-reference_prM-E.fasta @@ -0,0 +1,13 @@ +> prM-E region (genome 641-1312, 672 nt) +CCAAGAGAGGAGCCAGATGACATTGATTGCTGGTGCTATGGGGTGGAAAACGTTAGAGTC +GCATATGGTAAGTGTGACTCAGCAGGCAGGTCTAGGAGGTCAAGAAGGGCCATTGACTTG +CCTACGCATGAAAACCATGGTTTGAAGACCCGGCAAGAAAAATGGATGACTGGAAGAATG +GGTGAAAGGCAACTCCAAAAGATTGAGAGATGGTTCGTGAGGAACCCCTTTTTTGCAGTG +ACGGCTCTGACCATTGCCTACCTTGTGGGAAGCAACATGACGCAACGAGTCGTGATTGCC +CTACTGGTCTTGGCTGTTGGTCCGGCCTACTCAGCTCACTGCATTGGAATTACTGACAGG +GATTTCATTGAGGGGGTGCATGGAGGAACTTGGGTTTCAGCTACCCTGGAGCAAGACAAG +TGTGTCACTGTTATGGCCCCTGACAAGCCTTCATTGGACATCTCACTAGAGACAGTAGCC +ATTGATAGACCTGCTGAGGTGAGGAAAGTGTGTTACAATGCAGTTCTCACTCATGTGAAG +ATTAATGACAAGTGCCCCAGCACTGGAGAGGCCCACCTAGCTGAAGAGAACGAAGGGGAC +AATGCGTGCAAGCGCACTTATTCTGATAGAGGCTGGGGCAATGGCTGTGGCCTATTTGGG +AAAGGGAGCATT diff --git a/nextclade/defaults/yellow-fever-virus-reference_prM-E.gff b/nextclade/defaults/yellow-fever-virus-reference_prM-E.gff new file mode 100644 index 0000000..615ba42 --- /dev/null +++ b/nextclade/defaults/yellow-fever-virus-reference_prM-E.gff @@ -0,0 +1,5 @@ +##sequence-region prM-E 1 672 +NC_002031.1 feature source 1 672 . + . gene=nuc +NC_002031.1 feature gene 1 334 . + . gene_name=prM +NC_002031.1 feature gene 110 334 . + . gene_name=M +NC_002031.1 feature gene 335 672 . + . gene_name=E diff --git a/nextclade/rules/annotate_phylogeny.smk b/nextclade/rules/annotate_phylogeny.smk new file mode 100644 index 0000000..a33745f --- /dev/null +++ b/nextclade/rules/annotate_phylogeny.smk @@ -0,0 +1,82 @@ +""" +This part of the workflow creates additonal annotations for the phylogenetic tree. +""" + +rule ancestral: + message: + """ + Reconstructing ancestral sequences and mutations + """ + input: + alignment = "results/aligned.fasta", + tree = "results/tree.nwk", + output: + node_data = "results/nt_muts.json", + params: + inference = config["ancestral"]["inference"], + reference_fasta = config["files"]["reference_prM-E_fasta"], + log: + "logs/ancestral.txt", + benchmark: + "benchmarks/ancestral.txt", + shell: + """ + augur ancestral \ + --tree {input.tree} \ + --alignment {input.alignment} \ + --output-node-data {output.node_data} \ + --inference {params.inference} \ + --root-sequence {params.reference_fasta} \ + &> {log:q} + """ + +rule translate: + message: + """ + Translating amino acid sequences + """ + input: + tree = "results/tree.nwk", + node_data = "results/nt_muts.json", + reference = config["files"]["reference_prM-E_gff"], + output: + node_data = "results/aa_muts.json" + log: + "logs/translate.txt", + benchmark: + "benchmarks/translate.txt", + shell: + """ + augur translate \ + --tree {input.tree:q} \ + --ancestral-sequences {input.node_data:q} \ + --reference-sequence {input.reference:q} \ + --output {output.node_data:q} \ + &> {log:q} + """ + +rule clades: + message: + """ + Running augur clades + """ + input: + tree = "results/tree.nwk", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + clade_defs = config["files"]["clades"] + output: + clades = "results/clades.json" + log: + "logs/clades.txt", + benchmark: + "benchmarks/clades.txt", + shell: + """ + augur clades \ + --tree {input.tree:q} \ + --mutations {input.nt_muts} {input.aa_muts} \ + --clades {input.clade_defs:q} \ + --output {output.clades:q} \ + &> {log:q} + """ diff --git a/nextclade/rules/construct_phylogeny.smk b/nextclade/rules/construct_phylogeny.smk new file mode 100644 index 0000000..fa51afe --- /dev/null +++ b/nextclade/rules/construct_phylogeny.smk @@ -0,0 +1,68 @@ +""" +This part of the workflow constructs the phylogenetic tree. +""" +rule tree: + message: + """ + Building tree + """ + input: + alignment = "results/aligned.fasta", + output: + tree = "results/tree_raw.nwk", + log: + "logs/tree.txt", + benchmark: + "benchmarks/tree.txt", + shell: + """ + augur tree \ + --alignment {input.alignment} \ + --output {output.tree} + &> {log:q} + """ + +rule refine: + message: + """ + Refining tree + - use {params.coalescent} coalescent timescale + - estimate {params.date_inference} node dates + - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation + """ + input: + alignment = "results/aligned.fasta", + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + tree = "results/tree_raw.nwk", + output: + node_data = "results/branch_lengths.json", + tree = "results/tree.nwk", + params: + strain_id = config["strain_id_field"], + coalescent = config["refine"]["coalescent"], + date_inference = config["refine"]["date_inference"], + clock_filter_iqd = config["refine"]["clock_filter_iqd"], + log: + "logs/refine.txt", + benchmark: + "benchmarks/refine.txt", + shell: + """ + augur refine \ + --tree {input.tree:q} \ + --alignment {input.alignment:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --output-tree {output.tree:q} \ + --output-node-data {output.node_data:q} \ + --coalescent {params.coalescent:q} \ + --timetree \ + --date-confidence \ + --date-inference {params.date_inference:q} \ + --clock-filter-iqd {params.clock_filter_iqd:q} \ + --stochastic-resolve + &> {log:q} + """ diff --git a/nextclade/rules/export.smk b/nextclade/rules/export.smk new file mode 100644 index 0000000..ac254c8 --- /dev/null +++ b/nextclade/rules/export.smk @@ -0,0 +1,44 @@ +""" +This part of the workflow collects the phylogenetic tree and +annotations to export a Nextstrain dataset. +""" +rule export: + message: + """ + Exporting data files for auspice + """ + input: + tree = "results/tree.nwk", + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + branch_lengths = "results/branch_lengths.json", + clades = "results/clades.json", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + colors = config["files"]["colors"], + auspice_config = config["files"]["auspice_config"], + output: + auspice_json = config["files"]["auspice_json"], + params: + strain_id = config["strain_id_field"], + metadata_columns = config["export"]["metadata_columns"], + log: + "logs/export.txt", + benchmark: + "benchmarks/export.txt", + shell: + """ + augur export v2 \ + --tree {input.tree:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --node-data {input.branch_lengths} {input.nt_muts} {input.aa_muts} {input.clades} \ + --colors {input.colors:q} \ + --metadata-columns {params.metadata_columns} \ + --auspice-config {input.auspice_config:q} \ + --include-root-sequence-inline \ + --output {output.auspice_json:q} \ + &> {log:q} + """ diff --git a/nextclade/rules/prepare_sequences.smk b/nextclade/rules/prepare_sequences.smk new file mode 100644 index 0000000..72eaea8 --- /dev/null +++ b/nextclade/rules/prepare_sequences.smk @@ -0,0 +1,73 @@ +""" +This part of the workflow prepares sequences for constructing the +phylogenetic tree. +""" +rule align_and_extract_prM_E: + input: + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + sequences = "../ingest/results/sequences.fasta", + reference = config["files"]["reference_prM-E_fasta"], + output: + sequences = "results/sequences_prM-E.fasta", + params: + min_length = config['align_and_extract_prM-E']['min_length'], + min_seed_cover = config['align_and_extract_prM-E']['min_seed_cover'], + log: + "logs/align_and_extract_prM_E.txt", + benchmark: + "benchmarks/align_and_extract_prM_E.txt", + threads: workflow.cores + shell: + """ + nextclade3 run \ + --jobs {threads:q} \ + --input-ref {input.reference:q} \ + --output-fasta {output.sequences:q} \ + --min-seed-cover {params.min_seed_cover:q} \ + --min-length {params.min_length:q} \ + --silent \ + {input.sequences:q} \ + &> {log:q} + """ + +rule filter: + message: """ + Filtering to + * FIXME + """ + input: + include = config["files"]["include"], + # TODO once this repo is fully automated and uploading data to + # S3, this step should download data from there instead of + # depending on the ingest build + metadata = "../ingest/results/metadata.tsv", + sequences = "results/sequences_prM-E.fasta", + output: + sequences = "results/aligned.fasta" + params: + group_by = config["filter"]["group_by"], + subsample_max_sequences = config["filter"]["subsample_max_sequences"], + min_date = config["filter"]["min_date"], + min_length = config["filter"]["min_length"], + strain_id = config["strain_id_field"] + log: + "logs/filter.txt", + benchmark: + "benchmarks/filter.txt" + shell: + """ + augur filter \ + --sequences {input.sequences:q} \ + --metadata {input.metadata:q} \ + --metadata-id-columns {params.strain_id:q} \ + --exclude-all \ + --include {input.include:q} \ + --output {output.sequences:q} \ + --group-by {params.group_by} \ + --subsample-max-sequences {params.subsample_max_sequences:q} \ + --min-date {params.min_date:q} \ + --min-length {params.min_length:q} \ + &> {log:q} + """