From d8faead5c848e62bdc51f5281cde358808dd135b Mon Sep 17 00:00:00 2001
From: james hadfield
Date: Mon, 27 May 2024 16:37:51 +1200
Subject: [PATCH] Add Clade-I build

---
 .../defaults/clade-i/auspice_config.json      | 70 +++++++++++++++++++
 phylogenetic/defaults/clade-i/config.yaml     | 56 +++++++++++++++
 phylogenetic/defaults/clade-i/include.txt     |  0
 phylogenetic/rules/prepare_sequences.smk      |  6 ++
 4 files changed, 132 insertions(+)
 create mode 100644 phylogenetic/defaults/clade-i/auspice_config.json
 create mode 100644 phylogenetic/defaults/clade-i/config.yaml
 create mode 100644 phylogenetic/defaults/clade-i/include.txt

diff --git a/phylogenetic/defaults/clade-i/auspice_config.json b/phylogenetic/defaults/clade-i/auspice_config.json
new file mode 100644
index 00000000..a924efca
--- /dev/null
+++ b/phylogenetic/defaults/clade-i/auspice_config.json
@@ -0,0 +1,70 @@
+{
+  "title": "Genomic epidemiology of mpox clade I viruses",
+  "maintainers": [
+    {"name": "Nextstrain team", "url": "http://nextstrain.org"}
+  ],
+  "data_provenance": [
+    {
+      "name": "GenBank",
+      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
+    }
+  ],
+  "build_url": "https://github.com/nextstrain/mpox",
+  "colorings": [
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "host",
+      "title": "Host",
+      "type": "categorical"
+    },
+    {
+      "key": "GA_CT_fraction",
+      "title": "G→A or C→T fraction",
+      "type": "continuous"
+    },
+    {
+      "key": "dinuc_context_fraction",
+      "title": "NGA/TCN context of G→A/C→T mutations",
+      "type": "continuous"
+    },
+    {
+      "key": "recency",
+      "title": "Submission Recency",
+      "type": "categorical"
+    },
+    {
+      "key": "date_submitted",
+      "title": "Release Date",
+      "type": "categorical"
+    },
+    {
+      "key": "date",
+      "title": "Collection date",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country"
+  ],
+  "display_defaults": {
+    "color_by": "country",
+    "map_triplicate": true,
+    "distance_measure": "num_date",
+    "transmission_lines": false
+  },
+  "filters": [
+    "country",
+    "region",
+    "recency",
+    "host"
+  ]
+}
diff --git a/phylogenetic/defaults/clade-i/config.yaml b/phylogenetic/defaults/clade-i/config.yaml
new file mode 100644
index 00000000..0f0d615b
--- /dev/null
+++ b/phylogenetic/defaults/clade-i/config.yaml
@@ -0,0 +1,56 @@
+reference: "defaults/reference.fasta"
+genome_annotation: "defaults/genome_annotation.gff3"
+genbank_reference: "defaults/reference.gb"
+include: "defaults/clade-i/include.txt"
+clades: "defaults/clades.tsv"
+lat_longs: "defaults/lat_longs.tsv"
+auspice_config: "defaults/clade-i/auspice_config.json"
+description: "defaults/description.md"
+tree_mask: "defaults/tree_mask.tsv"
+
+# Use `accession` as the ID column since `strain` currently contains duplicates¹.
+# ¹ https://github.com/nextstrain/mpox/issues/33
+strain_id_field: "accession"
+display_strain_field: "strain"
+
+build_name: "clade-i"
+auspice_name: "mpox_clade-I"
+
+filter:
+  min_date: 1900
+  min_length: 100000
+  exclude_where: 'clade!=I'
+
+
+### We don't want to subsample, so specify a config which is essentially a no-op
+subsample:
+  everything:
+    group_by: ""
+    sequences_per_group: ""
+
+## align
+max_indel: 10000
+seed_spacing: 1000
+
+## treefix
+fix_tree: true
+treefix_root: "" # without a root we'll midpoint root which should work great for clade I
+
+## refine
+timetree: true
+root: "best"
+clock_rate: 5.7e-5
+clock_std_dev: 2e-5
+divergence_units: "mutations"
+
+traits:
+  columns: "country"
+  sampling_bias_correction: 3
+
+## recency
+recency: true
+
+mask:
+  from_beginning: 800
+  from_end: 6422
+  maskfile: "defaults/mask.bed"
diff --git a/phylogenetic/defaults/clade-i/include.txt b/phylogenetic/defaults/clade-i/include.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
index c9ab5012..4916379a 100644
--- a/phylogenetic/rules/prepare_sequences.smk
+++ b/phylogenetic/rules/prepare_sequences.smk
@@ -65,6 +65,11 @@ rule filter:
         min_date=config["filter"]["min_date"],
         min_length=config["filter"]["min_length"],
         strain_id=config["strain_id_field"],
+        exclude_where=lambda w: (
+            f"--exclude-where {config['filter']['exclude_where']}"
+            if "exclude_where" in config["filter"]
+            else ""
+        ),
     shell:
         """
         augur filter \
@@ -74,6 +79,7 @@ rule filter:
             --output-sequences {output.sequences} \
             --output-metadata {output.metadata} \
             --exclude {input.exclude} \
+            {params.exclude_where} \
             --min-date {params.min_date} \
             --min-length {params.min_length} \
             --query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \