Skip to content

Commit

Permalink
Add Clade-I build
Browse files Browse the repository at this point in the history
  • Loading branch information
jameshadfield committed May 27, 2024
1 parent 7f5adc3 commit d8faead
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 0 deletions.
70 changes: 70 additions & 0 deletions phylogenetic/defaults/clade-i/auspice_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
  "title": "Genomic epidemiology of mpox clade I viruses",
  "maintainers": [
    {
      "name": "Nextstrain team",
      "url": "https://nextstrain.org"
    }
  ],
  "data_provenance": [
    {
      "name": "GenBank",
      "url": "https://www.ncbi.nlm.nih.gov/genbank/"
    }
  ],
  "build_url": "https://github.com/nextstrain/mpox",
  "colorings": [
    {
      "key": "region",
      "title": "Region",
      "type": "categorical"
    },
    {
      "key": "country",
      "title": "Country",
      "type": "categorical"
    },
    {
      "key": "host",
      "title": "Host",
      "type": "categorical"
    },
    {
      "key": "GA_CT_fraction",
      "title": "G→A or C→T fraction",
      "type": "continuous"
    },
    {
      "key": "dinuc_context_fraction",
      "title": "NGA/TCN context of G→A/C→T mutations",
      "type": "continuous"
    },
    {
      "key": "recency",
      "title": "Submission Recency",
      "type": "categorical"
    },
    {
      "key": "date_submitted",
      "title": "Release Date",
      "type": "categorical"
    },
    {
      "key": "date",
      "title": "Collection date",
      "type": "categorical"
    }
  ],
  "geo_resolutions": [
    "country"
  ],
  "display_defaults": {
    "color_by": "country",
    "map_triplicate": true,
    "distance_measure": "num_date",
    "transmission_lines": false
  },
  "filters": [
    "country",
    "region",
    "recency",
    "host"
  ]
}
56 changes: 56 additions & 0 deletions phylogenetic/defaults/clade-i/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Default workflow configuration for the mpox clade I phylogenetic build.
# Paths are given relative to the phylogenetic workflow directory.
reference: "defaults/reference.fasta"
genome_annotation: "defaults/genome_annotation.gff3"
genbank_reference: "defaults/reference.gb"
include: "defaults/clade-i/include.txt"
clades: "defaults/clades.tsv"
lat_longs: "defaults/lat_longs.tsv"
auspice_config: "defaults/clade-i/auspice_config.json"
description: "defaults/description.md"
tree_mask: "defaults/tree_mask.tsv"

# Use `accession` as the ID column since `strain` currently contains duplicates¹.
# ¹ https://github.com/nextstrain/mpox/issues/33
strain_id_field: "accession"
display_strain_field: "strain"

build_name: "clade-i"
auspice_name: "mpox_clade-I"

# Consumed by the `filter` rule (rules/prepare_sequences.smk) as
# config["filter"][...], so these keys must stay nested under `filter`.
filter:
  min_date: 1900
  min_length: 100000
  # Optional key: when present it is passed to `augur filter --exclude-where`.
  exclude_where: 'clade!=I'


### We don't want to subsample, so specify a config which is essentially a no-op
subsample:
  everything:
    group_by: ""
    sequences_per_group: ""

## align
max_indel: 10000
seed_spacing: 1000

## treefix
fix_tree: true
treefix_root: "" # without a root we'll midpoint root which should work great for clade I

## refine
timetree: true
root: "best"
clock_rate: 5.7e-5
# Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML) resolve it
# as a float; a bare `2e-5` is loaded as the string "2e-5".
clock_std_dev: 2.0e-5
divergence_units: "mutations"

traits:
  columns: "country"
  sampling_bias_correction: 3

## recency
recency: true

mask:
  from_beginning: 800
  from_end: 6422
  maskfile: "defaults/mask.bed"
Empty file.
6 changes: 6 additions & 0 deletions phylogenetic/rules/prepare_sequences.smk
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ rule filter:
min_date=config["filter"]["min_date"],
min_length=config["filter"]["min_length"],
strain_id=config["strain_id_field"],
exclude_where=lambda w: (
f"--exclude-where {config['filter']['exclude_where']}"
if "exclude_where" in config["filter"]
else ""
),
shell:
"""
augur filter \
Expand All @@ -74,6 +79,7 @@ rule filter:
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--exclude {input.exclude} \
{params.exclude_where} \
--min-date {params.min_date} \
--min-length {params.min_length} \
--query "(QC_rare_mutations == 'good' | QC_rare_mutations == 'mediocre')" \
Expand Down

0 comments on commit d8faead

Please sign in to comment.