Skip to content

Commit

Permalink
Rename builds and sample differently
Browse files Browse the repository at this point in the history
New names no longer include the deprecated `monkeypox` name

New mpxv and mpxv-IIb builds downsample B.1* sequences so that they don't overwhelm the tree

Add an mpxv-IIb-B.1 build that contains only B.1* sequences
  • Loading branch information
corneliusroemer committed Sep 20, 2023
1 parent cd969f7 commit b34b076
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 39 deletions.
49 changes: 45 additions & 4 deletions config/config_hmpxv1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,55 @@ colors: "config/colors_hmpxv1.tsv"
auspice_config: "config/auspice_config_hmpxv1.json"

build_name: "hmpxv1"
auspice_name: "monkeypox_hmpxv1"
auspice_name: "mpox_mpxv-IIb"

## filter
min_date: 2017
min_length: 100000
sequences_per_group: "--sequences-per-group 40"
group_by: "--group-by year month country"
filters: "--exclude-where outbreak!=hMPXV-1"


### Set 1: Non-B.1 sequences: use all
### Set 2: B.1 sequences: small sample across year/country, maybe month
filter:
non_b1:
group_by: "--group-by lineage year country"
sequences_per_group: "--sequences-per-group 50"
other_filters: "outbreak!=hMPXV-1 clade!=IIb"
exclude_lineages:
- B.1
- B.1.1
- B.1.2
- B.1.3
- B.1.4
- B.1.5
- B.1.6
- B.1.7
- B.1.8
- B.1.9
- B.1.10
- B.1.11
- B.1.12
- B.1.13
- B.1.14
- B.1.15
- B.1.16
- B.1.17
- B.1.18
- B.1.19
- B.1.20
- C.1
b1:
group_by: "--group-by country year"
sequences_per_group: "--subsample-max-sequences 100"
other_filters: "--exclude-where outbreak!=hMPXV-1 clade!=IIb"

## align
max_indel: 10000
seed_spacing: 1000

## treefix
fix_tree: true
treefix_root: "--root MK783032"

## refine
timetree: true
Expand Down
33 changes: 26 additions & 7 deletions config/config_hmpxv1_big.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,37 @@ colors: "config/colors_hmpxv1.tsv"
auspice_config: "config/auspice_config_hmpxv1_big.json"

build_name: "hmpxv1_big"
auspice_name: "monkeypox_hmpxv1_big"
auspice_name: "mpox_mpxv-IIb-B.1"

## filter
min_date: 2017
min_length: 100000
sequences_per_group: ""
group_by: ""
filters: "--exclude-where outbreak!=hMPXV-1"
min_date: 2022
min_length: 180000
filter:
b1:
group_by: "--group-by year month country"
sequences_per_group: "--subsample-max-sequences 5000"
other_filters: "outbreak!=hMPXV-1 clade!=IIb"
exclude_lineages:
- A
- A.1
- A.1.1
- A.2
- A.2.1
- A.2.2
- A.2.3
- A.3

## align
max_indel: 10000
seed_spacing: 1000

## treefix
fix_tree: true
treefix_root: "--root OP890401"

## refine
timetree: true
root: "MK783032 MK783030"
root: "OP890401"
clock_rate: 5.7e-5
clock_std_dev: 2e-5

Expand Down
37 changes: 34 additions & 3 deletions config/config_mpxv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,44 @@ colors: "config/colors_mpxv.tsv"
auspice_config: "config/auspice_config_mpxv.json"

build_name: "mpxv"
auspice_name: "monkeypox_mpxv"
auspice_name: "mpox_mpxv"

## filter
min_date: 1950
min_length: 100000
group_by: "--group-by clade year country"
sequences_per_group: "--sequences-per-group 50"

### Set 1: Non-B.1 sequences: use all
### Set 2: B.1 sequences: small sample across year/country, maybe month
filter:
non_b1:
group_by: "--group-by clade year country"
sequences_per_group: "--sequences-per-group 50"
exclude_lineages:
- B.1
- B.1.1
- B.1.2
- B.1.3
- B.1.4
- B.1.5
- B.1.6
- B.1.7
- B.1.8
- B.1.9
- B.1.10
- B.1.11
- B.1.12
- B.1.13
- B.1.14
- B.1.15
- B.1.16
- B.1.17
- B.1.18
- B.1.19
- B.1.20
- C.1
b1:
group_by: "--group-by country year"
sequences_per_group: "--subsample-max-sequences 100"

## refine
timetree: false
Expand Down
Empty file added config/include_hmpxv1.txt
Empty file.
1 change: 1 addition & 0 deletions config/include_hmpxv1_big.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
OP890401
Empty file added config/include_mpxv.txt
Empty file.
60 changes: 35 additions & 25 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -48,53 +48,60 @@ rule exclude_bad:
"""


rule include_A_strains:
rule filter:
input:
sequences=rules.exclude_bad.output.sequences,
metadata=rules.exclude_bad.output.metadata,
output:
include_strains=build_dir + "/{build_name}/include_strains.txt",
strains=build_dir + "/{build_name}/{subset}_strains.txt",
log=build_dir + "/{build_name}/{subset}_filter.log",
params:
group_by=lambda w: config["filter"][w.subset]["group_by"],
sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"],
other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""),
exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}"
if "exclude_lineages" in config["filter"][w.subset]
else "",
strain_id=config["strain_id_field"],
shell:
"""
tsv-filter -H \
--str-in-fld lineage:'A'\
{input.metadata} \
| tsv-select -f 1 \
> {output.include_strains}
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-strains {output.strains} \
{params.group_by} \
{params.sequences_per_group} \
{params.exclude} \
{params.other_filters} \
--output-log {output.log}
"""


rule filter:
message:
"""
Filtering to
- {params.sequences_per_group} sequence(s) per {params.group_by!s}
"""
rule filter_merge:
input:
strains=lambda w: [
f"{build_dir}/{w.build_name}/{subset}_strains.txt"
for subset in config["filter"]
],
sequences=rules.exclude_bad.output.sequences,
metadata=rules.exclude_bad.output.metadata,
include=rules.include_A_strains.output.include_strains,
include="config/include_{build_name}.txt",
output:
sequences=build_dir + "/{build_name}/filtered.fasta",
metadata=build_dir + "/{build_name}/metadata.tsv",
log=build_dir + "/{build_name}/filter.log",
params:
group_by=config.get("group_by", "--group-by clade lineage"),
sequences_per_group=config["sequences_per_group"],
other_filters=config.get("filters", ""),
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--metadata-id-columns {params.strain_id} \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--include {input.include} \
--exclude-all \
--include {input.strains} {input.include}\
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
{params.group_by} \
{params.sequences_per_group} \
{params.other_filters} \
--output-log {output.log}
--output-metadata {output.metadata}
"""


Expand Down Expand Up @@ -198,11 +205,14 @@ rule fix_tree:
alignment=build_dir + "/{build_name}/masked.fasta",
output:
tree=build_dir + "/{build_name}/tree_fixed.nwk",
params:
root=lambda w: config.get("treefix_root", ""),
shell:
"""
python3 scripts/fix_tree.py \
--alignment {input.alignment} \
--input-tree {input.tree} \
{params.root} \
--output {output.tree}
"""

Expand Down

0 comments on commit b34b076

Please sign in to comment.