Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various seasonal-cov improvements #43

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ curate:
Geographic Location: location
Isolate Collection date: date
Host Name: host
Submitter Names: authors
Submitter Names: full_authors
Submitter Affiliation: institution
SRA Accessions: sra_accession

Expand Down Expand Up @@ -93,13 +93,13 @@ curate:
fields: ["region", "country", "division", "location"]

# Metadata field that contains the list of authors associated with the sequence
authors_field: "authors"
authors_field: "full_authors"

# Default value to use if the authors field is empty
authors_default_value: "?"

# Name to use for the generated abbreviated authors field
abbr_authors_field: "abbr_authors"
abbr_authors_field: "authors"

# The ID field in the metadata to use to merge the manual annotations
annotations_id: "strain"
Expand All @@ -119,6 +119,6 @@ curate:
- country
- host
- sra_accession
- abbr_authors
- full_authors
- authors
- institution
5 changes: 3 additions & 2 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ rule subset_metadata:
metadata_fields=",".join(config["curate"]["metadata_columns"]),
shell:
r"""
tsv-select -H -f {params.metadata_fields:q} \
{input.metadata:q} \
csv2tsv --csv-delim $'\t' {input.metadata:q} \
| tsv-select -H -f {params.metadata_fields:q} \
| csvtk fix-quotes --tabs \
> {output.subset_metadata:q} \
2> {log:q}
"""
3 changes: 2 additions & 1 deletion phylogenetic/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ configfile: "defaults/config.yaml"

rule all:
input:
expand("auspice/seasonal-cov_{virus}.json", virus=config["viruses"]),
auspice_json=expand("auspice/seasonal-cov_{virus}.json", virus=config["viruses"]),
tip_frequencies_json=expand("auspice/seasonal-cov_{virus}_tip-frequencies.json", virus=config["viruses"]),


include: "rules/prepare_sequences.smk"
Expand Down
20 changes: 15 additions & 5 deletions phylogenetic/defaults/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,24 @@
],
"build_url": "https://github.com/nextstrain/seasonal-cov",
"colorings": [
{
{
"key": "gt",
"title": "Genotype",
"type": "categorical"
},
{
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "country",
"title": "Country",
"type": "categorical"
},
{
"key": "region",
"title": "Region",
"key": "author",
"title": "Author",
"type": "categorical"
}
],
Expand All @@ -38,11 +43,16 @@
},
"filters": [
"country",
"region"
"region",
"author"
],
"panels": [
"tree",
"map",
"entropy"
"entropy",
"frequencies"
],
"metadata_columns": [
"author"
]
}
15 changes: 11 additions & 4 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ viruses:
- "oc43"
- "hku1"

# Name of the metadata column that identifies each strain. The tip_frequencies
# rule passes this value to `augur frequencies --metadata-id-columns`.
strain_id_field: "strain"

# Settings forwarded to `augur frequencies --method kde` by the
# tip_frequencies rule; shared by every virus in the `viruses` list.
tip_frequencies:
  # Passed as --min-date: start of the window over which frequencies are estimated.
  min_date: "2017-01-01"
  # Passed as --max-date.
  # NOTE(review): "6M" looks like an augur backwards-looking relative date
  # (six months before the run date) — confirm against the augur docs.
  max_date: "6M"
  # Passed as --narrow-bandwidth and --wide-bandwidth respectively.
  # NOTE(review): presumably KDE bandwidths expressed in years — confirm.
  narrow_bandwidth: 0.2
  wide_bandwidth: 0.6
# virus-specific information
# For each virus, provide the following parameter values. All paths
# should be relative to the phylogenetic directory.
Expand Down Expand Up @@ -38,8 +45,8 @@ viruses:
subsample_max_sequences: 4000
min_length: 20000
construct_phylogeny:
clock_rate: ""
clock_std_dev: ""
clock_rate: 0.000250
clock_std_dev: 0.00010
coalescent: "opt"
date_inference: "marginal"
clock_filter_iqd: 4
Expand Down Expand Up @@ -68,8 +75,8 @@ oc43:
subsample_max_sequences: 4000
min_length: 20000
construct_phylogeny:
clock_rate: ""
clock_std_dev: ""
clock_rate: 0.000250
clock_std_dev: 0.00010
coalescent: "opt"
date_inference: "marginal"
clock_filter_iqd: 4
Expand Down
41 changes: 41 additions & 0 deletions phylogenetic/defaults/hku1/dropped_strains.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
ON136168.1 # Chow et al - excluding these strains improves rooting of the tree
ON136169.1 # Chow et al - excluding these strains improves rooting of the tree
ON136170.1 # Chow et al - excluding these strains improves rooting of the tree
ON136171.1 # Chow et al - excluding these strains improves rooting of the tree
ON136172.1 # Chow et al - excluding these strains improves rooting of the tree
ON136173.1 # Chow et al - excluding these strains improves rooting of the tree
ON461755.1 # Chow et al - excluding these strains improves rooting of the tree
ON461756.1 # Chow et al - excluding these strains improves rooting of the tree
ON461757.1 # Chow et al - excluding these strains improves rooting of the tree
ON461758.1 # Chow et al - excluding these strains improves rooting of the tree
ON461759.1 # Chow et al - excluding these strains improves rooting of the tree
ON461760.1 # Chow et al - excluding these strains improves rooting of the tree
ON461761.1 # Chow et al - excluding these strains improves rooting of the tree
ON461762.1 # Chow et al - excluding these strains improves rooting of the tree
ON461763.1 # Chow et al - excluding these strains improves rooting of the tree
ON461764.1 # Chow et al - excluding these strains improves rooting of the tree
ON461765.1 # Chow et al - excluding these strains improves rooting of the tree
ON461766.1 # Chow et al - excluding these strains improves rooting of the tree
ON461768.1 # Chow et al - excluding these strains improves rooting of the tree
ON461769.1 # Chow et al - excluding these strains improves rooting of the tree
ON461770.1 # Chow et al - excluding these strains improves rooting of the tree
ON461771.1 # Chow et al - excluding these strains improves rooting of the tree
ON461772.1 # Chow et al - excluding these strains improves rooting of the tree
ON461773.1 # Chow et al - excluding these strains improves rooting of the tree
ON461774.1 # Chow et al - excluding these strains improves rooting of the tree
ON461775.1 # Chow et al - excluding these strains improves rooting of the tree
ON461776.1 # Chow et al - excluding these strains improves rooting of the tree
ON461777.1 # Chow et al - excluding these strains improves rooting of the tree
ON461778.1 # Chow et al - excluding these strains improves rooting of the tree
ON461779.1 # Chow et al - excluding these strains improves rooting of the tree
ON461780.1 # Chow et al - excluding these strains improves rooting of the tree
ON461781.1 # Chow et al - excluding these strains improves rooting of the tree

# exclude sequences with >6000Ns and/or <80% coverage
OY757708.1 # 27.8k Ns, 8.3% coverage
OY757859.1 # 26.6k Ns, 10.7% coverage
KF430196.1 # 8446 Ns, 70.4% coverage
KF850450.2 # 8105 Ns, 71.7% coverage
ON461767.1 # 7302 Ns, 75.0% coverage
OY757702.1 # 6770 Ns, 77.3% coverage
MW587043.1 # 6072 Ns, 78.9% coverage
2 changes: 2 additions & 0 deletions phylogenetic/defaults/nl63/dropped_strains.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
PQ037243.1
PQ037240.1
40 changes: 14 additions & 26 deletions phylogenetic/rules/construct_phylogeny.smk
Original file line number Diff line number Diff line change
Expand Up @@ -45,31 +45,19 @@ rule refine:
date_inference=lambda wildcards: config[wildcards.virus]["construct_phylogeny"]["date_inference"],
clock_filter_iqd=lambda wildcards: config[wildcards.virus]["construct_phylogeny"]["clock_filter_iqd"],
shell:
# TODO move this conditional logic up into the params lambda (?)
r"""
(
if [ "{wildcards.virus}" == "229e" ] || [ "{wildcards.virus}" == "oc43" ]; then
echo "Estimating clock rate for {wildcards.virus}"
clock_rate=""
clock_std_dev=""
else
echo "Setting clock rate at {params.clock_rate} with std dev {params.clock_std_dev} for {wildcards.virus}"
clock_rate="--clock-rate {params.clock_rate}"
clock_std_dev="--clock-std-dev {params.clock_std_dev}"
fi

augur refine \
--tree {input.tree:q} \
--alignment {input.alignment:q} \
--metadata {input.metadata:q} \
--output-tree {output.tree:q} \
--output-node-data {output.node_data:q} \
--timetree \
$clock_rate \
$clock_std_dev \
--coalescent {params.coalescent:q} \
--date-confidence \
--date-inference {params.date_inference:q} \
--clock-filter-iqd {params.clock_filter_iqd:q}
) 2>{log:q}
augur refine \
--tree {input.tree:q} \
--alignment {input.alignment:q} \
--metadata {input.metadata:q} \
--output-tree {output.tree:q} \
--output-node-data {output.node_data:q} \
--timetree \
--clock-rate {params.clock_rate} \
--clock-std-dev {params.clock_std_dev} \
--coalescent {params.coalescent:q} \
--date-confidence \
--date-inference {params.date_inference:q} \
--clock-filter-iqd {params.clock_filter_iqd:q} \
&> {log:q}
"""
29 changes: 29 additions & 0 deletions phylogenetic/rules/export.smk
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,32 @@ rule export:
--output {output.auspice_json:q} \
2>{log:q}
"""

rule tip_frequencies:
    """
    Estimate KDE tip frequencies for one virus.

    Runs `augur frequencies --method kde` on the virus's tree and metadata and
    writes the result to auspice/seasonal-cov_{virus}_tip-frequencies.json.

    The date window and the two bandwidths come from the top-level
    `tip_frequencies` section of the config; the metadata ID column comes from
    `strain_id_field`. These values are the same for every virus.
    """
    input:
        tree="results/{virus}/tree.nwk",
        metadata="data/{virus}/metadata.tsv",
    output:
        tip_freq="auspice/seasonal-cov_{virus}_tip-frequencies.json",
    params:
        strain_id=config["strain_id_field"],
        min_date=config["tip_frequencies"]["min_date"],
        max_date=config["tip_frequencies"]["max_date"],
        narrow_bandwidth=config["tip_frequencies"]["narrow_bandwidth"],
        wide_bandwidth=config["tip_frequencies"]["wide_bandwidth"],
    shell:
        # Every placeholder is shell-quoted with `:q` so that paths or config
        # values containing spaces or shell metacharacters are passed to augur
        # as single, literal arguments.
        r"""
        augur frequencies \
            --method kde \
            --tree {input.tree:q} \
            --metadata {input.metadata:q} \
            --metadata-id-columns {params.strain_id:q} \
            --min-date {params.min_date:q} \
            --max-date {params.max_date:q} \
            --narrow-bandwidth {params.narrow_bandwidth:q} \
            --wide-bandwidth {params.wide_bandwidth:q} \
            --output {output.tip_freq:q}
        """
Loading