From 968c4d9c5f9769cbf2a006dbd2dfde15d576e282 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Mon, 5 Aug 2024 12:00:08 -0700 Subject: [PATCH 1/9] Lock clock rates for 229E and OC43 [#36] --- phylogenetic/defaults/config.yaml | 8 ++--- phylogenetic/rules/construct_phylogeny.smk | 40 ++++++++-------------- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 4e0b6b7..f4ae1c0 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -38,8 +38,8 @@ viruses: subsample_max_sequences: 4000 min_length: 20000 construct_phylogeny: - clock_rate: "" - clock_std_dev: "" + clock_rate: 0.000250 + clock_std_dev: 0.00010 coalescent: "opt" date_inference: "marginal" clock_filter_iqd: 4 @@ -68,8 +68,8 @@ oc43: subsample_max_sequences: 4000 min_length: 20000 construct_phylogeny: - clock_rate: "" - clock_std_dev: "" + clock_rate: 0.000250 + clock_std_dev: 0.00010 coalescent: "opt" date_inference: "marginal" clock_filter_iqd: 4 diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index fba9328..fe5400a 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -45,31 +45,19 @@ rule refine: date_inference=lambda wildcards: config[wildcards.virus]["construct_phylogeny"]["date_inference"], clock_filter_iqd=lambda wildcards: config[wildcards.virus]["construct_phylogeny"]["clock_filter_iqd"], shell: - # TODO move this conditional logic up into the params lambda (?) 
r""" - ( - if [ "{wildcards.virus}" == "229e" ] || [ "{wildcards.virus}" == "oc43" ]; then - echo "Estimating clock rate for {wildcards.virus}" - clock_rate="" - clock_std_dev="" - else - echo "Setting clock rate at {params.clock_rate} with std dev {params.clock_std_dev} for {wildcards.virus}" - clock_rate="--clock-rate {params.clock_rate}" - clock_std_dev="--clock-std-dev {params.clock_std_dev}" - fi - - augur refine \ - --tree {input.tree:q} \ - --alignment {input.alignment:q} \ - --metadata {input.metadata:q} \ - --output-tree {output.tree:q} \ - --output-node-data {output.node_data:q} \ - --timetree \ - $clock_rate \ - $clock_std_dev \ - --coalescent {params.coalescent:q} \ - --date-confidence \ - --date-inference {params.date_inference:q} \ - --clock-filter-iqd {params.clock_filter_iqd:q} - ) 2>{log:q} + augur refine \ + --tree {input.tree:q} \ + --alignment {input.alignment:q} \ + --metadata {input.metadata:q} \ + --output-tree {output.tree:q} \ + --output-node-data {output.node_data:q} \ + --timetree \ + --clock-rate {params.clock_rate} \ + --clock-std-dev {params.clock_std_dev} \ + --coalescent {params.coalescent:q} \ + --date-confidence \ + --date-inference {params.date_inference:q} \ + --clock-filter-iqd {params.clock_filter_iqd:q} + &> {log:q} """ From 9b5d38ff6ca940dea362c1afb53d6380162bca27 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 12 Dec 2024 16:03:43 -0800 Subject: [PATCH 2/9] Use abbreviated authors as "authors"; full authors as "full authors" --- ingest/defaults/config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 0d3b1aa..57e554b 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -45,7 +45,7 @@ curate: Geographic Location: location Isolate Collection date: date Host Name: host - Submitter Names: authors + Submitter Names: full_authors Submitter Affiliation: institution SRA Accessions: sra_accession @@ -93,13 
+93,13 @@ curate: fields: ["region", "country", "division", "location"] # Metadata field that contains the list of authors associated with the sequence - authors_field: "authors" + authors_field: "full_authors" # Default value to use if the authors field is empty authors_default_value: "?" # Name to use for the generated abbreviated authors field - abbr_authors_field: "abbr_authors" + abbr_authors_field: "authors" # The ID field in the metadata to use to merge the manual annotations annotations_id: "strain" @@ -119,6 +119,6 @@ curate: - country - host - sra_accession - - abbr_authors + - full_authors - authors - institution From a86fb3c0badf15bf1eabec39ad844552e30f5856 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 12 Dec 2024 16:06:03 -0800 Subject: [PATCH 3/9] Add tip frequencies, enable frequencies panel [#41] --- phylogenetic/Snakefile | 3 ++- phylogenetic/defaults/auspice_config.json | 9 +++++-- phylogenetic/defaults/config.yaml | 7 ++++++ phylogenetic/rules/export.smk | 29 +++++++++++++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index c4909ba..0930239 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -4,7 +4,8 @@ configfile: "defaults/config.yaml" rule all: input: - expand("auspice/seasonal-cov_{virus}.json", virus=config["viruses"]), + auspice_json=expand("auspice/seasonal-cov_{virus}.json", virus=config["viruses"]), + tip_frequencies_json=expand("auspice/seasonal-cov_{virus}_tip-frequencies.json", virus=config["viruses"]), include: "rules/prepare_sequences.smk" diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json index 6d933c5..e32e6c3 100644 --- a/phylogenetic/defaults/auspice_config.json +++ b/phylogenetic/defaults/auspice_config.json @@ -38,11 +38,16 @@ }, "filters": [ "country", - "region" + "region", + "author" ], "panels": [ "tree", "map", - "entropy" + "entropy", + "frequencies" + ], + 
"metadata_columns": [ + "author" ] } diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index f4ae1c0..3b210f0 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -7,6 +7,13 @@ viruses: - "oc43" - "hku1" +strain_id_field: "strain" + +tip_frequencies: + min_date: "2017-01-01" + max_date: "6M" + narrow_bandwidth: 0.2 + wide_bandwidth: 0.6 # virus-specific information # For each virus, provide the following parameter values. All paths # should be relative to the phylogenetic directory. diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index 8b29266..8cde332 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -37,3 +37,32 @@ rule export: --output {output.auspice_json:q} \ 2>{log:q} """ + +rule tip_frequencies: + """ + Estimating KDE frequencies for tips + """ + input: + tree = "results/{virus}/tree.nwk", + metadata = "data/{virus}/metadata.tsv", + output: + tip_freq = "auspice/seasonal-cov_{virus}_tip-frequencies.json" + params: + strain_id = config["strain_id_field"], + min_date = config["tip_frequencies"]["min_date"], + max_date = config["tip_frequencies"]["max_date"], + narrow_bandwidth =config["tip_frequencies"]["narrow_bandwidth"], + wide_bandwidth = config["tip_frequencies"]["wide_bandwidth"], + shell: + r""" + augur frequencies \ + --method kde \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.strain_id} \ + --min-date {params.min_date} \ + --max-date {params.max_date} \ + --narrow-bandwidth {params.narrow_bandwidth} \ + --wide-bandwidth {params.wide_bandwidth} \ + --output {output.tip_freq} + """ From c291cdd63f642d0ad52d263d9827a92325fba403 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 12 Dec 2024 16:07:01 -0800 Subject: [PATCH 4/9] Tweak color-by -- re-order, add author --- phylogenetic/defaults/auspice_config.json | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff 
--git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json index e32e6c3..970340c 100644 --- a/phylogenetic/defaults/auspice_config.json +++ b/phylogenetic/defaults/auspice_config.json @@ -12,19 +12,24 @@ ], "build_url": "https://github.com/nextstrain/seasonal-cov", "colorings": [ - { + { "key": "gt", "title": "Genotype", "type": "categorical" }, + { + "key": "region", + "title": "Region", + "type": "categorical" + }, { "key": "country", "title": "Country", "type": "categorical" }, { - "key": "region", - "title": "Region", + "key": "author", + "title": "Author", "type": "categorical" } ], From 122a2093ce3cdf090e6743e626368bddfcd2ec2d Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 12 Dec 2024 16:07:22 -0800 Subject: [PATCH 5/9] Missing backslash --- phylogenetic/rules/construct_phylogeny.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk index fe5400a..da004f6 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -58,6 +58,6 @@ rule refine: --coalescent {params.coalescent:q} \ --date-confidence \ --date-inference {params.date_inference:q} \ - --clock-filter-iqd {params.clock_filter_iqd:q} + --clock-filter-iqd {params.clock_filter_iqd:q} \ &> {log:q} """ From 17faa789067283d1350714c96bfb062046c63e60 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Thu, 12 Dec 2024 16:08:08 -0800 Subject: [PATCH 6/9] Drop HKU1 strains from Chow et al MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With these strains included, the HKU1 tree roots badly — these fall in between the two effective clades in the tree, causing the root to not be properly positioned in between them. 
--- .../defaults/hku1/dropped_strains.txt | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/phylogenetic/defaults/hku1/dropped_strains.txt b/phylogenetic/defaults/hku1/dropped_strains.txt index e69de29..cec821a 100644 --- a/phylogenetic/defaults/hku1/dropped_strains.txt +++ b/phylogenetic/defaults/hku1/dropped_strains.txt @@ -0,0 +1,32 @@ +ON136168.1 # Chow et al - excluding these strains improves rooting of the tree +ON136169.1 # Chow et al - excluding these strains improves rooting of the tree +ON136170.1 # Chow et al - excluding these strains improves rooting of the tree +ON136171.1 # Chow et al - excluding these strains improves rooting of the tree +ON136172.1 # Chow et al - excluding these strains improves rooting of the tree +ON136173.1 # Chow et al - excluding these strains improves rooting of the tree +ON461755.1 # Chow et al - excluding these strains improves rooting of the tree +ON461756.1 # Chow et al - excluding these strains improves rooting of the tree +ON461757.1 # Chow et al - excluding these strains improves rooting of the tree +ON461758.1 # Chow et al - excluding these strains improves rooting of the tree +ON461759.1 # Chow et al - excluding these strains improves rooting of the tree +ON461760.1 # Chow et al - excluding these strains improves rooting of the tree +ON461761.1 # Chow et al - excluding these strains improves rooting of the tree +ON461762.1 # Chow et al - excluding these strains improves rooting of the tree +ON461763.1 # Chow et al - excluding these strains improves rooting of the tree +ON461764.1 # Chow et al - excluding these strains improves rooting of the tree +ON461765.1 # Chow et al - excluding these strains improves rooting of the tree +ON461766.1 # Chow et al - excluding these strains improves rooting of the tree +ON461768.1 # Chow et al - excluding these strains improves rooting of the tree +ON461769.1 # Chow et al - excluding these strains improves rooting of the tree +ON461770.1 # Chow et al - excluding 
these strains improves rooting of the tree +ON461771.1 # Chow et al - excluding these strains improves rooting of the tree +ON461772.1 # Chow et al - excluding these strains improves rooting of the tree +ON461773.1 # Chow et al - excluding these strains improves rooting of the tree +ON461774.1 # Chow et al - excluding these strains improves rooting of the tree +ON461775.1 # Chow et al - excluding these strains improves rooting of the tree +ON461776.1 # Chow et al - excluding these strains improves rooting of the tree +ON461777.1 # Chow et al - excluding these strains improves rooting of the tree +ON461778.1 # Chow et al - excluding these strains improves rooting of the tree +ON461779.1 # Chow et al - excluding these strains improves rooting of the tree +ON461780.1 # Chow et al - excluding these strains improves rooting of the tree +ON461781.1 # Chow et al - excluding these strains improves rooting of the tree From cb5eb3672967d5a266829a9b87ceacccfdfb3d6d Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Wed, 18 Dec 2024 10:54:34 -0800 Subject: [PATCH 7/9] Exclude lower quality HKU1 sequences This is all sequences (as of ~15 Dec 2024) with >6k Ns or <80% coverage. This was determined by exporting the Nextstrain tree over to Nextclade, dropping the entire Nextstrain sequences.fasta on the Nextclade tree, then filtering/sorting samples by both N count and coverage percentage. 
--- phylogenetic/defaults/hku1/dropped_strains.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/phylogenetic/defaults/hku1/dropped_strains.txt b/phylogenetic/defaults/hku1/dropped_strains.txt index cec821a..e288f2a 100644 --- a/phylogenetic/defaults/hku1/dropped_strains.txt +++ b/phylogenetic/defaults/hku1/dropped_strains.txt @@ -30,3 +30,12 @@ ON461778.1 # Chow et al - excluding these strains improves rooting of the tree ON461779.1 # Chow et al - excluding these strains improves rooting of the tree ON461780.1 # Chow et al - excluding these strains improves rooting of the tree ON461781.1 # Chow et al - excluding these strains improves rooting of the tree + +# exclude sequences with >6000Ns and/or <80% coverage +OY757708.1 # 27.8k Ns, 8.3% coverage +OY757859.1 # 26.6k Ns, 10.7% coverage +KF430196.1 # 8446 Ns, 70.4% coverage +KF850450.2 # 8105 Ns, 71.7% coverage +ON461767.1 # 7302 Ns, 75.0% coverage +OY757702.1 # 6770 Ns, 77.3% coverage +MW587043.1 # 6072 Ns, 78.9% coverage From 22619be755eef72f47d69f10cc0165c8926cca21 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Wed, 18 Dec 2024 10:56:15 -0800 Subject: [PATCH 8/9] Drop a couple NL63 sequences As suggested by Trevor. --- phylogenetic/defaults/nl63/dropped_strains.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/phylogenetic/defaults/nl63/dropped_strains.txt b/phylogenetic/defaults/nl63/dropped_strains.txt index e69de29..059b89f 100644 --- a/phylogenetic/defaults/nl63/dropped_strains.txt +++ b/phylogenetic/defaults/nl63/dropped_strains.txt @@ -0,0 +1,2 @@ +PQ037243.1 +PQ037240.1 From a60dae7777352443036218cd2cf6a1541878eaba Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Wed, 18 Dec 2024 11:09:50 -0800 Subject: [PATCH 9/9] Update TSV handling per best practices [#42] Verified that metadata file outputs are unchanged by this revision. 
--- ingest/rules/curate.smk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index cb8649c..3d7439e 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -134,8 +134,9 @@ rule subset_metadata: metadata_fields=",".join(config["curate"]["metadata_columns"]), shell: r""" - tsv-select -H -f {params.metadata_fields:q} \ - {input.metadata:q} \ + csv2tsv --csv-delim $'\t' {input.metadata:q} \ + | tsv-select -H -f {params.metadata_fields:q} \ + | csvtk fix-quotes --tabs \ > {output.subset_metadata:q} \ 2> {log:q} """