diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
index 0d3b1aa..57e554b 100644
--- a/ingest/defaults/config.yaml
+++ b/ingest/defaults/config.yaml
@@ -45,7 +45,7 @@ curate:
     Geographic Location: location
     Isolate Collection date: date
     Host Name: host
-    Submitter Names: authors
+    Submitter Names: full_authors
     Submitter Affiliation: institution
     SRA Accessions: sra_accession
 
@@ -93,13 +93,13 @@ curate:
     fields: ["region", "country", "division", "location"]
 
   # Metadata field that contains the list of authors associated with the sequence
-  authors_field: "authors"
+  authors_field: "full_authors"
 
   # Default value to use if the authors field is empty
   authors_default_value: "?"
 
   # Name to use for the generated abbreviated authors field
-  abbr_authors_field: "abbr_authors"
+  abbr_authors_field: "authors"
 
   # The ID field in the metadata to use to merge the manual annotations
   annotations_id: "strain"
@@ -119,6 +119,6 @@ curate:
     - country
     - host
     - sra_accession
-    - abbr_authors
+    - full_authors
     - authors
     - institution
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
index cb8649c..3d7439e 100644
--- a/ingest/rules/curate.smk
+++ b/ingest/rules/curate.smk
@@ -134,8 +134,9 @@ rule subset_metadata:
         metadata_fields=",".join(config["curate"]["metadata_columns"]),
     shell:
         r"""
-        tsv-select -H -f {params.metadata_fields:q} \
-            {input.metadata:q} \
+        csv2tsv --csv-delim $'\t' {input.metadata:q} \
+            | tsv-select -H -f {params.metadata_fields:q} \
+            | csvtk fix-quotes --tabs \
             > {output.subset_metadata:q} \
             2> {log:q}
         """
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
index c4909ba..0930239 100644
--- a/phylogenetic/Snakefile
+++ b/phylogenetic/Snakefile
@@ -4,7 +4,8 @@ configfile: "defaults/config.yaml"
 
 rule all:
     input:
-        expand("auspice/seasonal-cov_{virus}.json", virus=config["viruses"]),
+        auspice_json=expand("auspice/seasonal-cov_{virus}.json", virus=config["viruses"]),
+        tip_frequencies_json=expand("auspice/seasonal-cov_{virus}_tip-frequencies.json", virus=config["viruses"]),
 
 
 include: "rules/prepare_sequences.smk"
diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json
index 6d933c5..970340c 100644
--- a/phylogenetic/defaults/auspice_config.json
+++ b/phylogenetic/defaults/auspice_config.json
@@ -12,19 +12,24 @@
   ],
   "build_url": "https://github.com/nextstrain/seasonal-cov",
   "colorings": [
-     {
+    {
       "key": "gt",
       "title": "Genotype",
       "type": "categorical"
     },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
     {
       "key": "country",
       "title": "Country",
       "type": "categorical"
     },
     {
-      "key": "region",
-      "title": "Region",
+      "key": "author",
+      "title": "Author",
       "type": "categorical"
     }
   ],
@@ -38,11 +43,16 @@
   },
   "filters": [
     "country",
-    "region"
+    "region",
+    "author"
   ],
   "panels": [
     "tree",
     "map",
-    "entropy"
+    "entropy",
+    "frequencies"
+  ],
+  "metadata_columns": [
+    "author"
   ]
 }
diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml
index 4e0b6b7..3b210f0 100644
--- a/phylogenetic/defaults/config.yaml
+++ b/phylogenetic/defaults/config.yaml
@@ -7,6 +7,18 @@ viruses:
   - "oc43"
   - "hku1"
 
+# Metadata column holding the strain identifier; the tip_frequencies rule
+# passes it to `augur frequencies --metadata-id-columns`
+strain_id_field: "strain"
+
+# Arguments the tip_frequencies rule passes to `augur frequencies --method kde`
+tip_frequencies:
+  min_date: "2017-01-01"
+  # Relative date, kept as a quoted string so it reaches augur unchanged
+  max_date: "6M"
+  narrow_bandwidth: 0.2
+  wide_bandwidth: 0.6
+
 # virus-specific information
 # For each virus, provide the following parameter values. All paths
 # should be relative to the phylogenetic directory.
@@ -38,8 +50,8 @@ viruses:
     subsample_max_sequences: 4000
     min_length: 20000
   construct_phylogeny:
-    clock_rate: ""
-    clock_std_dev: ""
+    clock_rate: 0.000250
+    clock_std_dev: 0.00010
     coalescent: "opt"
     date_inference: "marginal"
     clock_filter_iqd: 4
@@ -68,8 +80,8 @@ oc43:
     subsample_max_sequences: 4000
     min_length: 20000
   construct_phylogeny:
-    clock_rate: ""
-    clock_std_dev: ""
+    clock_rate: 0.000250
+    clock_std_dev: 0.00010
     coalescent: "opt"
     date_inference: "marginal"
     clock_filter_iqd: 4
diff --git a/phylogenetic/defaults/hku1/dropped_strains.txt b/phylogenetic/defaults/hku1/dropped_strains.txt
index e69de29..e288f2a 100644
--- a/phylogenetic/defaults/hku1/dropped_strains.txt
+++ b/phylogenetic/defaults/hku1/dropped_strains.txt
@@ -0,0 +1,41 @@
+ON136168.1 # Chow et al - excluding these strains improves rooting of the tree
+ON136169.1 # Chow et al - excluding these strains improves rooting of the tree
+ON136170.1 # Chow et al - excluding these strains improves rooting of the tree
+ON136171.1 # Chow et al - excluding these strains improves rooting of the tree
+ON136172.1 # Chow et al - excluding these strains improves rooting of the tree
+ON136173.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461755.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461756.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461757.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461758.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461759.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461760.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461761.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461762.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461763.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461764.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461765.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461766.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461768.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461769.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461770.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461771.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461772.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461773.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461774.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461775.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461776.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461777.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461778.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461779.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461780.1 # Chow et al - excluding these strains improves rooting of the tree
+ON461781.1 # Chow et al - excluding these strains improves rooting of the tree
+
+# exclude sequences with >6000Ns and/or <80% coverage
+OY757708.1 # 27.8k Ns, 8.3% coverage
+OY757859.1 # 26.6k Ns, 10.7% coverage
+KF430196.1 # 8446 Ns, 70.4% coverage
+KF850450.2 # 8105 Ns, 71.7% coverage
+ON461767.1 # 7302 Ns, 75.0% coverage
+OY757702.1 # 6770 Ns, 77.3% coverage
+MW587043.1 # 6072 Ns, 78.9% coverage
diff --git a/phylogenetic/defaults/nl63/dropped_strains.txt b/phylogenetic/defaults/nl63/dropped_strains.txt
index e69de29..059b89f 100644
--- a/phylogenetic/defaults/nl63/dropped_strains.txt
+++ b/phylogenetic/defaults/nl63/dropped_strains.txt
@@ -0,0 +1,2 @@
+PQ037243.1
+PQ037240.1
diff --git a/phylogenetic/rules/construct_phylogeny.smk b/phylogenetic/rules/construct_phylogeny.smk
index fba9328..da004f6 100644
--- a/phylogenetic/rules/construct_phylogeny.smk
+++ b/phylogenetic/rules/construct_phylogeny.smk
@@ -45,31 +45,19 @@ rule refine:
         date_inference=lambda wildcards: config[wildcards.virus]["construct_phylogeny"]["date_inference"],
         clock_filter_iqd=lambda wildcards: config[wildcards.virus]["construct_phylogeny"]["clock_filter_iqd"],
     shell:
-        # TODO move this conditional logic up into the params lambda (?)
         r"""
-        (
-            if [ "{wildcards.virus}" == "229e" ] || [ "{wildcards.virus}" == "oc43" ]; then
-                echo "Estimating clock rate for {wildcards.virus}"
-                clock_rate=""
-                clock_std_dev=""
-            else
-                echo "Setting clock rate at {params.clock_rate} with std dev {params.clock_std_dev} for {wildcards.virus}"
-                clock_rate="--clock-rate {params.clock_rate}"
-                clock_std_dev="--clock-std-dev {params.clock_std_dev}"
-            fi
-
-            augur refine \
-                --tree {input.tree:q} \
-                --alignment {input.alignment:q} \
-                --metadata {input.metadata:q} \
-                --output-tree {output.tree:q} \
-                --output-node-data {output.node_data:q} \
-                --timetree \
-                $clock_rate \
-                $clock_std_dev \
-                --coalescent {params.coalescent:q} \
-                --date-confidence \
-                --date-inference {params.date_inference:q} \
-                --clock-filter-iqd {params.clock_filter_iqd:q}
-        ) 2>{log:q}
+        augur refine \
+            --tree {input.tree:q} \
+            --alignment {input.alignment:q} \
+            --metadata {input.metadata:q} \
+            --output-tree {output.tree:q} \
+            --output-node-data {output.node_data:q} \
+            --timetree \
+            --clock-rate {params.clock_rate} \
+            --clock-std-dev {params.clock_std_dev} \
+            --coalescent {params.coalescent:q} \
+            --date-confidence \
+            --date-inference {params.date_inference:q} \
+            --clock-filter-iqd {params.clock_filter_iqd:q} \
+            &> {log:q}
         """
diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk
index 8b29266..8cde332 100644
--- a/phylogenetic/rules/export.smk
+++ b/phylogenetic/rules/export.smk
@@ -37,3 +37,37 @@ rule export:
             --output {output.auspice_json:q} \
             2>{log:q}
         """
+
+rule tip_frequencies:
+    """
+    Estimating KDE frequencies for tips
+
+    Produces the per-virus tip-frequencies JSON that `rule all` requests.
+    """
+    input:
+        tree="results/{virus}/tree.nwk",
+        metadata="data/{virus}/metadata.tsv",
+    output:
+        tip_freq="auspice/seasonal-cov_{virus}_tip-frequencies.json",
+    log:
+        "logs/{virus}/tip_frequencies.txt",
+    params:
+        strain_id=config["strain_id_field"],
+        min_date=config["tip_frequencies"]["min_date"],
+        max_date=config["tip_frequencies"]["max_date"],
+        narrow_bandwidth=config["tip_frequencies"]["narrow_bandwidth"],
+        wide_bandwidth=config["tip_frequencies"]["wide_bandwidth"],
+    shell:
+        r"""
+        augur frequencies \
+            --method kde \
+            --tree {input.tree:q} \
+            --metadata {input.metadata:q} \
+            --metadata-id-columns {params.strain_id:q} \
+            --min-date {params.min_date:q} \
+            --max-date {params.max_date:q} \
+            --narrow-bandwidth {params.narrow_bandwidth:q} \
+            --wide-bandwidth {params.wide_bandwidth:q} \
+            --output {output.tip_freq:q} \
+            2>{log:q}
+        """