From 870c9386a90fc89b210d6120260cef64421747d6 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 21 Nov 2023 13:32:25 -0800 Subject: [PATCH] Add rules for merging USVI data with NCBI GenBank ingested data. The original Zika build contained USVI data that had been posted publicly to GitHub but not yet submitted to NCBI GenBank. This commit adds rules to merge the USVI data with the NCBI GenBank data. Since USVI does not have a genbank_accession column, we create a new accession column for both USVI and NCBI GenBank accessions. This accession column is then used as the strain_id column for the phylogenetic build. Since auspice automagically generates an NCBI GenBank url for "genbank_accession" fields, we use a "url" field instead, allowing for a mix of GenBank and GitHub urls to be used in the strain popup window. --- phylogenetic/Snakefile | 14 +- phylogenetic/config/config_zika.yaml | 2 +- phylogenetic/example_data/metadata.tsv | 70 ++++----- phylogenetic/example_data/metadata_usvi.tsv | 2 + .../example_data/sequences_usvi.fasta | 137 ++++++++++++++++++ phylogenetic/scripts/set_final_strain_name.py | 1 - .../workflow/snakemake_rules/usvi.smk | 52 +++++++ 7 files changed, 235 insertions(+), 43 deletions(-) create mode 100644 phylogenetic/example_data/metadata_usvi.tsv create mode 100644 phylogenetic/example_data/sequences_usvi.fasta create mode 100644 phylogenetic/workflow/snakemake_rules/usvi.smk diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile index 15ff6a9..ddd301d 100644 --- a/phylogenetic/Snakefile +++ b/phylogenetic/Snakefile @@ -16,6 +16,8 @@ rule files: files = rules.files.params +include: "workflow/snakemake_rules/usvi.smk" + rule download: """Downloading sequences and metadata from data.nextstrain.org""" output: @@ -53,8 +55,8 @@ rule filter: - minimum genome length of {params.min_length} (50% of Zika virus genome) """ input: - sequences = "data/sequences.fasta", - metadata = "data/metadata.tsv", + sequences = "data/sequences_all.fasta", 
+ metadata = "data/metadata_all.tsv", exclude = files.dropped_strains output: sequences = "results/filtered.fasta" @@ -122,7 +124,7 @@ rule refine: input: tree = "results/tree_raw.nwk", alignment = "results/aligned.fasta", - metadata = "data/metadata.tsv" + metadata = "data/metadata_all.tsv" output: tree = "results/tree.nwk", node_data = "results/branch_lengths.json" @@ -189,7 +191,7 @@ rule traits: """ input: tree = "results/tree.nwk", - metadata = "data/metadata.tsv" + metadata = "data/metadata_all.tsv" output: node_data = "results/traits.json", params: @@ -212,7 +214,7 @@ rule export: """Exporting data files for for auspice""" input: tree = "results/tree.nwk", - metadata = "data/metadata.tsv", + metadata = "data/metadata_all.tsv", branch_lengths = "results/branch_lengths.json", traits = "results/traits.json", nt_muts = "results/nt_muts.json", @@ -242,7 +244,7 @@ rule export: rule final_strain_name: input: auspice_json="results/raw_zika.json", - metadata="data/metadata.tsv", + metadata="data/metadata_all.tsv", root_sequence="results/raw_zika_root-sequence.json", output: auspice_json="auspice/zika.json", diff --git a/phylogenetic/config/config_zika.yaml b/phylogenetic/config/config_zika.yaml index 5345584..fa4e134 100644 --- a/phylogenetic/config/config_zika.yaml +++ b/phylogenetic/config/config_zika.yaml @@ -1,2 +1,2 @@ -strain_id_field: "genbank_accession" +strain_id_field: "accession" display_strain_field: "strain" \ No newline at end of file diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv index 6e5345c..3d39cf9 100644 --- a/phylogenetic/example_data/metadata.tsv +++ b/phylogenetic/example_data/metadata.tsv @@ -1,35 +1,35 @@ -strain virus genbank_accession date region country division city db segment authors url -PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al https://www.ncbi.nlm.nih.gov/nuccore/KX156774 -COL/FLR_00024/2015 zika MF574569 2015-12-XX 
South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574569 -PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501215 -COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574562 -Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al https://www.ncbi.nlm.nih.gov/nuccore/KY317939 -ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank genome Wu et al https://www.ncbi.nlm.nih.gov/nuccore/KX253996 -VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al https://www.ncbi.nlm.nih.gov/nuccore/KX702400 -DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785425 -BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785433 -DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785420 -EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al https://www.ncbi.nlm.nih.gov/nuccore/KX879603 -HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785418 -DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785484 -DOM/2016/BB_0433 zika KY785441 2016-06-13 North 
America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al https://www.ncbi.nlm.nih.gov/nuccore/KY785441 -USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075935 -SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241697 -SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241744 -SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241726 -USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325473 -Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY075937 -SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al https://www.ncbi.nlm.nih.gov/nuccore/KY241688 -USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al https://www.ncbi.nlm.nih.gov/nuccore/KY325478 -COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al https://www.ncbi.nlm.nih.gov/nuccore/MF574578 -Thailand/1610acTw zika MF692778 2016-10-XX Southeast Asia Thailand Thailand Thailand genbank genome Lin et al https://www.ncbi.nlm.nih.gov/nuccore/MF692778 -1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447509 -1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447519 -1_0181_PF zika 
KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al https://www.ncbi.nlm.nih.gov/nuccore/KX447512 -Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558995 -Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558989 -Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558991 -V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al https://www.ncbi.nlm.nih.gov/nuccore/KU501217 -Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al https://www.ncbi.nlm.nih.gov/nuccore/KX421195 -Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al https://www.ncbi.nlm.nih.gov/nuccore/KY558997 -SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al https://www.ncbi.nlm.nih.gov/nuccore/KX266255 +strain virus genbank_accession date region country division city db segment authors +PAN/CDC_259359_V1_V3/2015 zika KX156774 2015-12-18 North America Panama Panama Panama genbank genome Shabman et al +COL/FLR_00024/2015 zika MF574569 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al +PRVABC59 zika KU501215 2015-12-XX North America Puerto Rico Puerto Rico Puerto Rico genbank genome Lanciotti et al +COL/FLR_00008/2015 zika MF574562 2015-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al +Colombia/2016/ZC204Se zika KY317939 2016-01-06 South America Colombia Colombia Colombia genbank genome Quick et al +ZKC2/2016 zika KX253996 2016-02-16 Oceania American Samoa American Samoa American Samoa genbank 
genome Wu et al +VEN/UF_1/2016 zika KX702400 2016-03-25 South America Venezuela Venezuela Venezuela genbank genome Blohm et al +DOM/2016/BB_0059 zika KY785425 2016-04-04 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +BRA/2016/FC_6706 zika KY785433 2016-04-08 South America Brazil Brazil Brazil genbank genome Metsky et al +DOM/2016/BB_0183 zika KY785420 2016-04-18 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +EcEs062_16 zika KX879603 2016-04-XX South America Ecuador Ecuador Ecuador genbank genome Marquez et al +HND/2016/HU_ME59 zika KY785418 2016-05-13 North America Honduras Honduras Honduras genbank genome Metsky et al +DOM/2016/MA_WGS16_011 zika KY785484 2016-06-06 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +DOM/2016/BB_0433 zika KY785441 2016-06-13 North America Dominican Republic Dominican Republic Dominican Republic genbank genome Metsky et al +USA/2016/FL022 zika KY075935 2016-07-22 North America Usa Usa Usa genbank genome Grubaugh et al +SG_027 zika KY241697 2016-08-27 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +SG_074 zika KY241744 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +SG_056 zika KY241726 2016-08-28 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +USA/2016/FLUR022 zika KY325473 2016-08-31 North America Usa Usa Usa genbank genome Grubaugh et al +Aedes_aegypti/USA/2016/FL05 zika KY075937 2016-09-09 North America Usa Usa Usa genbank genome Grubaugh et al +SG_018 zika KY241688 2016-09-13 Southeast Asia Singapore Singapore Singapore genbank genome Ho et al +USA/2016/FLWB042 zika KY325478 2016-09-26 North America Usa Usa Usa genbank genome Grubaugh et al +COL/PRV_00028/2015 zika MF574578 2016-12-XX South America Colombia Colombia Colombia genbank genome Pickett et al +Thailand/1610acTw zika MF692778 2016-10-XX 
Southeast Asia Thailand Thailand Thailand genbank genome Lin et al +1_0087_PF zika KX447509 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al +1_0199_PF zika KX447519 2013-11-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al +1_0181_PF zika KX447512 2013-12-XX Oceania French Polynesia French Polynesia French Polynesia genbank genome Pettersson et al +Brazil/2015/ZBRC301 zika KY558995 2015-05-13 South America Brazil Brazil Brazil genbank genome Faria et al +Brazil/2015/ZBRA105 zika KY558989 2015-02-23 South America Brazil Brazil Brazil genbank genome Faria et al +Brazil/2016/ZBRC16 zika KY558991 2016-01-19 South America Brazil Brazil Brazil genbank genome Faria et al +V8375 zika KU501217 2015-11-01 North America Guatemala Guatemala Guatemala genbank genome Lanciotti et al +Nica1_16 zika KX421195 2016-01-19 North America Nicaragua Nicaragua Nicaragua genbank genome Tabata et al +Brazil/2015/ZBRC303 zika KY558997 2015-05-14 South America Brazil Brazil Brazil genbank genome Faria et al +SMGC_1 zika KX266255 2016-02-14 Oceania American Samoa American Samoa American Samoa genbank genome Bi et al diff --git a/phylogenetic/example_data/metadata_usvi.tsv b/phylogenetic/example_data/metadata_usvi.tsv new file mode 100644 index 0000000..96d3d52 --- /dev/null +++ b/phylogenetic/example_data/metadata_usvi.tsv @@ -0,0 +1,2 @@ +genbank_accession genbank_accession_rev accession strain date region country division location length host release_date update_date sra_accessions authors institution url +USVI/37/2016 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/ diff --git a/phylogenetic/example_data/sequences_usvi.fasta b/phylogenetic/example_data/sequences_usvi.fasta new file mode 100644 index 0000000..d677bfc --- /dev/null +++ b/phylogenetic/example_data/sequences_usvi.fasta @@ -0,0 +1,137 @@ 
+>VI37 +nnnnnnnnnnnnnnnnnnnnnnnnnnnngacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattt +tggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaa +acgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatca +ggatggtcttggcgattctagcctttttgagattcacggcaatcaagccatcactgggcctcatcaatagatggggttca +gtggggaaaaaagaggctatggaaacaataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgctag +gaaggagaagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctgaccacagctatggcagcggagg +tcactagacgtgggagtgcatactatatgtacttggacagaaacgatgctggggaggccatatcttttccaaccacattg +gggatgaataagtgttatatacagatcatggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatgct +ggatgagggggtggaaccagatgacgtcgattgttggtgcaacacgacgtcaacttgggttgtgtacggaacctgccatc +acaaaaaaggtgaagcacggagatctagaagagctgtgacgctcccctcccattccaccaggaagctgcaaacgcggtcg +caaacctggttggaatcaagagaatacacaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttcgc +gttagcagcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcatatacttggtcatgatactgctga +ttgccccggcatacagcatcaggtgcataggagtcagcaatagggactttgtggaaggtatgtcaggtgggacttgggtt +gatgttgtcttggaacatggaggttgtgtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttacaac +aacagtcagcaacatggcggaggtaagatcctactgctatgaggcatcaatatcagacatggcttctgacagccgctgcc +caacacaaggtgaagcctaccttgacaagcaatcagacactcaatatgtctgcaaaagaacgttagtggacagaggctgg +ggaaatggatgtggactttttggcaaagggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccgggaa +gagcatccagccagagaatctggagtaccggataatgctgtcagttcatggctcccagcacagtgggatgatcgttaatg +acacaggacatgaaactgatgagaatagagcgaaagttgagataacgcccaattcaccgagagccgaagccaccctgggg +ggttttggaagcctaggacttgattgtgaaccgaggacaggccttgacttttcagatttgtattacttgactatgaataa +caagcactggttggttcacaaggagtggttccacgacattccattaccttggcacgctggggcagacaccggaactccac +actggaacaacaaagaagcactggtagagttcaaggacgcacatgccaaaaggcaaactgtcgtggttctagggagtcaa +gaaggagcagttcacacggcccttgctggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggcca +cttgaaatgtcgcctgaaaatggataaacttagattgaagggcgtgtcatactccttgtgtactgcagcgttcacattca 
+ccaagatcccggctgaaacactgcacgggacagtcacagtggaggtacagtacgcagggacagatggaccttgcaaggtt +ccagctcagatggcggtggacatgcaaactctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaaag +cactgagaactctaagatgatgctggaacttgatccaccatttggggactcttacattgtcataggagtcggggagaaga +agatcacccaccactggcacaggagtggcagcaccattggaaaagcatttgaagccactgtgagaggtgccaagagaatg +gcagtcttgggagacacagcctgggactttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaaat +ttttggagcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctcattggaacgttgctgatgtggt +tgggtctgaacacaaagaatggatctatttcccttatgtgcttggccttagggggagtgttgatcttcttatccacagcc +gtctctgctgatgtggggtgctcggtggacttctcaaagaaggagacgagatgcggtacaggggtgttcgtctataacga +cgttgaagcctggagggacaggtacaagtaccatcctgactccccccgtagattggcagcagcagttaagcaagcctggg +aagatggtatctgcgggatctcctctgtttcaagaatggaaaacatcatgtggagatcagtagaaggggagctcaacgca +atcctggaagagaatggagttcaactgacggtcgttgtgggatctgtaaaaaaccccatgtggagaggtccacagagatt +gcccgtgcctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttcgtcagagcagcaaagacaaata +acagctttgtcgtggatggtgacacactgaaggaatgcccactcaaacatagagcatggaacagctttcttgtggaggat +catgggttcggggtatttcacactagtgtctggctcaaggttagagaagattattcattagagtgtgatccagccgttat +tggaacagctgttaagggaaaggaggctgtacacagtgatctaggctactggattgagagtgagaagaatgacacatgga +ggctggagagggcccatctgatcgagatgaaaacatgtgaatggccaaagtcccacacattgtggacagatggaatagaa +gagagtgatctgatcatacccaagtctttagctgggccactcagccatcacaataccagagagggctacaggacccaaat +gaaagggccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcactaaggtccacgtggaggaaacat +gtggaacaagaggaccatctctgagatcaaccactgcaagcggaagggtgatcgaggaatggtgctgcagggagtgcaca +atgcccccactgtcgttccgggctaaagatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagcaa +cttagtaaggtcaatggtgactgcaggatcaactgatcacatggaccacttctcccttggagtgcttgtgatcctgctca +tggtgcaggaagggctgaagaagagaatgaccacaaagatcatcataagcacatcaatggcagtgctggtagctatgatc +ctgggaggattttcaatgagtgacctggctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactggagg +agatgtagctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggtatctttcatcttcagagctaatt 
+ggacaccccgtgaaagcatgctgctggccttggcctcgtgtcttttgcaaactgcgatctccgccttggaaggcgacctg +atggttctcatcaatggttttgctttggcctggttggcaatacgagcgatggttgttccacgcactgataacatcacctt +ggcaatcctggctgctctgacaccactggcccggggcacactgcttgtggcgtggagagcaggccttgctacttgcgggg +ggtttatgctcctctctctgaagggaaaaggcagtgtgaagaagaacttaccatttgtcatggccctgggactaaccgct +gtgaggctggtcgaccccatcaacgtggtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagcga +agtactcacagctgttggcctgatatgcgcattggctggagggttcgccaaggcagatatagagatggctgggcccatgg +ccgcggtcggtctgctaattgtcagttacgtggtctcaggaaagagtgtggacatgtacattgaaagagcaggtgacatc +acatgggaaaaagatgcggaagtcactggaaacagtccccggctcgatgtggcgctagatgagagtggtgatttctccct +ggtggaggatgacggtccccccatgagagagatcatactcaaggtggtcctgatgaccatctgtggcatgaacccaatag +ccataccctttgcagctggagcgtggtacgtatacgtgaagactggaaaaaggagtggtgctctatgggatgtgcctgct +cccaaggaagtaaaaaagggggagaccacagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaagt +tggagtgggagttatgcaagagggggtctttcacactatgtggcacgtcacaaaaggatccgcgctgagaagcggtgaag +ggagacttgatccatactggggagatgtcaagcaggatctggtgtcatactgtggtccatggaagctagatgccgcctgg +gatgggcacagcgaggtgcagctcttggccgtgccccccggagagagagcgaggaacatccagactctgcccggaatatt +taagacaaaggatggggacattggagcggttgcgctggattacccagcaggaacttcaggatctccaatcctagacaagt +gtgggagagtgataggactttatggcaatggggtcgtgatcaaaaacgggagttatgttagtgccatcacccaagggagg +agggaggaagagactcctgttgagtgcttcgagccctcgatgctgaagaagaagcagctaactgtcttagacttgcatcc +tggagctgggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaacaagactccgtactgtgatcttag +ctccaaccagggttgtcgctgctgaaatggaggaggcccttagagggcttccagtgcgttatatgacaacagcagtcaat +gtcacccactctggaacagaaatcgtcgacttaatgtgccatgccaccttcacttcacgtctactacagccaatcagagt +ccccaactataatctgtatattatggatgaggcccacttcacagatccctcaagtatagcagcaagaggatacatttcaa +caagggttgagatgggcgaggcggctgccatcttcatgaccgccacgccaccaggaacccgtgacgcatttccggactcc +aactcaccaattatggacaccgaagtggaagtcccagagagagcctggagctcaggctttgattgggtgacggatcattc +tggaaaaacagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgtctgacaaaggctggaaaacggg 
+tcatacagctcagcagaaagacttttgagacagagttccagaaaacaaaacatcaagagtgggactttgtcgtgacaact +gacatttcagagatgggcgccaactttaaagctgaccgtgtcatagattccaggagatgcctaaagccggtcatacttga +tggcgagagagtcattctggctggacccatgcctgtcacacatgccagcgctgcccagaggagggggcgcataggcagga +atcccaacaaacctggagatgagtatctgtatggaggtgggtgcgcagagactgacgaagaccatgcacactggcttgaa +gcaagaatgctccttgacaatatttacctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagtagc +agccattgagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactcatgaaaagaggagatcttcctg +tttggctggcctatcaggttgcatctgccggaataacctacacagatagaagatggtgctttgatggcacgaccaacaac +accataatggaagacagtgtgccggcagaggtgtggaccagacacggagagaaaagagtgctcaaaccgaggtggatgga +cgccagagtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctgggaaaagaggagcggcttttggag +tgatggaagccctgggaacactgccaggacacatgacnnagagattccaggaagcnattgacaacctcgctgtgctcatg +cgngcagagactggaagcaggccttacaaagccgcggcggcccaattgccggagaccctagagaccataatgcntttggg +gttgctgggaacagtctcgctgggaatcttcttcgtcttgatgaggaacaagggcatagggaagatgggctttggaatgg +tgactcttggggccagcgcatggctcatgtggctctcggaaattgagccagccagaattgcatgtgtcctcattgttgtg +ttcctattgctggtggtgctcatacctgagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatcat +ggtagcagtaggtcttttgggcttgattaccgccaatgaactcggatggttggagagaacaaagagtgacctaagccatc +taatgggaaggagagaggagggggcaaccataggattctcaatggacattgacctgcggccagcctcagcttgggccatc +tatgctgccttgacaactttcattaccccagccgtccaacatgcagtgaccacctcatacaacaactactccttaatggc +gatggccacgcaagctggagtgttgtttggcatgggcaaagggatgccattctacgcatgggactttggagtcccgctgc +taatgataggttgctactcacaattaacacccctgaccctaatagtggccatcattttgctcgtggcgcactacatgtac +ttgatcccagggctgcaggcagcagctgcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgttgt +ggatggaatagtggtgactgacattgacacaatgacaattgacccccaagtggagaaaaagatgggacaggtgctactca +tagcagtggccgtctccagcgccatactgtcgcggaccgcctgggggtggggggaggctggggctctgatcacagccgca +acttccactttgtgggaaggctctccgaacaagtactggaactcctctacagccacttcactgtgtaacatttttagggg +aagttacttggctggagcttctctaatctacacagtaacaagaaacgctggcttggtcaagagacgtgggggtggaacag 
+gagagaccctgggagagaaatggaaggcccgcttgaaccagatgtcggccctggagttctactcctacaaaaagtcaggc +atcaccgaggtgtgcagagaagaggcccgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccgagg +aagtgcaaagctgagatggttggtggagcggggatacctgcagccctatggaaaggtcattgatcttggatgtggcagag +ggggctggagttactacgccgccaccatccgcaaagttcaagaagtgaaaggatacacaaaaggaggccctggtcatgaa +gaacccgtgttggtgcaaagctatgggtggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggctga +gccgtgtgacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaagaagcacggacgctcagagtcc +tctccatggtgggggattggcttgaaaaaagaccaggagccttttgtataaaagtgttgtgcccatacaccagcactatg +atggaaaccctggagcgactgcagcgtaggtatgggggaggactggtcagagtgccactctcccgcaactctacacatga +gatgtactgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagccagctcctcttggggcgcatgg +acgggcctaggaggccagtgaaatatgaggaggatgtgaatctcggctctggcacgcgggctgtggtaagctgcgctgaa +gctcccaacatgaagatcattggtaaccgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgagaa +ccacccatataggacatgggcttaccatggaagctatgaggcccccacacaagggtcagcgtcctctctaataaacgggg +ttgtcaggctcctgtcaaaaccctgggatgtggtgactggagtcacaggaatagccatgaccgacaccacaccgtatggt +cagcaaagagttttcaaggaaaaagtggacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatggt +ctcttcctggttgtggaaagagctaggcaaacacaaacggccacgagtctgcaccaaagaagagttcatcaacaaggttc +gtagcaatgcagcattaggggcaatatttgaggaggaaaaagagtggaagactgcagtggaagctgtgaacgatccaagg +ttctgggctctagtggacaaggaaagagagcaccacctgagaggagagtgccagagctgtgtgtacaacatgatgggaaa +aagagaaaagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtatatgtggctaggggctagatttc +tagagttcgaagcccttggattcttgaacgaggatcactggatggggagagagaactcaggaggtggtgttgaagggctg +ggattacaaagactcggatatgtcctagaagagatgagtcgtataccaggaggaaggatgtatgcagatgacactgctgg +ctgggacacccgcattagcaggtttgatctggagaatgaagctctaatcaccaaccaaatggagaaagggcacagggcct +tggcattggccataatcaagtacacataccaaaacaaagtggtaaaggtccttagaccagctgaaaaagggaaaacagtt +atggacattatttcgagacaagaccaaagggggagcggacaagttgtcacttacgctcttaacacatttaccaacctagt +ggtgcaactcattcggaatatggaggctgaggaagttctagagatgcaagacttgtggctgctgcggaggtcagagaaag 
+tgaccaactggttgcagagcaacggatgggataggctcaaacgaatggcagtcagtggagatgattgcgttgtgaagcca +attgatgataggtttgcacatgccctcaggttcttgaatgatatgggaaaagttaggaaggacacacaagagtggaaacc +ctcaactggatgggacaactgggaagaagttccgttttgctcccaccacttcaacaagctccatctcaaggacgggaggt +ccattgtggttccctgccgccaccaagatgaactgattggtcgggcccgcgtctctccaggggcgggatggagcatccgg +gagactgcttgcctagcaaaatcatatgcgcaaatgtggcagctcctttatttccacagaagggacctccgactgatggc +caatgccatttgttcatctgtgccagttgactgggttccaactgggagaactacctggtcaatccatggaaagggagaat +ggatgaccactgaagacatgcttgtggtgtggaacagagtgtggatnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn +nnnnnnn diff --git a/phylogenetic/scripts/set_final_strain_name.py b/phylogenetic/scripts/set_final_strain_name.py index c670f44..d104ca1 100644 --- a/phylogenetic/scripts/set_final_strain_name.py +++ b/phylogenetic/scripts/set_final_strain_name.py @@ -6,7 +6,6 @@ def replace_name_recursive(node, lookup, saveoldcolumn): if node["name"] in lookup: if saveoldcolumn == "accession": node["node_attrs"][saveoldcolumn] = node["name"] - node["node_attrs"]["url"] = "https://www.ncbi.nlm.nih.gov/nuccore/" + node["name"] elif saveoldcolumn == "genbank_accession": node["node_attrs"][saveoldcolumn] = {} node["node_attrs"][saveoldcolumn]["value"] = node["name"] diff --git a/phylogenetic/workflow/snakemake_rules/usvi.smk b/phylogenetic/workflow/snakemake_rules/usvi.smk new 
file mode 100644 index 0000000..5ae8b3d --- /dev/null +++ b/phylogenetic/workflow/snakemake_rules/usvi.smk @@ -0,0 +1,52 @@ +rule download_usvi: + """Download zstd-compressed USVI sequences and metadata from data.nextstrain.org""" + output: + sequences = "data/sequences_usvi.fasta.zst", + metadata = "data/metadata_usvi.tsv.zst" + params: + sequences_url = "https://data.nextstrain.org/files/zika/sequences_usvi.fasta.zst", + metadata_url = "https://data.nextstrain.org/files/zika/metadata_usvi.tsv.zst" + shell: + """ + curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences} + curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata} + """ + +rule decompress_usvi: + """Decompress the downloaded USVI sequences and metadata with zstd""" + input: + sequences = "data/sequences_usvi.fasta.zst", + metadata = "data/metadata_usvi.tsv.zst" + output: + sequences = "data/sequences_usvi.fasta", + metadata = "data/metadata_usvi.tsv" + shell: + """ + zstd -d -c {input.sequences} > {output.sequences} + zstd -d -c {input.metadata} > {output.metadata} + """ + +rule append_usvi: + """Concatenate USVI sequences onto data/sequences.fasta, and add url and accession columns (from genbank_accession) to data/metadata.tsv before concatenating the USVI metadata""" + input: + sequences = "data/sequences.fasta", + metadata = "data/metadata.tsv", + usvi_sequences = "data/sequences_usvi.fasta", + usvi_metadata = "data/metadata_usvi.tsv" + output: + sequences = "data/sequences_all.fasta", + metadata = "data/metadata_all.tsv" + shell: + """ + cat {input.sequences} {input.usvi_sequences} > {output.sequences} + + csvtk mutate2 -tl \ + -n url \ + -e '"https://www.ncbi.nlm.nih.gov/nuccore/" + $genbank_accession' \ + {input.metadata} \ + | csvtk mutate2 -tl \ + -n accession \ + -e '$genbank_accession' \ + | csvtk concat -tl - {input.usvi_metadata} \ + > {output.metadata} + """ \ No newline at end of file