-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split by serotype using NCBI virus_tax_id #20
- Loading branch information
Showing
5 changed files
with
103 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
#! /usr/bin/env python3 | ||
|
||
import argparse | ||
import json | ||
from sys import stdin, stdout | ||
|
||
def parse_args():
    """Parse the command-line options of the serotype annotation script.

    Two string options are defined, each naming a column of the records:
    ``--virus-tax-id`` (default ``virus_tax_id``) and ``--out-col``
    (default ``ncbi_serotype``).

    Returns:
        The ``argparse.Namespace`` with ``virus_tax_id`` and ``out_col``.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Dengue specific processing of metadata, infer serotype from virus_tax_id",
    )

    # (flag, default column name, help text) for every column-name option.
    column_options = (
        (
            "--virus-tax-id",
            "virus_tax_id",
            "Column name containing the NCBI taxon id of the virus serotype.",
        ),
        (
            "--out-col",
            "ncbi_serotype",
            "Column name to store the inferred serotype.",
        ),
    )
    for flag, default_column, help_text in column_options:
        cli.add_argument(flag, type=str, default=default_column, help=help_text)

    return cli.parse_args()
|
||
|
||
def _get_dengue_serotype(record, col="virus_tax_id"): | ||
"""Set dengue serotype from virus_tax_id""" | ||
dengue_types = { | ||
"11053": "denv1", | ||
"11060": "denv2", | ||
"11069": "denv3", | ||
"11070": "denv4", | ||
"31634": "denv2", # Dengue virus 2 Thailand/16681/84 | ||
} | ||
|
||
taxon_id = record[col] | ||
|
||
return dengue_types.get(taxon_id, "") | ||
|
||
|
||
def main():
    """Add the inferred serotype to every JSON record read from stdin.

    Each input line is decoded as one JSON object, the serotype returned by
    ``_get_dengue_serotype`` is stored under the ``--out-col`` key, and the
    record is written back to stdout as a single JSON line.
    """
    options = parse_args()
    taxon_field = options.virus_tax_id
    serotype_field = options.out_col

    for line in stdin:
        entry = json.loads(line)
        entry[serotype_field] = _get_dengue_serotype(entry, col=taxon_field)
        stdout.write(f"{json.dumps(entry)}\n")


# Run the stdin -> stdout filter only when executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
"""
This part of the workflow handles splitting the data by serotype, based either on the
NCBI metadata or on a Nextclade dataset. Both could be used to cross-validate if necessary.
Only the NCBI-based split (using the `ncbi_serotype` metadata column) is implemented here.

It expects the following inputs:

    metadata = "results/metadata_all.tsv"
    sequences = "results/sequences_all.fasta"

It produces the following output files, one pair per serotype:

    metadata_{serotype} = "results/metadata_{serotype}.tsv"
    sequences_{serotype} = "results/sequences_{serotype}.fasta"

Parameters are expected to be defined in `config.transform`.
"""
|
||
rule split_by_ncbi_serotype:
    """
    Split the data by serotype based on the NCBI metadata.

    Keeps only the records whose `ncbi_serotype` metadata column equals the
    requested `{serotype}` wildcard and writes the matching metadata and
    sequences to per-serotype files.
    """
    input:
        metadata = "results/metadata_all.tsv",
        sequences = "results/sequences_all.fasta"
    output:
        metadata = "results/metadata_{serotype}.tsv",
        sequences = "results/sequences_{serotype}.fasta"
    wildcard_constraints:
        # Without a constraint `{serotype}` also matches "all", which makes this
        # rule a candidate producer of its own inputs (results/metadata_all.tsv).
        # Restrict it to the serotype labels this rule can actually select.
        serotype = "denv[1-4]"
    params:
        # Metadata column(s) identifying a record, taken from config.transform.
        id_field = config["transform"]["id_field"]
    shell:
        """
        augur filter \
            --sequences {input.sequences} \
            --metadata {input.metadata} \
            --metadata-id-columns {params.id_field} \
            --query "ncbi_serotype=='{wildcards.serotype}'" \
            --output-sequences {output.sequences} \
            --output-metadata {output.metadata}
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters