diff --git a/ingest/Snakefile b/ingest/Snakefile index bfc99b0d..f8dc8833 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -11,10 +11,12 @@ if not config: send_slack_notifications = config.get("send_slack_notifications", False) +serotypes = ['all', 'denv1', 'denv2', 'denv3', 'denv4'] def _get_all_targets(wildcards): # Default targets are the metadata TSV and sequences FASTA files - all_targets = ["results/sequences.fasta", "results/metadata.tsv"] + all_targets = expand(["results/sequences_{serotype}.fasta", "results/metadata_{serotype}.tsv"], serotype=serotypes) + # Add additional targets based on upload config upload_config = config.get("upload", {}) @@ -57,6 +59,7 @@ rule all: include: "workflow/snakemake_rules/fetch_sequences.smk" include: "workflow/snakemake_rules/transform.smk" +include: "workflow/snakemake_rules/split_serotypes.smk" if config.get("upload", False): diff --git a/ingest/bin/infer-dengue-serotype.py b/ingest/bin/infer-dengue-serotype.py new file mode 100755 index 00000000..fe32996d --- /dev/null +++ b/ingest/bin/infer-dengue-serotype.py @@ -0,0 +1,52 @@ +#! /usr/bin/env python3 + +import argparse +import json +from sys import stdin, stdout + +def parse_args(): + parser = argparse.ArgumentParser( + description="Dengue specific processing of metadata, infer serotype from virus_tax_id", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--virus-tax-id", + type=str, + default="virus_tax_id", + help="Column name containing the NCBI taxon id of the virus serotype.", + ) + parser.add_argument( + "--out-col", + type=str, + default="ncbi_serotype", + help="Column name to store the inferred serotype.", + ) + return parser.parse_args() + + +def _get_dengue_serotype(record, col="virus_tax_id"): + """Set dengue serotype from virus_tax_id""" + dengue_types = { + "11053": "denv1", + "11060": "denv2", + "11069": "denv3", + "11070": "denv4", + "31634": "denv2", # Dengue virus 2 Thailand/16681/84 + } + + taxon_id = record[col] + + return dengue_types.get(taxon_id, "") + + +def main(): + args = parse_args() + + for record in stdin: + record = json.loads(record) + record[args.out_col] = _get_dengue_serotype(record, col=args.virus_tax_id) + stdout.write(json.dumps(record) + "\n") + + +if __name__ == "__main__": + main() diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index 231a5bb2..3f08a9ed 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -15,6 +15,8 @@ ncbi_datasets_fields: - isolate-collection-date - release-date - update-date + - virus-tax-id + - virus-name - length - host-name - isolate-lineage-source @@ -37,6 +39,8 @@ transform: 'isolate-collection-date=date', 'release-date=release_date', 'update-date=update_date', + 'virus-tax-id=virus_tax_id', + 'virus-name=virus_name', 'sra-accs=sra_accessions', 'submitter-names=authors', 'submitter-affiliation=institution', @@ -96,6 +100,7 @@ transform: 'host', 'release_date', 'update_date', + 'ncbi_serotype', # inferred from virus_tax_id 'sra_accessions', 'abbr_authors', 'authors', diff --git a/ingest/workflow/snakemake_rules/split_serotypes.smk b/ingest/workflow/snakemake_rules/split_serotypes.smk new file mode 100644 index 00000000..e3c92ef7 --- /dev/null +++ b/ingest/workflow/snakemake_rules/split_serotypes.smk @@ -0,0 +1,37 @@ +""" +This part of the workflow handles splitting the data by serotype either based on the +NCBI metadata or Nextclade dataset. Could use both if necessary to cross-validate. + + metadata = "results/metadata_all.tsv" + sequences = "results/sequences_all.fasta" + +This will produce output files as + + metadata_{serotype} = "results/metadata_{serotype}.tsv" + sequences_{serotype} = "results/sequences_{serotype}.fasta" + +Parameters are expected to be defined in `config.transform`. +""" + +rule split_by_ncbi_serotype: + """ + Split the data by serotype based on the NCBI metadata. + """ + input: + metadata = "results/metadata_all.tsv", + sequences = "results/sequences_all.fasta" + output: + metadata = "results/metadata_{serotype}.tsv", + sequences = "results/sequences_{serotype}.fasta" + params: + id_field = config["transform"]["id_field"] + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --metadata-id-columns {params.id_field} \ + --query "ncbi_serotype=='{wildcards.serotype}'" \ + --output-sequences {output.sequences} \ + --output-metadata {output.metadata} + """ diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk index ec63d00f..2af0b883 100644 --- a/ingest/workflow/snakemake_rules/transform.smk +++ b/ingest/workflow/snakemake_rules/transform.smk @@ -6,8 +6,8 @@ formats and expects input file This will produce output files as - metadata = "results/metadata.tsv" - sequences = "results/sequences.fasta" + metadata = "results/metadata_all.tsv" + sequences = "results/sequences_all.fasta" Parameters are expected to be defined in `config.transform`. """ @@ -42,8 +42,8 @@ rule transform: all_geolocation_rules="data/all-geolocation-rules.tsv", annotations=config["transform"]["annotations"], output: - metadata="results/metadata.tsv", - sequences="results/sequences.fasta", + metadata="results/metadata_all.tsv", + sequences="results/sequences_all.fasta", log: "logs/transform.txt", params: @@ -85,6 +85,7 @@ rule transform: --abbr-authors-field {params.abbr_authors_field} \ | ./vendored/apply-geolocation-rules \ --geolocation-rules {input.all_geolocation_rules} \ + | ./bin/infer-dengue-serotype.py \ | ./vendored/merge-user-metadata \ --annotations {input.annotations} \ --id-field {params.annotations_id} \