From 9f3123d9f8fe63ed8afe6566bc90c3312d3fd533 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 29 Nov 2023 16:27:42 -0800 Subject: [PATCH] fixup: NCBI Dataset field name transformations --- ingest/config/config.yaml | 23 +++++++++- .../snakemake_rules/fetch_sequences.smk | 43 +++---------------- 2 files changed, 26 insertions(+), 40 deletions(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index fdf5cb4..927bd75 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -2,8 +2,27 @@ sources: ['genbank'] # Pathogen NCBI Taxonomy ID ncbi_taxon_id: '64320' -# Renames the NCBI dataset headers -ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv' +# The list of NCBI Datasets fields to include from NCBI Datasets output +# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields +# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields +# Note: the "accession" field MUST be provided to match with the sequences +ncbi_datasets_fields: + - accession + - sourcedb + - sra-accs + - isolate-lineage + - geo-region + - geo-location + - isolate-collection-date + - release-date + - update-date + - length + - host-name + - isolate-lineage-source + - biosample-acc + - submitter-names + - submitter-affiliation + - submitter-country # Params for the transform rule transform: diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index 8d27193..5d42d76 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -44,56 +44,23 @@ rule extract_ncbi_dataset_sequences: """ -def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: - """ - Return list of NCBI Dataset report field mnemonics for fields that we want - to parse out of the dataset report. 
The column names in the output TSV - are different from the mnemonics. - - See NCBI Dataset docs for full list of available fields and their column - names in the output: - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields - """ - fields = [ - "accession", - "sourcedb", - "isolate-lineage", - "geo-region", - "geo-location", - "isolate-collection-date", - "release-date", - "update-date", - "length", - "host-name", - "isolate-lineage-source", - "bioprojects", - "biosample-acc", - "sra-accs", - "submitter-names", - "submitter-affiliation", - ] - return ",".join(fields) - - rule format_ncbi_dataset_report: # Formats the headers to match the NCBI mnemonic names input: dataset_package="data/ncbi_dataset.zip", - ncbi_field_map=config["ncbi_field_map"], output: ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"), params: - fields_to_include=_get_ncbi_dataset_field_mnemonics, + ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]), benchmark: "benchmarks/format_ncbi_dataset_report.txt" shell: """ dataformat tsv virus-genome \ --package {input.dataset_package} \ - --fields {params.fields_to_include:q} \ - | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ - | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \ - | tsv-select -H -f accession --rest last \ + --fields {params.ncbi_datasets_fields:q} \ + --elide-header \ + | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \ > {output.ncbi_dataset_tsv} """ @@ -113,7 +80,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column accession-rev \ + --seq-id-column accession \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \