fixup: NCBI Dataset field name transformations

nextstrain · Nov 30, 2023 · 9f3123d · 9f3123d
1 parent 870c938
commit 9f3123d
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 40 deletions.
diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -2,8 +2,27 @@
 sources: ['genbank']
 # Pathogen NCBI Taxonomy ID
 ncbi_taxon_id: '64320'
-# Renames the NCBI dataset headers
-ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
+# The list of NCBI Datasets fields to include from NCBI Datasets output
+# These need to be the mneumonics of the NCBI Datasets fields, see docs for full list of fields
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - sra-accs
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - biosample-acc
+  - submitter-names
+  - submitter-affiliation
+  - submitter-country
 
 # Params for the transform rule
 transform:

diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -44,56 +44,23 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "bioprojects",
-        "biosample-acc",
-        "sra-accs",
-        "submitter-names",
-        "submitter-affiliation",
-    ]
-    return ",".join(fields)
-
-
 rule format_ncbi_dataset_report:
     # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
-        ncbi_field_map=config["ncbi_field_map"],
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics,
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
-            | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
-            | tsv-select -H -f accession --rest last \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -n {params.ncbi_datasets_fields:q} \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -113,7 +80,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column accession-rev \
+            --seq-id-column accession \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \