NCBI Dataset field name transformations

Originally the field map was created to keep mpox NDJSON backward compatible with field names used from NCBI Virus. However, this constraint is not applicable to dengue.¹

This commit organizes field renaming into two parts:

1. Rename the NCBI output columns to match the NCBI mnemonics² (see "ncbi_field_map:" in `config/config.yaml`).
2. Where necessary, rename the NCBI mnemonics to match Nextstrain expected column names³ (see "transform: field_map:" in `config/config.yaml`).

¹ #13 (comment)
² https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
³ https://docs.nextstrain.org/projects/ncov/en/latest/reference/metadata-fields.html
nextstrain · Dec 5, 2023 · 2684e24 · 2684e24
1 parent 94b0113
commit 2684e24
Showing 3 changed files with 55 additions and 69 deletions.
diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -2,22 +2,52 @@
 sources: ['genbank']
 # Pathogen NCBI Taxonomy ID
 ncbi_taxon_id: '12637'
-# Renames the NCBI dataset headers
-ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
+# The list of NCBI Datasets fields to include from NCBI Datasets output
+# These need to be the mnemonics of the NCBI Datasets fields; see the docs for the full list of fields:
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - submitter-names
+  - submitter-affiliation
 
 # Params for the transform rule
 transform:
-  # Fields to rename.
+  # NCBI Fields to rename to Nextstrain field names.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map: [
+    'accession=genbank_accession',
+    'accession-rev=genbank_accession_rev',
+    'isolate-lineage=strain',
+    'sourcedb=database', # necessary for applying geo location rules
+    'geo-region=region',
+    'geo-location=location',
+    'host-name=host',
+    'isolate-collection-date=date',
+    'release-date=release_date',
+    'update-date=update_date',
+    'sra-accs=sra_accessions',    
+    'submitter-names=authors',
+    'submitter-affiliation=institution',
+  ]
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['accession']
+  strain_backup_fields: ['genbank_accession']
   # List of date fields to standardize
-  date_fields: ['date', 'date_released']
+  date_fields: ['date', 'release_date', 'update_date']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -47,24 +77,26 @@ transform:
   # User annotations file
   annotations: 'source-data/annotations.tsv'
   # ID field used to merge annotations
-  annotations_id: 'accession'
+  annotations_id: 'genbank_accession'
   # Field to use as the sequence ID in the FASTA file
-  id_field: 'accession'
+  id_field: 'genbank_accession'
   # Field to use as the sequence in the FASTA file
   sequence_field: 'sequence'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
-    'genbank_accession_rev',
     'strain',
+    'genbank_accession',
+    'genbank_accession_rev',
     'date',
     'region',
     'country',
     'division',
     'location',
+    'length',
     'host',
-    'date_released',
-    'sra_accession',
+    'release_date',
+    'update_date',
+    'sra_accessions',
     'abbr_authors',
     'authors',
     'institution'

diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -44,57 +44,26 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "bioprojects",
-        "biosample-acc",
-        "sra-accs",
-        "submitter-names",
-        "submitter-affiliation",
-    ]
-    return ",".join(fields)
-
-
 rule format_ncbi_dataset_report:
-    # Formats the headers to be the same as before we used NCBI Datasets
-    # The only fields we do not have equivalents for are "title" and "publications"
+    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
-        ncbi_field_map=config["ncbi_field_map"],
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics,
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
-            | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
-            | tsv-select -H -f genbank_accession --rest last \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
+            | csvtk rename -t -f accession -n accession-rev \
+            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -105,6 +74,8 @@ rule format_ncbi_datasets_ndjson:
         ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
     output:
         ndjson="data/genbank.ndjson",
+    params:
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     log:
         "logs/format_ncbi_datasets_ndjson.txt",
     benchmark:
@@ -114,7 +85,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column genbank_accession_rev \
+            --seq-id-column accession-rev \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \