NCBI Dataset field name transformations

Originally, the field map was created to keep the mpox NDJSON backward-compatible with the field names used by NCBI Virus. However, this constraint does not apply to dengue.

This commit organizes field renaming into two parts:

1. Rename the NCBI output columns to match the NCBI mnemonics (see `source-data/ncbi-dataset-field-map.tsv`).
2. Where necessary, rename the NCBI mnemonics to match the column names Nextstrain expects (see "transform: field_map:" in `config/config.yaml`).

For context and discussion, see #13 (comment).
nextstrain · Nov 9, 2023 · d9e5654 · d9e5654
1 parent 94b0113
commit d9e5654
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 26 deletions.
diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -10,14 +10,28 @@ transform:
   # Fields to rename.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map: [
+    'accession=genbank_accession',
+    'accession-rev=genbank_accession_rev',
+    'isolate-lineage=strain',
+    'sourcedb=database', # necessary for applying geo location rules
+    'geo-region=region',
+    'geo-location=location',
+    'host-name=host',
+    'isolate-collection-date=date',
+    'release-date=release_date',
+    'update-date=update_date',
+    'sra-accs=sra_accessions',    
+    'submitter-names=authors',
+    'submitter-affiliation=institution',
+  ]
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
   strain_backup_fields: ['accession']
   # List of date fields to standardize
-  date_fields: ['date', 'date_released']
+  date_fields: ['date', 'release_date', 'update_date']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -54,17 +68,19 @@ transform:
   sequence_field: 'sequence'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
-    'genbank_accession_rev',
     'strain',
+    'genbank_accession',
+    'genbank_accession_rev',
     'date',
     'region',
     'country',
     'division',
     'location',
+    'length',
     'host',
-    'date_released',
-    'sra_accession',
+    'release_date',
+    'update_date',
+    'sra_accessions',
     'abbr_authors',
     'authors',
     'institution'

diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
@@ -1,17 +1,17 @@
+# Maps the NCBI output TSV column names back to the NCBI mnemonics.
+# This list should stay in sync with the field mnemonics listed in
+# `_get_ncbi_dataset_field_mnemonics` in ingest/workflow/snakemake_rules/fetch_sequences.smk
 key	value
-Accession	genbank_accession_rev
-Source database	database
-Isolate Lineage	strain
-Geographic Region	region
-Geographic Location	location
-Isolate Collection date	collected
-Release date	released
-Update date	updated
+Accession	accession-rev
+Source database	sourcedb
+Isolate Lineage	isolate-lineage
+Geographic Region	geo-region
+Geographic Location	geo-location
+Isolate Collection date	isolate-collection-date
+Release date	release-date
+Update date	update-date
 Length	length
-Host Name	host
-Isolate Lineage source	isolation_source
-BioProjects	bioproject_accession
-BioSample accession	biosample_accession
-SRA Accessions	sra_accession
-Submitter Names	authors
-Submitter Affiliation	submitting_organization
+Host Name	host-name
+SRA Accessions	sra-accs
+Submitter Names	submitter-names
+Submitter Affiliation	submitter-affiliation
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -76,8 +76,7 @@ def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
 
 
 rule format_ncbi_dataset_report:
-    # Formats the headers to be the same as before we used NCBI Datasets
-    # The only fields we do not have equivalents for are "title" and "publications"
+    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
         ncbi_field_map=config["ncbi_field_map"],
@@ -93,8 +92,8 @@ rule format_ncbi_dataset_report:
             --package {input.dataset_package} \
             --fields {params.fields_to_include:q} \
             | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
-            | tsv-select -H -f genbank_accession --rest last \
+            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -114,7 +113,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column genbank_accession_rev \
+            --seq-id-column accession-rev \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \