From 2684e248edd33a735163a25bb3f695ba0791f968 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 8 Nov 2023 15:53:54 -0800
Subject: [PATCH] NCBI Dataset field name transformations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Originally the field map was created to keep mpox NDJSON backward compatible
with field names used from NCBI Virus. However, this constraint is not
applicable to dengue.¹

This commit organizes field renaming into two parts.

1. Rename the NCBI output columns to match the NCBI mnemonics²
   (see "ncbi_datasets_fields:" in `config/config.yaml`)
2. Where necessary, rename the NCBI mnemonics to match Nextstrain expected column names³
   (see "transform: field_map:" in `config/config.yaml`)

¹ https://github.com/nextstrain/dengue/pull/13#discussion_r1374892802
² https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
³ https://docs.nextstrain.org/projects/ncov/en/latest/reference/metadata-fields.html
---
 ingest/config/config.yaml                     | 56 +++++++++++++++----
 ingest/source-data/ncbi-dataset-field-map.tsv | 17 ------
 .../snakemake_rules/fetch_sequences.smk       | 51 ++++-------------
 3 files changed, 55 insertions(+), 69 deletions(-)
 delete mode 100644 ingest/source-data/ncbi-dataset-field-map.tsv

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index f2dc1abf..231a5bb2 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -2,22 +2,52 @@
 sources: ['genbank']
 # Pathogen NCBI Taxonomy ID
 ncbi_taxon_id: '12637'
-# Renames the NCBI dataset headers
-ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
+# The list of NCBI Datasets fields to include from NCBI Datasets output
+# These need to be the mnemonics of the NCBI Datasets fields, see docs for full list of fields
+# https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
+# Note: the "accession" field MUST be provided to match with the sequences
+ncbi_datasets_fields:
+  - accession
+  - sourcedb
+  - isolate-lineage
+  - geo-region
+  - geo-location
+  - isolate-collection-date
+  - release-date
+  - update-date
+  - length
+  - host-name
+  - isolate-lineage-source
+  - submitter-names
+  - submitter-affiliation
 
 # Params for the transform rule
 transform:
-  # Fields to rename.
+  # NCBI Fields to rename to Nextstrain field names.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map: [
+    'accession=genbank_accession',
+    'accession-rev=genbank_accession_rev',
+    'isolate-lineage=strain',
+    'sourcedb=database', # necessary for applying geo location rules
+    'geo-region=region',
+    'geo-location=location',
+    'host-name=host',
+    'isolate-collection-date=date',
+    'release-date=release_date',
+    'update-date=update_date',
+    'sra-accs=sra_accessions',    
+    'submitter-names=authors',
+    'submitter-affiliation=institution',
+  ]
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['accession']
+  strain_backup_fields: ['genbank_accession']
   # List of date fields to standardize
-  date_fields: ['date', 'date_released']
+  date_fields: ['date', 'release_date', 'update_date']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -47,24 +77,26 @@ transform:
   # User annotations file
   annotations: 'source-data/annotations.tsv'
   # ID field used to merge annotations
-  annotations_id: 'accession'
+  annotations_id: 'genbank_accession'
   # Field to use as the sequence ID in the FASTA file
-  id_field: 'accession'
+  id_field: 'genbank_accession'
   # Field to use as the sequence in the FASTA file
   sequence_field: 'sequence'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
-    'genbank_accession_rev',
     'strain',
+    'genbank_accession',
+    'genbank_accession_rev',
     'date',
     'region',
     'country',
     'division',
     'location',
+    'length',
     'host',
-    'date_released',
-    'sra_accession',
+    'release_date',
+    'update_date',
+    'sra_accessions',
     'abbr_authors',
     'authors',
     'institution'
diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
deleted file mode 100644
index 00c9db88..00000000
--- a/ingest/source-data/ncbi-dataset-field-map.tsv
+++ /dev/null
@@ -1,17 +0,0 @@
-key	value
-Accession	genbank_accession_rev
-Source database	database
-Isolate Lineage	strain
-Geographic Region	region
-Geographic Location	location
-Isolate Collection date	collected
-Release date	released
-Update date	updated
-Length	length
-Host Name	host
-Isolate Lineage source	isolation_source
-BioProjects	bioproject_accession
-BioSample accession	biosample_accession
-SRA Accessions	sra_accession
-Submitter Names	authors
-Submitter Affiliation	submitting_organization
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index 3f32f9b4..efd9fbb7 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -44,57 +44,26 @@ rule extract_ncbi_dataset_sequences:
         """
 
 
-def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
-    """
-    Return list of NCBI Dataset report field mnemonics for fields that we want
-    to parse out of the dataset report. The column names in the output TSV
-    are different from the mnemonics.
-
-    See NCBI Dataset docs for full list of available fields and their column
-    names in the output:
-    https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
-    """
-    fields = [
-        "accession",
-        "sourcedb",
-        "isolate-lineage",
-        "geo-region",
-        "geo-location",
-        "isolate-collection-date",
-        "release-date",
-        "update-date",
-        "length",
-        "host-name",
-        "isolate-lineage-source",
-        "bioprojects",
-        "biosample-acc",
-        "sra-accs",
-        "submitter-names",
-        "submitter-affiliation",
-    ]
-    return ",".join(fields)
-
-
 rule format_ncbi_dataset_report:
-    # Formats the headers to be the same as before we used NCBI Datasets
-    # The only fields we do not have equivalents for are "title" and "publications"
+    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
-        ncbi_field_map=config["ncbi_field_map"],
     output:
         ncbi_dataset_tsv=temp("data/ncbi_dataset_report.tsv"),
     params:
-        fields_to_include=_get_ncbi_dataset_field_mnemonics,
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     benchmark:
         "benchmarks/format_ncbi_dataset_report.txt"
     shell:
         """
         dataformat tsv virus-genome \
             --package {input.dataset_package} \
-            --fields {params.fields_to_include:q} \
-            | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
-            | tsv-select -H -f genbank_accession --rest last \
+            --fields {params.ncbi_datasets_fields:q} \
+            --elide-header \
+            | csvtk add-header -t -l -n {params.ncbi_datasets_fields:q} \
+            | csvtk rename -t -f accession -n accession-rev \
+            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -105,6 +74,8 @@ rule format_ncbi_datasets_ndjson:
         ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
     output:
         ndjson="data/genbank.ndjson",
+    params:
+        ncbi_datasets_fields=",".join(config["ncbi_datasets_fields"]),
     log:
         "logs/format_ncbi_datasets_ndjson.txt",
     benchmark:
@@ -114,7 +85,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column genbank_accession_rev \
+            --seq-id-column accession-rev \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \