From f56f4ba09f7b4e8dcecda6a0cc4199d6fff92fb7 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Tue, 8 Oct 2024 15:12:10 -0700
Subject: [PATCH] vidrl_upload/read_flat_vidrl: add column map to script

The column map will be more complicated with the need to ingest two
slightly different flat files (_flat_file.csv and _reference_panel.csv)
as discussed in https://github.com/nextstrain/fauna/issues/161#issuecomment-2398089391.

I also found myself constantly toggling back and forth between the
separate column_map.tsv and the upload script to figure out how the
columns are being used, so it makes more sense to just hard-code the
column map in the script.
---
 source-data/vidrl_flat_file_column_map.tsv |  6 ------
 tdb/vidrl_upload.py                        | 20 ++++++++++----------
 2 files changed, 10 insertions(+), 16 deletions(-)
 delete mode 100644 source-data/vidrl_flat_file_column_map.tsv
diff --git a/source-data/vidrl_flat_file_column_map.tsv b/source-data/vidrl_flat_file_column_map.tsv
deleted file mode 100644
index 384ad4c..0000000
--- a/source-data/vidrl_flat_file_column_map.tsv
+++ /dev/null
@@ -1,6 +0,0 @@
-virus	virus_strain
-virus.passage	virus_passage
-antisera.passage	serum_passage
-ferret	serum_id
-value	titer
-antisera.name	serum_strain
diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index d8a854d..f1078a5 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -54,15 +54,6 @@
     }
 }
 
-def parse_tsv_mapping_to_dict(tsv_file):
-    map_dict = {}
-    with open(tsv_file, 'r') as f:
-        for line in f:
-            (key, value) = line.split('\t')
-            key = key.lower()
-            map_dict[key] = value.rstrip('\n')
-    return map_dict
-
 
 def parse_human_serum_references(human_serum_data, subtype):
     """
@@ -320,7 +311,16 @@ def read_flat_vidrl(path, fstem, assay_type):
     Read the flat CSV file with *fstem* in the provided *path* and convert
     to the expected TSV file at `data/tmp/<fstem>.tsv` for tdb/elife_upload.
     """
-    column_map = parse_tsv_mapping_to_dict("source-data/vidrl_flat_file_column_map.tsv")
+    # Each new column name (dict value) must be one of the ELIFE_COLUMNS to be
+    # included in the temporary output file that's then passed to elife_upload.py
+    column_map = {
+        "virus": "virus_strain",
+        "virus.passage": "virus_passage",
+        "antisera.passage": "serum_passage",
+        "ferret": "serum_id",
+        "value": "titer",
+        "antisera.name": "serum_strain"
+    }
     filepath = path + fstem + ".csv"
 
     titer_measurements = pd.read_csv(filepath, usecols=column_map.keys()) \