From f56f4ba09f7b4e8dcecda6a0cc4199d6fff92fb7 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Tue, 8 Oct 2024 15:12:10 -0700 Subject: [PATCH] vidrl_upload/read_flat_vidrl: add column map to script The column map will be more complicated with the need to ingest two slightly different flat files (_flat_file.csv and _reference_panel.csv) as discussed in https://github.com/nextstrain/fauna/issues/161#issuecomment-2398089391. I also found myself constantly toggling back and forth between the separate column_map.tsv and the upload script to figure out how the columns are being used, so it makes more sense to just hard-code the column map in the script. --- source-data/vidrl_flat_file_column_map.tsv | 6 ------ tdb/vidrl_upload.py | 20 ++++++++++---------- 2 files changed, 10 insertions(+), 16 deletions(-) delete mode 100644 source-data/vidrl_flat_file_column_map.tsv diff --git a/source-data/vidrl_flat_file_column_map.tsv b/source-data/vidrl_flat_file_column_map.tsv deleted file mode 100644 index 384ad4c..0000000 --- a/source-data/vidrl_flat_file_column_map.tsv +++ /dev/null @@ -1,6 +0,0 @@ -virus virus_strain -virus.passage virus_passage -antisera.passage serum_passage -ferret serum_id -value titer -antisera.name serum_strain diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index d8a854d..f1078a5 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -54,15 +54,6 @@ } } -def parse_tsv_mapping_to_dict(tsv_file): - map_dict = {} - with open(tsv_file, 'r') as f: - for line in f: - (key, value) = line.split('\t') - key = key.lower() - map_dict[key] = value.rstrip('\n') - return map_dict - def parse_human_serum_references(human_serum_data, subtype): """ @@ -320,7 +311,16 @@ def read_flat_vidrl(path, fstem, assay_type): Read the flat CSV file with *fstem* in the provided *path* and convert to the expected TSV file at `data/tmp/.tsv` for tdb/elife_upload. """ - column_map = parse_tsv_mapping_to_dict("source-data/vidrl_flat_file_column_map.tsv") + # The new column names need to be one of the ELIFE_COLUMNS in order to be + # included in the temporary output file that's then passed to elife_upload.py + column_map = { + "virus": "virus_strain", + "virus.passage": "virus_passage", + "antisera.passage": "serum_passage", + "ferret": "serum_id", + "value": "titer", + "antisera.name": "serum_strain" + } filepath = path + fstem + ".csv" titer_measurements = pd.read_csv(filepath, usecols=column_map.keys()) \