From 5b053e8b8e78336e01347e234935bf8432c1bdec Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 26 Aug 2024 14:52:01 -0700
Subject: [PATCH] vidrl_upload: Support ingest of human pooled sera references
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use a hard-coded VACCINE_MAPPING to keep track of Southern Hemisphere
vaccine strains, which are used to map human pooled sera references to
specific serum strains. This commit only includes the 2024 vaccines from
<https://www.who.int/publications/m/item/recommended-composition-of-influenza-virus-vaccines-for-use-in-the-2024-southern-hemisphere-influenza-season>
I will update the vaccine mapping as I backfill more data.

I had originally tried to use a4c4336607fa0f247a66cedc923bb9ad32e4473d +
5e72c59b1ee4f80cc98e193dd74b1677e77b8551 to parse the "clade" row for
the extra egg/cell info, but that pattern matching fails because of
excess clade rows in the Excel sheet.¹ Instead, I've opted to always
include the row directly below the serum rows as extra info for the
human serum data in `find_serum_rows`.

¹ <https://github.com/nextstrain/fauna/pull/160#discussion_r1731978551>
---
 tdb/titer_block.py  |   5 +-
 tdb/vidrl_upload.py | 121 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 116 insertions(+), 10 deletions(-)

diff --git a/tdb/titer_block.py b/tdb/titer_block.py
index e607c6e..5a1ca58 100644
--- a/tdb/titer_block.py
+++ b/tdb/titer_block.py
@@ -284,11 +284,14 @@ def find_serum_rows(worksheet, titer_coords, virus_names=None, serum_id_pattern=
             # Ignore human serum (e.g. "SH2002", "sera", "SHVAX2002")
             if re.search(ignore_serum_pattern, cell_value):
                 if log_human_sera:
+                    max_row = max(serum_abbrev_row_idx, serum_id_row_idx, serum_passage_row_idx)
                     human_serum_data.append({
                         "col_idx": col_idx,
                         "serum_abbrev": cell_value,
                         "serum_id": str(worksheet.cell_value(serum_id_row_idx, col_idx)),
-                        "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx))
+                        "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx)),
+                        # Sometimes egg/cell distinction is stored in a separate row
+                        "extra_info": str(worksheet.cell_value(max_row + 1, col_idx))
                     })
                 continue
             # Deal with duplicate serum abbreviations which can get out of sync with virus full names
diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index d79e114..07da378 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -18,6 +18,26 @@
 ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
 EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"}
 
+# Maps human pooled sera to a vaccine reference virus: year -> "egg"/"cell" -> subtype -> strain
+# This is based on the vaccine composition for the Southern Hemisphere
+# because all human pooled sera should be from Australia
+VACCINE_MAPPING = {
+    "2024": {
+        "egg": {
+            "h1n1pdm": "A/Victoria/4897/2022",
+            "h3n2": "A/Thailand/8/2022",
+            "vic": "B/Austria/1359417/2021",
+            "yam": "B/Phuket/3073/2013"
+        },
+        "cell": {
+            "h1n1pdm": "A/Wisconsin/67/2022",
+            "h3n2": "A/Massachusetts/18/2022",
+            "vic": "B/Austria/1359417/2021",
+            "yam": "B/Phuket/3073/2013"
+        }
+    }
+}
+
 def parse_tsv_mapping_to_dict(tsv_file):
     map_dict = {}
     with open(tsv_file, 'r') as f:
@@ -27,7 +47,75 @@ def parse_tsv_mapping_to_dict(tsv_file):
             map_dict[key] = value.rstrip('\n')
     return map_dict
 
-def read_vidrl(path, fstem, assay_type):
+
+def parse_human_serum_references(human_serum_data, subtype):
+    """
+    Expects the *human_serum_data* from titer_block.find_serum_rows.
+    Returns a dict keyed by the Excel column index of each human serum
+    reference; values hold the serum id (`SH {year}`), serum passage
+    ("egg" or "cell") and serum strain (VACCINE_MAPPING lookup by *subtype*).
+    Columns lacking a parsable year or egg/cell value are skipped with a
+    warning; raises Exception if VACCINE_MAPPING has no matching entry.
+    """
+    human_serum_references = {}
+    year_regex = r"SH(vax|VAX|\s)?(\d{4})"
+    egg_or_cell_regex = r"^(egg|cell)$" # Used with re.IGNORECASE
+
+    potential_year_fields = ['serum_id', 'serum_passage', 'serum_abbrev']
+    potential_egg_or_cell_fields = ['serum_passage', 'extra_info']
+
+    for human_serum in human_serum_data:
+        column = human_serum['col_idx']
+        # First try to parse the year; cell text is stripped of stray whitespace
+        year = None
+        for field in potential_year_fields:
+            matches = re.match(year_regex, str(human_serum.get(field, "")).strip())
+            # Use the first match of the potential fields
+            if matches is not None:
+                year = matches.group(2)
+                break
+
+        # year is required to know which vaccine reference strain to use,
+        # so skip the human serum if it can't be parsed
+        if year is None:
+            print(f"WARNING: Skipping human sera column {column} "
+                  f"because none of {potential_year_fields} fields "
+                  f"matched the year regex {year_regex!r}")
+            continue
+
+        # Then try to parse egg or cell from the human serum data
+        egg_or_cell = None
+        for field in potential_egg_or_cell_fields:
+            matches = re.match(egg_or_cell_regex, str(human_serum.get(field, "")).strip(), re.IGNORECASE)
+            # Use the first match of the potential fields
+            if matches is not None:
+                egg_or_cell = matches.group(1).lower()
+                break
+
+        # egg_or_cell is required to know which vaccine reference strain to use,
+        # so skip the human serum if it can't be parsed
+        if egg_or_cell is None:
+            print(f"WARNING: Skipping human sera column {column} "
+                  f"because none of {potential_egg_or_cell_fields} fields "
+                  f"matched the regex {egg_or_cell_regex!r}")
+            continue
+
+        # Raise a loud error so we know to update the VACCINE_MAPPING as needed
+        try:
+            serum_strain = VACCINE_MAPPING[year][egg_or_cell][subtype]
+        except KeyError as err:
+            raise Exception(f"VACCINE_MAPPING needs to be updated! Missing year={year!r}, passage={egg_or_cell!r}, subtype={subtype!r}") from err
+
+        human_serum_references[column] = {
+            "serum_id": f"SH {year}",  # Follow a standard pattern where serum_id is `SH {year}`
+            "serum_passage": egg_or_cell,
+            "serum_strain": serum_strain
+        }
+
+    return human_serum_references
+
+
+def read_vidrl(path, fstem, assay_type, subtype):
     '''
     Read all csv tables in path, create data frame with reference viruses as columns
     '''
@@ -35,12 +123,12 @@ def read_vidrl(path, fstem, assay_type):
 
     if True in exten:
         ind = exten.index(True)
-        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type)
+        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype)
     else:
         print("Unable to recognize file {}/{}".format(path,fstem))
         sys.exit()
 
-def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
+def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype):
     exts = ['.xls', '.xlsm', '.xlsx']
     workbook = xlrd.open_workbook(path+fstem + exts[ind])
 
@@ -89,6 +177,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
             serum_passage_pattern=serum_passage_pattern,
             serum_abbrev_pattern=serum_abbrev_pattern,
             ignore_serum_pattern=human_serum_pattern,
+            log_human_sera=True,
             crick=crick,
         )
 
@@ -131,6 +220,12 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
         # }
         # print(f"corrected: serum_mapping={json.dumps(serum_mapping, indent=4)}")
 
+        human_serum_references = parse_human_serum_references(serum_block['human_serum_data'], subtype)
+
+        print("Human pooled serum references parsed from serum block")
+        for col, values in human_serum_references.items():
+            print(f"Column {col!r}: {values}")
+
         # Check if all the necessary indices were found
         if virus_block["virus_col_idx"] is None:
             print(f"Virus column index not found. Check the virus pattern: '{virus_pattern}'")
@@ -178,11 +273,19 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
                 virus_strain = str(mat.cell_value(i,virus_strain_col_index)).strip()
                 virus_passage = str(mat.cell_value(i,virus_passage_col_index)).strip()
                 for j in range(col_start, (col_end+1)):
-                    serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
-                    serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip()
-                    serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip()
-                    serum_abbr = serum_abbr.replace(' ','')
-                    serum_strain = serum_mapping.get(serum_abbr, serum_abbr)
+                    # Special handling of human pooled sera that were matched to
+                    # vaccine reference strain instead of the normal serum mapping
+                    if j in human_serum_references:
+                        serum_id = human_serum_references[j]['serum_id']
+                        serum_passage = human_serum_references[j]['serum_passage']
+                        serum_strain = human_serum_references[j]['serum_strain']
+                    else:
+                        serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
+                        serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip()
+                        serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip()
+                        serum_abbr = serum_abbr.replace(' ','')
+                        serum_strain = serum_mapping.get(serum_abbr, serum_abbr)
+
                     titer = str(mat.cell_value(i,j)).strip()
                     line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
                     outfile.write(line)
@@ -227,7 +330,7 @@ def read_flat_vidrl(path, fstem, assay_type):
     if args.ftype == "flat":
         read_flat_vidrl(args.path, args.fstem, args.assay_type)
     else:
-        read_vidrl(args.path, args.fstem, args.assay_type)
+        read_vidrl(args.path, args.fstem, args.assay_type, args.subtype)
 
     if args.preview:
         command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview"