From 0c56e79ecabc9189f531917f1415dda9640a54fc Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Fri, 23 Aug 2024 15:34:14 -0700
Subject: [PATCH 1/7] vidrl_upload: Error early if `--subtype` is not provided

In the parsing of human pooled sera, the subtype will be required, so
just error early if the subtype is not provided.

Also adds an additional check that the subtype is one of the expected
values.
---
 tdb/vidrl_upload.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index a7e8162..e6a8f78 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -16,6 +16,7 @@
 parser.add_argument('--assay_type', default='hi')
 
 ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
+EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"}
 
 def parse_tsv_mapping_to_dict(tsv_file):
     map_dict = {}
@@ -206,6 +207,11 @@ def read_flat_vidrl(path, fstem, assay_type):
 
 if __name__=="__main__":
     args = parser.parse_args()
+    # Validating here (not in the parser) because the parser is shared with
+    # other tdb scripts that do not require subtype; None also fails this check
+    if args.subtype not in EXPECTED_SUBTYPES:
+        parser.error(f"Subtype needs to be specified with --subtype and must be one of {sorted(EXPECTED_SUBTYPES)!r}")
+
     if args.path is None:
         args.path = "data/"
     else:
@@ -221,14 +227,11 @@ def read_flat_vidrl(path, fstem, assay_type):
     else:
         read_vidrl(args.path, args.fstem, args.assay_type)
 
-    if args.subtype:
-        if args.preview:
-            command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview"
-            print(command)
-            subprocess.call(command, shell=True)
-        else:
-            command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem
-            print(command)
-            subprocess.call(command, shell=True)
+    if args.preview:
+        command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview"
+        print(command)
+        subprocess.call(command, shell=True)
     else:
-        print("Subtype needs to be specified with --subtype")
+        command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem
+        print(command)
+        subprocess.call(command, shell=True)

From 211e325ecd6859b85ec2fce76587360591d7ff48 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 26 Aug 2024 14:48:37 -0700
Subject: [PATCH 2/7] vidrl_upload: Define VIDRL specific human_serum_pattern

Defining VIDRL specific human serum patterns to make it easier to
track which patterns are being used to match the human serum references.

Doing this in preparation for parsing human serum references for VIDRL.
---
 tdb/vidrl_upload.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index e6a8f78..d79e114 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -50,6 +50,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
     serum_id_pattern = r"^[A-Z]\d{4,8}"
     serum_passage_pattern = r"(MDCK\d+|SIAT\d+|E\d+)"
     serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*"
+    human_serum_pattern = r"(^SH\d+|SHVAX|SHvax|sera|vaxpool).*"
     crick = False
 
     for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1):
@@ -87,6 +88,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
             serum_id_pattern=serum_id_pattern,
             serum_passage_pattern=serum_passage_pattern,
             serum_abbrev_pattern=serum_abbrev_pattern,
+            ignore_serum_pattern=human_serum_pattern,
             crick=crick,
         )
 

From 5b053e8b8e78336e01347e234935bf8432c1bdec Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 26 Aug 2024 14:52:01 -0700
Subject: [PATCH 3/7] vidrl_upload: Support ingest of human pooled sera
 references
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using a hard-coded VACCINE_MAPPING to keep track of Southern Hemisphere
vaccine strains, which is used to map human pooled sera references to
specific serum strains. This commit only includes 2024 vaccines from
<https://www.who.int/publications/m/item/recommended-composition-of-influenza-virus-vaccines-for-use-in-the-2024-southern-hemisphere-influenza-season>
Will update the vaccine mapping as I backfill more data.

I had originally tried to use a4c4336607fa0f247a66cedc923bb9ad32e4473d +
5e72c59b1ee4f80cc98e193dd74b1677e77b8551 to parse the "clade" row for
the extra egg/cell info, but that pattern matching fails because of
excess clade rows in the Excel sheet.¹ Instead, I've opted to just
force include an extra row of info for the human serum data in
`find_serum_rows`.

¹ <https://github.com/nextstrain/fauna/pull/160#discussion_r1731978551>
---
 tdb/titer_block.py  |   5 +-
 tdb/vidrl_upload.py | 121 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 116 insertions(+), 10 deletions(-)

diff --git a/tdb/titer_block.py b/tdb/titer_block.py
index e607c6e..5a1ca58 100644
--- a/tdb/titer_block.py
+++ b/tdb/titer_block.py
@@ -284,11 +284,14 @@ def find_serum_rows(worksheet, titer_coords, virus_names=None, serum_id_pattern=
             # Ignore human serum (e.g. "SH2002", "sera", "SHVAX2002")
             if re.search(ignore_serum_pattern, cell_value):
                 if log_human_sera:
+                    max_row = max(serum_abbrev_row_idx, serum_id_row_idx, serum_passage_row_idx)
                     human_serum_data.append({
                         "col_idx": col_idx,
                         "serum_abbrev": cell_value,
                         "serum_id": str(worksheet.cell_value(serum_id_row_idx, col_idx)),
-                        "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx))
+                        "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx)),
+                        # Sometimes egg/cell distinction is stored in a separate row
+                        "extra_info": str(worksheet.cell_value(max_row + 1, col_idx))
                     })
                 continue
             # Deal with duplicate serum abbreviations which can get out of sync with virus full names
diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index d79e114..07da378 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -18,6 +18,26 @@
 ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
 EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"}
 
+# Vaccine mapping used for mapping human pooled sera to a specific reference virus
+# This is based on the vaccine composition for the Southern Hemisphere
+# because all human pooled sera should be from Australia
+VACCINE_MAPPING = {
+    "2024": {
+        "egg": {
+            "h1n1pdm": "A/Victoria/4897/2022",
+            "h3n2": "A/Thailand/8/2022",
+            "vic": "B/Austria/1359417/2021",
+            "yam": "B/Phuket/3073/2013"
+        },
+        "cell": {
+            "h1n1pdm": "A/Wisconsin/67/2022",
+            "h3n2": "A/Massachusetts/18/2022",
+            "vic": "B/Austria/1359417/2021",
+            "yam": "B/Phuket/3073/2013"
+        }
+    }
+}
+
 def parse_tsv_mapping_to_dict(tsv_file):
     map_dict = {}
     with open(tsv_file, 'r') as f:
@@ -27,7 +47,75 @@ def parse_tsv_mapping_to_dict(tsv_file):
             map_dict[key] = value.rstrip('\n')
     return map_dict
 
-def read_vidrl(path, fstem, assay_type):
+
+def parse_human_serum_references(human_serum_data, subtype):
+    """
+    Expects the *human_serum_data* from titer_block.find_serum_rows
+    Returns parsed human serum references, where keys are the column number of
+    the human serum reference in the Excel sheet and the values are the serum
+    data with serum id, serum passage, and serum strain.
+    """
+    human_serum_references = {}
+    year_regex = r"SH(vax|VAX|\s)?(\d{4})"
+    egg_or_cell_regex = r"^(egg|cell)$" # Used with re.IGNORECASE
+
+    potential_year_fields = ['serum_id', 'serum_passage', 'serum_abbrev']
+    potential_egg_or_cell_fields = ['serum_passage', 'extra_info']
+
+    for human_serum in human_serum_data:
+        column = human_serum['col_idx']
+        # First try to parse the year from the human serum data
+        year = new_serum_id = None
+        for field in potential_year_fields:
+            matches = re.match(year_regex, human_serum[field])
+            # Use the first match of the potential fields
+            if matches is not None:
+                year = matches.group(2)
+                # Follow a standard pattern where serum_id is `SH {year}`
+                new_serum_id = f"SH {year}"
+                break
+
+        # year is required to know which vaccine reference strain to use,
+        # so skip the human serum if it can't be parsed
+        if year is None:
+            print(f"WARNING: Skipping human sera column {column} ",
+                  f"because none of {potential_year_fields} fields ",
+                  f"matched the year regex {year_regex!r}")
+            continue
+
+        # Then try to parse egg or cell from the human serum data
+        egg_or_cell = None
+        for field in potential_egg_or_cell_fields:
+            matches = re.match(egg_or_cell_regex, human_serum[field], re.IGNORECASE)
+            # Use the first match of the potential fields
+            if matches is not None:
+                egg_or_cell = matches.group(1).lower()
+                break
+
+        # egg_or_cell is required to know which vaccine reference strain to use,
+        # so skip the human serum if it can't be parsed
+        if egg_or_cell is None:
+            print(f"WARNING: Skipping human sera column {column} ",
+                  f"because none of {potential_egg_or_cell_fields} fields ",
+                  f"matched the regex {egg_or_cell_regex!r}")
+            continue
+
+        # Raise a loud error so we know to update the VACCINE_MAPPING as needed
+        try:
+            serum_strain = VACCINE_MAPPING[year][egg_or_cell][subtype]
+        except KeyError as err:
+            raise Exception(f"VACCINE_MAPPING needs to be updated!") from err
+
+        human_serum_references[column] = {
+            "serum_id": new_serum_id,
+            "serum_passage": egg_or_cell,
+            "serum_strain": serum_strain
+        }
+
+    return human_serum_references
+
+
+def read_vidrl(path, fstem, assay_type, subtype):
     '''
     Read all csv tables in path, create data frame with reference viruses as columns
     '''
@@ -35,12 +123,12 @@ def read_vidrl(path, fstem, assay_type):
 
     if True in exten:
         ind = exten.index(True)
-        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type)
+        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype)
     else:
         print("Unable to recognize file {}/{}".format(path,fstem))
         sys.exit()
 
-def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
+def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype):
     exts = ['.xls', '.xlsm', '.xlsx']
     workbook = xlrd.open_workbook(path+fstem + exts[ind])
 
@@ -89,6 +177,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
             serum_passage_pattern=serum_passage_pattern,
             serum_abbrev_pattern=serum_abbrev_pattern,
             ignore_serum_pattern=human_serum_pattern,
+            log_human_sera=True,
             crick=crick,
         )
 
@@ -131,6 +220,12 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
         # }
         # print(f"corrected: serum_mapping={json.dumps(serum_mapping, indent=4)}")
 
+        human_serum_references = parse_human_serum_references(serum_block['human_serum_data'], subtype)
+
+        print("Human pooled serum references parsed from serum block")
+        for col, values in human_serum_references.items():
+            print(f"Column {col!r}: {values}")
+
         # Check if all the necessary indices were found
         if virus_block["virus_col_idx"] is None:
             print(f"Virus column index not found. Check the virus pattern: '{virus_pattern}'")
@@ -178,11 +273,19 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type):
                 virus_strain = str(mat.cell_value(i,virus_strain_col_index)).strip()
                 virus_passage = str(mat.cell_value(i,virus_passage_col_index)).strip()
                 for j in range(col_start, (col_end+1)):
-                    serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
-                    serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip()
-                    serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip()
-                    serum_abbr = serum_abbr.replace(' ','')
-                    serum_strain = serum_mapping.get(serum_abbr, serum_abbr)
+                    # Special handling of human pooled sera that were matched to
+                    # vaccine reference strain instead of the normal serum mapping
+                    if j in human_serum_references:
+                        serum_id = human_serum_references[j]['serum_id']
+                        serum_passage = human_serum_references[j]['serum_passage']
+                        serum_strain = human_serum_references[j]['serum_strain']
+                    else:
+                        serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
+                        serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip()
+                        serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip()
+                        serum_abbr = serum_abbr.replace(' ','')
+                        serum_strain = serum_mapping.get(serum_abbr, serum_abbr)
+
                     titer = str(mat.cell_value(i,j)).strip()
                     line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type]))
                     outfile.write(line)
@@ -227,7 +330,7 @@ def read_flat_vidrl(path, fstem, assay_type):
     if args.ftype == "flat":
         read_flat_vidrl(args.path, args.fstem, args.assay_type)
     else:
-        read_vidrl(args.path, args.fstem, args.assay_type)
+        read_vidrl(args.path, args.fstem, args.assay_type, args.subtype)
 
     if args.preview:
         command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview"

From b36f7ccc4855383d2bb07668ea378e34fd86b622 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 26 Aug 2024 15:16:08 -0700
Subject: [PATCH 4/7] vidrl_upload: Use `Human pool <year>` as serum_id for
 human serum refs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because the `serum_host` field is unreliable in fauna, seasonal-flu
uses substring matches on the `serum_id` field to separate ferret,
human, and mouse sera.¹

Updating the `serum_id` to be `Human pool <year>` so that it can be
matched in seasonal-flu. This also has the side-effect of setting the
`serum_host` field to "human" within fauna because of the `serum_id`
matching in tdb/upload.py.²

¹ <https://github.com/nextstrain/seasonal-flu/blob/89f6cfd11481b2c51c50d68822c18d46ed56db51/workflow/snakemake_rules/download_from_fauna.smk#L93>
² <https://github.com/nextstrain/fauna/blob/88a607db53d36fc91482cae2009eefddf9477f97/tdb/upload.py#L382-L383>
---
 tdb/vidrl_upload.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index 07da378..e25f672 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -71,8 +71,10 @@ def parse_human_serum_references(human_serum_data, subtype):
             # Use the first match of the potential fields
             if matches is not None:
                 year = matches.group(2)
-                # Follow a standard pattern where serum_id is `SH {year}`
-                new_serum_id = f"SH {year}"
+                # Follow a standard pattern where serum_id is `Human pool <year>`
+                # Need "human" in serum_id because this is how we match for human sera in seasonal flu
+                # <https://github.com/nextstrain/seasonal-flu/blob/89f6cfd11481b2c51c50d68822c18d46ed56db51/workflow/snakemake_rules/download_from_fauna.smk#L93>
+                new_serum_id = f"Human pool {year}"
                 break
 
         # year is required to know which vaccine reference strain to use,

From b2403ef7215ffeb0af3394c45d9af378c813fdd3 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 26 Aug 2024 15:26:23 -0700
Subject: [PATCH 5/7] vidrl_upload: Add `--human-ref-only` flag

Allows us to only ingest the human sera references as we are backfilling
the data to avoid accidentally duplicating the ferret titer data.

This flag can be removed once we've ingested all of the human sera
references that have been previously skipped.
---
 tdb/vidrl_upload.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index e25f672..e0cf2ab 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -14,6 +14,8 @@
 from titer_block import find_titer_block, find_serum_rows, find_virus_columns
 
 parser.add_argument('--assay_type', default='hi')
+parser.add_argument('--human-ref-only', action="store_true",
+    help="Only ingest human sera references, used for backfilling data that was skipped in previous ingests.")
 
 ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
 EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"}
@@ -117,7 +119,7 @@ def parse_human_serum_references(human_serum_data, subtype):
     return human_serum_references
 
 
-def read_vidrl(path, fstem, assay_type, subtype):
+def read_vidrl(path, fstem, assay_type, subtype, human_ref_only):
     '''
     Read all csv tables in path, create data frame with reference viruses as columns
     '''
@@ -125,12 +127,12 @@ def read_vidrl(path, fstem, assay_type, subtype):
 
     if True in exten:
         ind = exten.index(True)
-        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype)
+        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only)
     else:
         print("Unable to recognize file {}/{}".format(path,fstem))
         sys.exit()
 
-def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype):
+def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only):
     exts = ['.xls', '.xlsm', '.xlsx']
     workbook = xlrd.open_workbook(path+fstem + exts[ind])
 
@@ -281,6 +283,9 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype):
                         serum_id = human_serum_references[j]['serum_id']
                         serum_passage = human_serum_references[j]['serum_passage']
                         serum_strain = human_serum_references[j]['serum_strain']
+                    # Skip other titer measurements if we only want to ingest human serum references
+                    elif human_ref_only:
+                        continue
                     else:
                         serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
                         serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip()
@@ -332,7 +337,7 @@ def read_flat_vidrl(path, fstem, assay_type):
     if args.ftype == "flat":
         read_flat_vidrl(args.path, args.fstem, args.assay_type)
     else:
-        read_vidrl(args.path, args.fstem, args.assay_type, args.subtype)
+        read_vidrl(args.path, args.fstem, args.assay_type, args.subtype, args.human_ref_only)
 
     if args.preview:
         command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview"

From b7bade44a3bc60f61084131920e52be12c484807 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 26 Aug 2024 16:35:25 -0700
Subject: [PATCH 6/7] vidrl_upload: Update VACCINE_MAPPING

Updating with 2023 vaccines from
<https://www.who.int/publications/m/item/recommended-composition-of-influenza-virus-vaccines-for-use-in-the-2023-southern-hemisphere-influenza-season>

I ended up needing to add the 2023 vaccine mapping because some of the
2024 files included human sera references from 2023.
---
 tdb/vidrl_upload.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index e0cf2ab..95a742c 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -24,6 +24,20 @@
 # This is based on the vaccine composition for the Southern Hemisphere
 # because all human pooled sera should be from Australia
 VACCINE_MAPPING = {
+    "2023": {
+        "egg": {
+            "h1n1pdm": "A/Sydney/5/2021",
+            "h3n2": "A/Darwin/9/2021",
+            "vic": "B/Austria/1359417/2021",
+            "yam": "B/Phuket/3073/2013"
+        },
+        "cell": {
+            "h1n1pdm": "A/Sydney/5/2021",
+            "h3n2": "A/Darwin/6/2021",
+            "vic": "B/Austria/1359417/2021",
+            "yam": "B/Phuket/3073/2013"
+        }
+    },
     "2024": {
         "egg": {
             "h1n1pdm": "A/Victoria/4897/2022",

From f1b243a8582220b83ce97612d8e9f640a54fe38c Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Wed, 28 Aug 2024 12:48:36 -0700
Subject: [PATCH 7/7] vidrl_upload: raise exception when year cannot be parsed
 for human sera
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on feedback from @huddlej¹

We should _always_ have year info, so raise a loud error when it
cannot be parsed from the human sera references. I'm choosing _not_
to update the similar check for the egg/cell distinction since I've
already seen examples of it missing in Excel sheets from 2023.

¹ <https://github.com/nextstrain/fauna/pull/160#discussion_r1735101714>
---
 tdb/vidrl_upload.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index 95a742c..d8a854d 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -93,13 +93,12 @@ def parse_human_serum_references(human_serum_data, subtype):
                 new_serum_id = f"Human pool {year}"
                 break
 
-        # year is required to know which vaccine reference strain to use,
-        # so skip the human serum if it can't be parsed
+        # year is required to know which vaccine reference strain to use
+        # Raise an error because this info should _always_ be available
         if year is None:
-            print(f"WARNING: Skipping human sera column {column} ",
-                  f"because none of {potential_year_fields} fields ",
-                  f"matched the year regex {year_regex!r}")
-            continue
+            raise Exception(f"Unable to process human sera column {column} "
+                            f"because none of {potential_year_fields} fields "
+                            f"matched the year regex {year_regex!r}")
 
         # Then try to parse egg or cell from the human serum data
         egg_or_cell = None
@@ -112,6 +111,10 @@ def parse_human_serum_references(human_serum_data, subtype):
 
         # egg_or_cell is required to know which vaccine reference strain to use,
         # so skip the human serum if it can't be parsed
+        # Only outputting a warning because I've seen Excel worksheets _without_
+        # any egg/cell distinctions from 2023. This will require extra correspondence
+        # with VIDRL, so don't let it block ingest of other data.
+        #   -Jover, 28 August 2024
         if egg_or_cell is None:
             print(f"WARNING: Skipping human sera column {column} ",
                   f"because none of {potential_egg_or_cell_fields} fields ",