From 0c56e79ecabc9189f531917f1415dda9640a54fc Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Fri, 23 Aug 2024 15:34:14 -0700 Subject: [PATCH 1/7] vidrl_upload: Error early if `--subtype` is not provided In the parsing of human pooled sera, the subtype will be required, so just error early if the subtype is not provided. Also adds an additional check that the subtype is one of the expected values. --- tdb/vidrl_upload.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index a7e8162..e6a8f78 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -16,6 +16,7 @@ parser.add_argument('--assay_type', default='hi') ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"] +EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"} def parse_tsv_mapping_to_dict(tsv_file): map_dict = {} @@ -206,6 +207,11 @@ def read_flat_vidrl(path, fstem, assay_type): if __name__=="__main__": args = parser.parse_args() + # Asserting here because this is using a shared parser + # other tdb scripts do not require subtype + assert args.subtype is not None, "Subtype needs to be specified with --subtype" + assert args.subtype in EXPECTED_SUBTYPES, f"Subtype must be one of {EXPECTED_SUBTYPES!r}" + if args.path is None: args.path = "data/" else: @@ -221,14 +227,11 @@ def read_flat_vidrl(path, fstem, assay_type): else: read_vidrl(args.path, args.fstem, args.assay_type) - if args.subtype: - if args.preview: - command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview" - print(command) - subprocess.call(command, shell=True) - else: - command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem - print(command) - 
subprocess.call(command, shell=True) + if args.preview: + command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview" + print(command) + subprocess.call(command, shell=True) else: - print("Subtype needs to be specified with --subtype") + command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + print(command) + subprocess.call(command, shell=True) From 211e325ecd6859b85ec2fce76587360591d7ff48 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 26 Aug 2024 14:48:37 -0700 Subject: [PATCH 2/7] vidrl_upload: Define VIDRL specific human_serum_pattern Defining VIDRL specific human serum patterns to make it easier to track which patterns are being used to match the human serum references. Doing this in preparation for parsing human serum references for VIDRL. --- tdb/vidrl_upload.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index e6a8f78..d79e114 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -50,6 +50,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): serum_id_pattern = r"^[A-Z]\d{4,8}" serum_passage_pattern = r"(MDCK\d+|SIAT\d+|E\d+)" serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*" + human_serum_pattern = r"(^SH\d+|SHVAX|SHvax|sera|vaxpool).*" crick = False for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1): @@ -87,6 +88,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): serum_id_pattern=serum_id_pattern, serum_passage_pattern=serum_passage_pattern, serum_abbrev_pattern=serum_abbrev_pattern, + ignore_serum_pattern=human_serum_pattern, crick=crick, ) From 5b053e8b8e78336e01347e234935bf8432c1bdec Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 26 Aug 2024 14:52:01 -0700 Subject: [PATCH 3/7] vidrl_upload: Support ingest of human pooled sera references MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using a hard-coded VACCINE_MAPPING to keep track of Southern Hemisphere vaccine strains, which is used to map human pooled sera references to specific serum strains. This commit only includes the 2024 vaccines; I will update the vaccine mapping as I backfill more data. I had originally tried to use a4c4336607fa0f247a66cedc923bb9ad32e4473d + 5e72c59b1ee4f80cc98e193dd74b1677e77b8551 to parse the "clade" row for the extra egg/cell info, but that pattern matching fails because of excess clade rows in the Excel sheet.¹ Instead, I've opted to just force include an extra row of info for the human serum data in `find_serum_rows`. ¹ --- tdb/titer_block.py | 5 +- tdb/vidrl_upload.py | 121 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 116 insertions(+), 10 deletions(-) diff --git a/tdb/titer_block.py b/tdb/titer_block.py index e607c6e..5a1ca58 100644 --- a/tdb/titer_block.py +++ b/tdb/titer_block.py @@ -284,11 +284,14 @@ def find_serum_rows(worksheet, titer_coords, virus_names=None, serum_id_pattern= # Ignore human serum (e.g. 
"SH2002", "sera", "SHVAX2002") if re.search(ignore_serum_pattern, cell_value): if log_human_sera: + max_row = max(serum_abbrev_row_idx, serum_id_row_idx, serum_passage_row_idx) human_serum_data.append({ "col_idx": col_idx, "serum_abbrev": cell_value, "serum_id": str(worksheet.cell_value(serum_id_row_idx, col_idx)), - "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx)) + "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx)), + # Sometimes egg/cell distinction is stored in a separate row + "extra_info": str(worksheet.cell_value(max_row + 1, col_idx)) }) continue # Deal with duplicate serum abbreviations which can get out of sync with virus full names diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index d79e114..07da378 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -18,6 +18,26 @@ ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"] EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"} +# Vaccine mapping used for mapping human pooled sera to a specific reference virus +# This is based on the vaccine composition for the Southern Hemisphere +# because all human pooled sera should be from Australia +VACCINE_MAPPING = { + "2024": { + "egg": { + "h1n1pdm": "A/Victoria/4897/2022", + "h3n2": "A/Thailand/8/2022", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + }, + "cell": { + "h1n1pdm": "A/Wisconsin/67/2022", + "h3n2": "A/Massachusetts/18/2022", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + } + } +} + def parse_tsv_mapping_to_dict(tsv_file): map_dict = {} with open(tsv_file, 'r') as f: @@ -27,7 +47,75 @@ def parse_tsv_mapping_to_dict(tsv_file): map_dict[key] = value.rstrip('\n') return map_dict -def read_vidrl(path, fstem, assay_type): + +def parse_human_serum_references(human_serum_data, subtype): + """ + Expects the *human_serum_data* from 
titer_block.find_serum_rows + Returns parsed human serum references, where keys are the column number of + the human serum reference in the Excel sheet and the values are the serum + data with serum id, serum passage, and serum strain. + """ + human_serum_references = {} + year_regex = r"SH(vax|VAX|\s)?(\d{4})" + egg_or_cell_regex = r"^(egg|cell)$" # Used with re.IGNORECASE + + potential_year_fields = ['serum_id', 'serum_passage', 'serum_abbrev'] + potential_egg_or_cell_fields = ['serum_passage', 'extra_info'] + + for human_serum in human_serum_data: + column = human_serum['col_idx'] + # First try to parse the year from the human serum data + year = new_serum_id = None + for field in potential_year_fields: + matches = re.match(year_regex, human_serum[field]) + # Use the first match of the potential fields + if matches is not None: + year = matches.group(2) + # Follow a standard pattern where serum_id is `SH {year}` + new_serum_id = f"SH {year}" + break + + # year is required to know which vaccine reference strain to use, + # so skip the human serum if it can't be parsed + if year is None: + print(f"WARNING: Skipping human sera column {column} ", + f"because none of {potential_year_fields} fields ", + f"matched the year regex {year_regex!r}") + continue + + # Then try to parse egg or cell from the human serum data + egg_or_cell = None + for field in potential_egg_or_cell_fields: + matches = re.match(egg_or_cell_regex, human_serum[field], re.IGNORECASE) + # Use the first match of the potential fields + if matches is not None: + egg_or_cell = matches.group(1).lower() + break + + # egg_or_cell is required to know which vaccine reference strain to use, + # so skip the human serum if it can't be parsed + if egg_or_cell is None: + print(f"WARNING: Skipping human sera column {column} ", + f"because none of {potential_egg_or_cell_fields} fields ", + f"matched the regex {egg_or_cell_regex!r}") + continue + + # Raise a loud error so we know to update the VACCINE_MAPPING as 
needed + try: + serum_strain = VACCINE_MAPPING[year][egg_or_cell][subtype] + except KeyError as err: + raise Exception(f"VACCINE_MAPPING needs to be updated!") from err + + human_serum_references[column] = { + "serum_id": new_serum_id, + "serum_passage": egg_or_cell, + "serum_strain": serum_strain + } + + return human_serum_references + + +def read_vidrl(path, fstem, assay_type, subtype): ''' Read all csv tables in path, create data frame with reference viruses as columns ''' @@ -35,12 +123,12 @@ def read_vidrl(path, fstem, assay_type): if True in exten: ind = exten.index(True) - convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type) + convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype) else: print("Unable to recognize file {}/{}".format(path,fstem)) sys.exit() -def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): +def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype): exts = ['.xls', '.xlsm', '.xlsx'] workbook = xlrd.open_workbook(path+fstem + exts[ind]) @@ -89,6 +177,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): serum_passage_pattern=serum_passage_pattern, serum_abbrev_pattern=serum_abbrev_pattern, ignore_serum_pattern=human_serum_pattern, + log_human_sera=True, crick=crick, ) @@ -131,6 +220,12 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): # } # print(f"corrected: serum_mapping={json.dumps(serum_mapping, indent=4)}") + human_serum_references = parse_human_serum_references(serum_block['human_serum_data'], args.subtype) + + print("Human pooled serum references parsed from serum block") + for col, values in human_serum_references.items(): + print(f"Column {col!r}: {values}") + # Check if all the necessary indices were found if virus_block["virus_col_idx"] is None: print(f"Virus column index not found. 
Check the virus pattern: '{virus_pattern}'") @@ -178,11 +273,19 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): virus_strain = str(mat.cell_value(i,virus_strain_col_index)).strip() virus_passage = str(mat.cell_value(i,virus_passage_col_index)).strip() for j in range(col_start, (col_end+1)): - serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','') - serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip() - serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip() - serum_abbr = serum_abbr.replace(' ','') - serum_strain = serum_mapping.get(serum_abbr, serum_abbr) + # Special handling of human pooled sera that were matched to + # vaccine reference strain instead of the normal serum mapping + if j in human_serum_references: + serum_id = human_serum_references[j]['serum_id'] + serum_passage = human_serum_references[j]['serum_passage'] + serum_strain = human_serum_references[j]['serum_strain'] + else: + serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','') + serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip() + serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip() + serum_abbr = serum_abbr.replace(' ','') + serum_strain = serum_mapping.get(serum_abbr, serum_abbr) + titer = str(mat.cell_value(i,j)).strip() line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type])) outfile.write(line) @@ -227,7 +330,7 @@ def read_flat_vidrl(path, fstem, assay_type): if args.ftype == "flat": read_flat_vidrl(args.path, args.fstem, args.assay_type) else: - read_vidrl(args.path, args.fstem, args.assay_type) + read_vidrl(args.path, args.fstem, args.assay_type, args.subtype) if args.preview: command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview" From 
b36f7ccc4855383d2bb07668ea378e34fd86b622 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 26 Aug 2024 15:16:08 -0700 Subject: [PATCH 4/7] vidrl_upload: Use `Human pool {year}` as serum_id for human serum refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because the `serum_host` field is unreliable in fauna, seasonal-flu uses substring matches on the `serum_id` field to separate ferret, human, and mouse sera.¹ Updating the `serum_id` to be `Human pool {year}` so that it can be matched in seasonal-flu. This also has the side-effect of setting the `serum_host` field to "human" within fauna because of the `serum_id` matching in tdb/upload.py.² ¹ ² --- tdb/vidrl_upload.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index 07da378..e25f672 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -71,8 +71,10 @@ def parse_human_serum_references(human_serum_data, subtype): # Use the first match of the potential fields if matches is not None: year = matches.group(2) - # Follow a standard pattern where serum_id is `SH {year}` - new_serum_id = f"SH {year}" + # Follow a standard pattern where serum_id is `Human pool ` + # Need "human" in serum_id because this is how we match for human sera in seasonal flu + # + new_serum_id = f"Human pool {year}" break # year is required to know which vaccine reference strain to use, From b2403ef7215ffeb0af3394c45d9af378c813fdd3 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 26 Aug 2024 15:26:23 -0700 Subject: [PATCH 5/7] vidrl_upload: Add `--human-ref-only` flag Allows us to only ingest the human sera references as we are backfilling the data to avoid accidentally duplicating the ferret titer data. This flag can be removed once we've ingested all of the human sera references that have been previously skipped. 
--- tdb/vidrl_upload.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index e25f672..e0cf2ab 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -14,6 +14,8 @@ from titer_block import find_titer_block, find_serum_rows, find_virus_columns parser.add_argument('--assay_type', default='hi') +parser.add_argument('--human-ref-only', action="store_true", + help="Only ingest human sera references, used for backfilling data that was skipped in previous ingests.") ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"] EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"} @@ -117,7 +119,7 @@ def parse_human_serum_references(human_serum_data, subtype): return human_serum_references -def read_vidrl(path, fstem, assay_type, subtype): +def read_vidrl(path, fstem, assay_type, subtype, human_ref_only): ''' Read all csv tables in path, create data frame with reference viruses as columns ''' @@ -125,12 +127,12 @@ def read_vidrl(path, fstem, assay_type, subtype): if True in exten: ind = exten.index(True) - convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype) + convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only) else: print("Unable to recognize file {}/{}".format(path,fstem)) sys.exit() -def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype): +def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only): exts = ['.xls', '.xlsm', '.xlsx'] workbook = xlrd.open_workbook(path+fstem + exts[ind]) @@ -281,6 +283,9 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype): serum_id = human_serum_references[j]['serum_id'] serum_passage = human_serum_references[j]['serum_passage'] serum_strain = human_serum_references[j]['serum_strain'] + # Skip other titer measurements if we only want to ingest human 
serum references + elif human_ref_only: + continue else: serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','') serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip() @@ -332,7 +337,7 @@ def read_flat_vidrl(path, fstem, assay_type): if args.ftype == "flat": read_flat_vidrl(args.path, args.fstem, args.assay_type) else: - read_vidrl(args.path, args.fstem, args.assay_type, args.subtype) + read_vidrl(args.path, args.fstem, args.assay_type, args.subtype, args.human_ref_only) if args.preview: command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview" From b7bade44a3bc60f61084131920e52be12c484807 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 26 Aug 2024 16:35:25 -0700 Subject: [PATCH 6/7] vidrl_upload: Update VACCINE_MAPPING Updating with the 2023 vaccines. I ended up needing to add the 2023 vaccine mapping because some of the 2024 files included human sera references from 2023. 
--- tdb/vidrl_upload.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index e0cf2ab..95a742c 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -24,6 +24,20 @@ # This is based on the vaccine composition for the Southern Hemisphere # because all human pooled sera should be from Australia VACCINE_MAPPING = { + "2023": { + "egg": { + "h1n1pdm": "A/Sydney/5/2021", + "h3n2": "A/Darwin/9/2021", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + }, + "cell": { + "h1n1pdm": "A/Sydney/5/2021", + "h3n2": "A/Darwin/6/2021", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + } + }, "2024": { "egg": { "h1n1pdm": "A/Victoria/4897/2022", From f1b243a8582220b83ce97612d8e9f640a54fe38c Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Wed, 28 Aug 2024 12:48:36 -0700 Subject: [PATCH 7/7] vidrl_upload: raise exception when year cannot be parsed for human sera MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on feedback from @huddlej¹ We should _always_ have year info, so raising a loud error when it cannot be parsed from the human sera references. I'm choosing _not_ to update the similar check for egg/cell distinction since I've already seen examples of it missing in Excel sheets from 2023. 
¹ --- tdb/vidrl_upload.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index 95a742c..d8a854d 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -93,13 +93,12 @@ def parse_human_serum_references(human_serum_data, subtype): new_serum_id = f"Human pool {year}" break - # year is required to know which vaccine reference strain to use, - # so skip the human serum if it can't be parsed + # year is required to know which vaccine reference strain to use + # Raise an error because this info should _always_ be available if year is None: - print(f"WARNING: Skipping human sera column {column} ", - f"because none of {potential_year_fields} fields ", - f"matched the year regex {year_regex!r}") - continue + raise Exception(f"Unable to process human sera column {column} ", + f"because none of {potential_year_fields} fields ", + f"matched the year regex {year_regex!r}") # Then try to parse egg or cell from the human serum data egg_or_cell = None @@ -112,6 +111,10 @@ def parse_human_serum_references(human_serum_data, subtype): # egg_or_cell is required to know which vaccine reference strain to use, # so skip the human serum if it can't be parsed + # Only outputting a warning because I've seen Excel worksheets _without_ + # any egg/cell distinctions from 2023. This will require extra correspondence + # with VIDRL, so don't let it block ingest of other data. + # -Jover, 28 August 2024 if egg_or_cell is None: print(f"WARNING: Skipping human sera column {column} ", f"because none of {potential_egg_or_cell_fields} fields ",