diff --git a/tdb/titer_block.py b/tdb/titer_block.py index e607c6e..5a1ca58 100644 --- a/tdb/titer_block.py +++ b/tdb/titer_block.py @@ -284,11 +284,14 @@ def find_serum_rows(worksheet, titer_coords, virus_names=None, serum_id_pattern= # Ignore human serum (e.g. "SH2002", "sera", "SHVAX2002") if re.search(ignore_serum_pattern, cell_value): if log_human_sera: + max_row = max(serum_abbrev_row_idx, serum_id_row_idx, serum_passage_row_idx) human_serum_data.append({ "col_idx": col_idx, "serum_abbrev": cell_value, "serum_id": str(worksheet.cell_value(serum_id_row_idx, col_idx)), - "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx)) + "serum_passage": str(worksheet.cell_value(serum_passage_row_idx, col_idx)), + # Sometimes egg/cell distinction is stored in a separate row + "extra_info": str(worksheet.cell_value(max_row + 1, col_idx)) }) continue # Deal with duplicate serum abbreviations which can get out of sync with virus full names diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index a7e8162..d8a854d 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -14,8 +14,45 @@ from titer_block import find_titer_block, find_serum_rows, find_virus_columns parser.add_argument('--assay_type', default='hi') +parser.add_argument('--human-ref-only', action="store_true", + help="Only ingest human sera references, used for backfilling data that was skipped in previous ingests.") ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"] +EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"} + +# Vaccine mapping used for mapping human pooled sera to a specific reference virus +# This is based on the vaccine composition for the Southern Hemisphere +# because all human pooled sera should be from Australia +VACCINE_MAPPING = { + "2023": { + "egg": { + "h1n1pdm": "A/Sydney/5/2021", + "h3n2": "A/Darwin/9/2021", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + }, + "cell": { + "h1n1pdm": "A/Sydney/5/2021", + "h3n2": "A/Darwin/6/2021", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + } + }, + "2024": { + "egg": { + "h1n1pdm": "A/Victoria/4897/2022", + "h3n2": "A/Thailand/8/2022", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + }, + "cell": { + "h1n1pdm": "A/Wisconsin/67/2022", + "h3n2": "A/Massachusetts/18/2022", + "vic": "B/Austria/1359417/2021", + "yam": "B/Phuket/3073/2013" + } + } +} def parse_tsv_mapping_to_dict(tsv_file): map_dict = {} @@ -26,7 +63,80 @@ def parse_tsv_mapping_to_dict(tsv_file): map_dict[key] = value.rstrip('\n') return map_dict -def read_vidrl(path, fstem, assay_type): + +def parse_human_serum_references(human_serum_data, subtype): + """ + Expects the *human_serum_data* from titer_block.find_serum_rows + Returns parsed human serum references, where keys are the column number of + the human serum reference in the Excel sheet and the values are the serum + data with serum id, serum passage, and serum strain. + """ + human_serum_references = {} + year_regex = r"SH(vax|VAX|\s)?(\d{4})" + egg_or_cell_regex = r"^(egg|cell)$" # Used with re.IGNORECASE + + potential_year_fields = ['serum_id', 'serum_passage', 'serum_abbrev'] + potential_egg_or_cell_fields = ['serum_passage', 'extra_info'] + + for human_serum in human_serum_data: + column = human_serum['col_idx'] + # First try to parse the year from the human serum data + year = new_serum_id = None + for field in potential_year_fields: + matches = re.match(year_regex, human_serum[field]) + # Use the first match of the potential fields + if matches is not None: + year = matches.group(2) + # Follow a standard pattern where serum_id is `Human pool ` + # Need "human" in serum_id because this is how we match for human sera in seasonal flu + # + new_serum_id = f"Human pool {year}" + break + + # year is required to know which vaccine reference strain to use + # Raise an error because this info should _always_ be available + if year is None: + raise Exception(f"Unable to process human sera column {column} ", + f"because none of {potential_year_fields} fields ", + f"matched the year regex {year_regex!r}") + + # Then try to parse egg or cell from the human serum data + egg_or_cell = None + for field in potential_egg_or_cell_fields: + matches = re.match(egg_or_cell_regex, human_serum[field], re.IGNORECASE) + # Use the first match of the potential fields + if matches is not None: + egg_or_cell = matches.group(1).lower() + break + + # egg_or_cell is required to know which vaccine reference strain to use, + # so skip the human serum if it can't be parsed + # Only outputting a warning because I've seen Excel worksheets _without_ + # any egg/cell distinctions from 2023. This will require extra correspondence + # with VIDRL, so don't let it block ingest of other data. + # -Jover, 28 August 2024 + if egg_or_cell is None: + print(f"WARNING: Skipping human sera column {column} ", + f"because none of {potential_egg_or_cell_fields} fields ", + f"matched the regex {egg_or_cell_regex!r}") + continue + + # Raise a loud error so we know to update the VACCINE_MAPPING as needed + try: + serum_strain = VACCINE_MAPPING[year][egg_or_cell][subtype] + except KeyError as err: + raise Exception(f"VACCINE_MAPPING needs to be updated!") from err + + human_serum_references[column] = { + "serum_id": new_serum_id, + "serum_passage": egg_or_cell, + "serum_strain": serum_strain + } + + return human_serum_references + + +def read_vidrl(path, fstem, assay_type, subtype, human_ref_only): ''' Read all csv tables in path, create data frame with reference viruses as columns ''' @@ -34,12 +144,12 @@ def read_vidrl(path, fstem, assay_type): if True in exten: ind = exten.index(True) - convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type) + convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only) else: print("Unable to recognize file {}/{}".format(path,fstem)) sys.exit() -def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): +def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only): exts = ['.xls', '.xlsm', '.xlsx'] workbook = xlrd.open_workbook(path+fstem + exts[ind]) @@ -49,6 +159,7 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): serum_id_pattern = r"^[A-Z]\d{4,8}" serum_passage_pattern = r"(MDCK\d+|SIAT\d+|E\d+)" serum_abbrev_pattern = r"\w+\s{0,1}\w+/\d+.*" + human_serum_pattern = r"(^SH\d+|SHVAX|SHvax|sera|vaxpool).*" crick = False for worksheet_index, worksheet in enumerate(workbook.sheets(), start=1): @@ -86,6 +197,8 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): serum_id_pattern=serum_id_pattern, serum_passage_pattern=serum_passage_pattern, serum_abbrev_pattern=serum_abbrev_pattern, + ignore_serum_pattern=human_serum_pattern, + log_human_sera=True, crick=crick, ) @@ -128,6 +241,12 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): # } # print(f"corrected: serum_mapping={json.dumps(serum_mapping, indent=4)}") + human_serum_references = parse_human_serum_references(serum_block['human_serum_data'], args.subtype) + + print("Human pooled serum references parsed from serum block") + for col, values in human_serum_references.items(): + print(f"Column {col!r}: {values}") + # Check if all the necessary indices were found if virus_block["virus_col_idx"] is None: print(f"Virus column index not found. Check the virus pattern: '{virus_pattern}'") @@ -175,11 +294,22 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type): virus_strain = str(mat.cell_value(i,virus_strain_col_index)).strip() virus_passage = str(mat.cell_value(i,virus_passage_col_index)).strip() for j in range(col_start, (col_end+1)): - serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','') - serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip() - serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip() - serum_abbr = serum_abbr.replace(' ','') - serum_strain = serum_mapping.get(serum_abbr, serum_abbr) + # Special handling of human pooled sera that were matched to + # vaccine reference strain instead of the normal serum mapping + if j in human_serum_references: + serum_id = human_serum_references[j]['serum_id'] + serum_passage = human_serum_references[j]['serum_passage'] + serum_strain = human_serum_references[j]['serum_strain'] + # Skip other titer measurements if we only want to ingest human serum references + elif human_ref_only: + continue + else: + serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','') + serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip() + serum_abbr = str(mat.cell_value(serum_strain_row_index,j)).strip() + serum_abbr = serum_abbr.replace(' ','') + serum_strain = serum_mapping.get(serum_abbr, serum_abbr) + titer = str(mat.cell_value(i,j)).strip() line = "%s\n" % ("\t".join([ virus_strain, serum_strain, serum_id, titer, source, virus_passage, virus_passage_category, serum_passage, serum_passage_category, assay_type])) outfile.write(line) @@ -206,6 +336,11 @@ def read_flat_vidrl(path, fstem, assay_type): if __name__=="__main__": args = parser.parse_args() + # Asserting here because this is using a shared parser + # other tdb scripts do not require subtype + assert args.subtype is not None, "Subtype needs to be specified with --subtype" + assert args.subtype in EXPECTED_SUBTYPES, f"Subtype must be one of {EXPECTED_SUBTYPES!r}" + if args.path is None: args.path = "data/" else: @@ -219,16 +354,13 @@ def read_flat_vidrl(path, fstem, assay_type): if args.ftype == "flat": read_flat_vidrl(args.path, args.fstem, args.assay_type) else: - read_vidrl(args.path, args.fstem, args.assay_type) - - if args.subtype: - if args.preview: - command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview" - print(command) - subprocess.call(command, shell=True) - else: - command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem - print(command) - subprocess.call(command, shell=True) + read_vidrl(args.path, args.fstem, args.assay_type, args.subtype, args.human_ref_only) + + if args.preview: + command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview" + print(command) + subprocess.call(command, shell=True) else: - print("Subtype needs to be specified with --subtype") + command = "python tdb/elife_upload.py -db " + args.database + " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + print(command) + subprocess.call(command, shell=True)