vidrl_upload: Add --human-ref-only flag

Allows us to ingest only the human sera references while we backfill the data, so that we avoid accidentally duplicating the ferret titer data. This flag can be removed once we have ingested all of the human sera references that were previously skipped.
nextstrain · Aug 27, 2024 · b2403ef · b2403ef
1 parent b36f7cc
commit b2403ef
Showing 1 changed file with 9 additions and 4 deletions.
diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
@@ -14,6 +14,8 @@
 from titer_block import find_titer_block, find_serum_rows, find_virus_columns
 
 parser.add_argument('--assay_type', default='hi')
+parser.add_argument('--human-ref-only', action="store_true",
+    help="Only ingest human sera references, used for backfilling data that was skipped in previous ingests.")
 
 ELIFE_COLUMNS = ["virus_strain", "serum_strain","serum_id", "titer", "source", "virus_passage", "virus_passage_category", "serum_passage", "serum_passage_category", "assay_type"]
 EXPECTED_SUBTYPES = {"h1n1pdm", "h3n2", "vic", "yam"}
@@ -117,20 +119,20 @@ def parse_human_serum_references(human_serum_data, subtype):
     return human_serum_references
 
 
-def read_vidrl(path, fstem, assay_type, subtype):
+def read_vidrl(path, fstem, assay_type, subtype, human_ref_only):
     '''
     Read all csv tables in path, create data frame with reference viruses as columns
     '''
     exten = [ os.path.isfile(path + fstem + ext) for ext in ['.xls', '.xlsm', '.xlsx'] ]
 
     if True in exten:
         ind = exten.index(True)
-        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype)
+        convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only)
     else:
         print("Unable to recognize file {}/{}".format(path,fstem))
         sys.exit()
 
-def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype):
+def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype, human_ref_only):
     exts = ['.xls', '.xlsm', '.xlsx']
     workbook = xlrd.open_workbook(path+fstem + exts[ind])
 
@@ -281,6 +283,9 @@ def convert_vidrl_xls_to_tsv(path, fstem, ind, assay_type, subtype):
                         serum_id = human_serum_references[j]['serum_id']
                         serum_passage = human_serum_references[j]['serum_passage']
                         serum_strain = human_serum_references[j]['serum_strain']
+                    # Skip other titer measurements if we only want to ingest human serum references
+                    elif human_ref_only:
+                        continue
                     else:
                         serum_id = str(mat.cell_value(serum_id_row_index,j)).strip().replace(' ','')
                         serum_passage = str(mat.cell_value(serum_passage_row_index,j)).strip()
@@ -332,7 +337,7 @@ def read_flat_vidrl(path, fstem, assay_type):
     if args.ftype == "flat":
         read_flat_vidrl(args.path, args.fstem, args.assay_type)
     else:
-        read_vidrl(args.path, args.fstem, args.assay_type, args.subtype)
+        read_vidrl(args.path, args.fstem, args.assay_type, args.subtype, args.human_ref_only)
 
     if args.preview:
         command = "python tdb/elife_upload.py -db " + args.database +  " --subtype " + args.subtype + " --path data/tmp/ --fstem " + args.fstem + " --preview"