From 21cbcd86910a57862cbff1db9468aea0db851bb0 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 4 Nov 2024 15:24:49 -0800 Subject: [PATCH] read_flat_vidrl: Clean up `virus_strain` that includes "pool" suffix Based on a meeting with VIDRL, we should only keep homologous titers for `virus_strain` values that include the "pool" suffix. These will act as proxy homologous titers for the human serum references. All other measurements whose `virus_strain` includes the "pool" suffix are skipped because they duplicate the proxy measurements. --- tdb/vidrl_upload.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py index 992bb91..069ff90 100644 --- a/tdb/vidrl_upload.py +++ b/tdb/vidrl_upload.py @@ -389,6 +389,8 @@ def curate_reference_panel_records( "antisera passage": "serum_passage", "ferret": "serum_id", "titre": "titer", + # Used for cleaning up `virus_strain` that includes "pool" suffix + "homologous": "homologous" } for record in records: @@ -408,8 +410,16 @@ def curate_reference_panel_records( new_record = standardize_human_serum(new_record, "virus_passage") - # TODO: Clean up `virus_strain` that includes "pool" suffix - # Should these be dropped completely because they are not "real" measurements? + # Clean up `virus_strain` that includes "pool" suffix + # Strip "pool" suffix and keep as proxy of homologous titer + # for the human serum pool reference. Skip measurements that are not + # marked as homologous since they are just duplicates of the proxy measurements + # -Jover, 04 November 2024 + if re.match(r".*pool$", new_record["virus_strain"]): + if new_record["homologous"] == "TRUE": + new_record["virus_strain"] = re.sub(r"pool$", "", new_record["virus_strain"]) + else: + continue yield new_record