From 21cbcd86910a57862cbff1db9468aea0db851bb0 Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 4 Nov 2024 15:24:49 -0800
Subject: [PATCH] read_flat_vidrl: Clean up `virus_strain` that includes "pool"
 suffix

Based on a meeting with VIDRL, we should only keep homologous titers
for a `virus_strain` that includes the "pool" suffix. These act as proxy
homologous titers for the human serum pool references. All other
measurements whose `virus_strain` includes the "pool" suffix are ignored
because they are duplicate data.
---
 tdb/vidrl_upload.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index 992bb91..069ff90 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -389,6 +389,8 @@ def curate_reference_panel_records(
         "antisera passage": "serum_passage",
         "ferret": "serum_id",
         "titre": "titer",
+        # Used for cleaning up `virus_strain` that includes "pool" suffix
+        "homologous": "homologous"
     }
 
     for record in records:
@@ -408,8 +410,16 @@ def curate_reference_panel_records(
 
         new_record = standardize_human_serum(new_record, "virus_passage")
 
-        # TODO: Clean up `virus_strain` that includes "pool" suffix
-        # Should these be dropped completely because they are not "real" measurements?
+        # Clean up `virus_strain` that includes "pool" suffix
+        # Strip "pool" suffix and keep as proxy of homologous titer
+        # for the human serum pool reference. Skip measurements that are not
+        # marked as homologous since they are just duplicates of the proxy measurements
+        #   -Jover, 04 November 2024
+        if re.match(r".*pool$", new_record["virus_strain"]):
+            if new_record["homologous"] == "TRUE":
+                new_record["virus_strain"] = re.sub(r"pool$", "", new_record["virus_strain"])
+            else:
+                continue
 
         yield new_record