From 614afadf9bd8a65c6f64dd9b5f51ee433abd296c Mon Sep 17 00:00:00 2001
From: Jover Lee <joverlee521@gmail.com>
Date: Mon, 4 Nov 2024 17:24:55 -0800
Subject: [PATCH] read_flat_vidrl: Check for potential duplicate
 _reference_panel files

Based on meeting with VIDRL, a/b and _1/_2 reference panel files are
created from the same Excel file so they are duplicates while capital
A/B files are separate assays.

So, this change allows us to check for the a/b and _1/_2 patterns and
ignore the reference panel file if it's a duplicate. This means we
always ingest the a or _1 file but ignore the b and _2 files.
---
 tdb/vidrl_upload.py | 54 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/tdb/vidrl_upload.py b/tdb/vidrl_upload.py
index 365fe0e..82c2bfe 100644
--- a/tdb/vidrl_upload.py
+++ b/tdb/vidrl_upload.py
@@ -1,4 +1,4 @@
-import os, re, time, datetime, csv, sys, json, errno
+import os, re, time, datetime, csv, sys, json, errno, filecmp
 import pandas as pd
 from upload import upload
 from rethinkdb import r
@@ -515,6 +515,52 @@ def validate_records(records: Iterator[dict]) -> Tuple[Iterator[dict], str]:
     return (serum_abbr_map, test_date)
 
 
+def get_ref_panel_filepath(fstem, path) -> Optional[str]:
+    """
+    Return the filepath of the *fstem* CSV file in *path* if it should be ingested.
+
+    Returns None, with a warning on stderr, if the file does not exist or if it
+    is a `b`/`_2` file with the same content as its existing `a`/`_1` sibling.
+    Raises RuntimeError if the duplicate patterns in the body are out of sync.
+    Note: depends on the user having all flat files locally and ingesting the first file.
+    """
+    reference_filepath = path + fstem + ".csv"
+    if not os.path.isfile(reference_filepath):
+        print(f"WARNING: Coupled reference panel file {reference_filepath!r} does not exist.", file=sys.stderr)
+        return None
+
+    # Check if the file is a potential duplicate where one Excel file got split into multiple flat files.
+    # Look for `b` or `_2` files that are potentially duplicates of `a` or `_1` files
+    # We are ignoring the capital A/B patterns because these indicate separate assays.
+    char_pattern = r"b"
+    num_pattern  = r"_2"
+    dup_ref_pattern = rf"(\d*(?:_?[A-Z])?)({char_pattern}|{num_pattern})(\.xlsx.*)"
+    dup_match = re.match(dup_ref_pattern, fstem)
+    if dup_match:
+        # Construct the filepath for the first potential file that was ingested, e.g.
+        # 0612b.xlsx_H3_reference_panel.csv    ->  0612a.xlsx_H3_reference_panel.csv
+        # 0710_B_2.xlsx_H3_reference_panel.csv ->  0710_B_1.xlsx_H3_reference_panel.csv
+        # 0717B_2.xlsx_H3_reference_panel.csv  ->  0717B_1.xlsx_H3_reference_panel.csv
+        if re.match(char_pattern, dup_match.group(2)):
+            first_pattern = "a"
+        elif re.match(num_pattern, dup_match.group(2)):
+            first_pattern = "_1"
+        else:
+            # This should only occur if the `dup_ref_pattern` is out of sync with the `char_pattern` and `num_pattern`
+            raise RuntimeError(f"Unable to match reference {dup_match.group(2)!r} to {char_pattern} or {num_pattern}")
+
+        first_fstem = f"{dup_match.group(1)}{first_pattern}{dup_match.group(3)}"
+        first_filepath = path + first_fstem + ".csv"
+
+        # If the first potential file exists and has the same content as the
+        # current file, then ignore the current file.
+        if os.path.isfile(first_filepath) and filecmp.cmp(first_filepath, reference_filepath, shallow=False):
+            print(f"WARNING: Ignoring reference panel file {fstem!r} because it is duplicate of {first_fstem!r}", file=sys.stderr)
+            return None
+
+    return reference_filepath
+
+
 def read_flat_vidrl(path, fstem, assay_type):
     """
     Read the flat CSV file with *fstem* in the provided *path* and convert
@@ -530,9 +576,9 @@ def read_flat_vidrl(path, fstem, assay_type):
     write_records_to_tsv(validated_records, output_filepath)
 
     reference_fstem = fstem.replace("_flat_file", "_reference_panel")
-    reference_filepath = path + reference_fstem + ".csv"
+    reference_filepath = get_ref_panel_filepath(reference_fstem, path)
     serum_abbr_map, test_date = validated_records.return_value
-    if os.path.isfile(reference_filepath):
+    if reference_filepath:
         reference_records = read_csv_to_dict(reference_filepath)
         curated_reference_records = curate_reference_panel_records(
             reference_records,
@@ -542,8 +588,6 @@ def read_flat_vidrl(path, fstem, assay_type):
             assay_type)
         # Append to the same temp file as the flat_file.csv records
         write_records_to_tsv(curated_reference_records, output_filepath, "a")
-    else:
-        print(f"WARNING: Coupled reference panel file {reference_filepath!r} does not exist.", file=sys.stderr)
 
 
 if __name__=="__main__":