From 599130744697f1fbd70b8df1a43ea35b28d2139a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sara=20Monz=C3=B3n?=
Date: Mon, 5 Aug 2024 17:21:47 +0200
Subject: [PATCH] rename haplotype helpers, fix edge REF removal, and add consensus haplotype filtering

---
 bin/ivar_variants_to_vcf.py | 117 +++++++++++++++---------------------
 1 file changed, 50 insertions(+), 67 deletions(-)

diff --git a/bin/ivar_variants_to_vcf.py b/bin/ivar_variants_to_vcf.py
index 59cdb636..6925d526 100755
--- a/bin/ivar_variants_to_vcf.py
+++ b/bin/ivar_variants_to_vcf.py
@@ -366,7 +366,7 @@ def merge_rows(self, consec_rows):
         merged_row = consec_rows.loc[merged_index].values.tolist()
         return merged_row
 
-    def create_merge_rowlist(self, clean_rows_list):
+    def merge_hap_vcf(self, clean_rows_list):
         """Merge all the given rows in a single one for each dataframe of
         consecutive rows in a given list
 
@@ -412,9 +412,9 @@ def handle_dup_rows(self, row_set):
         merged_rowlist = []
         for indexlist in index_product_list:
             consec_rows = row_set.loc[indexlist, :]
-            clean_rows_list = self.merge_ref_alt(consec_rows.copy())
+            clean_rows_list = self.get_haplotypes(consec_rows.copy())
             cleaned_ref_rows_list = self.remove_edge_ref(clean_rows_list)
-            batch_rowlist = self.create_merge_rowlist(cleaned_ref_rows_list.copy())
+            batch_rowlist = self.merge_hap_vcf(cleaned_ref_rows_list.copy())
             merged_rowlist.extend(batch_rowlist)
         return merged_rowlist
 
@@ -511,7 +511,7 @@ def merge_rule_check(self, alt_dictlist):
                consec_series.append(altdict)
        return consec_series
 
-    def merge_ref_alt(self, consec_rows):
+    def get_haplotypes(self, consec_rows):
         """Create a list of all possible combinations of REF and ALT consecutive
         codons following certain conditions of similarity using Allele Frequency values
 
@@ -521,7 +521,7 @@
         Returns:
             clean_rows_list (list(pd.DataFrame)): Filtered list with viable combinations
         """
-        # Compare variants AF with REF and group those with more similarity
+        # Create all possible combinations between REF and ALT (different haplotypes) for consecutive variants
         merged_ref_rows = self.get_ref_rowset(consec_rows.copy())
         merged_ref_rows["AF"] = merged_ref_rows["FILENAME"].str.split(":").str[8]
         alt_rows = merged_ref_rows[
@@ -538,46 +538,47 @@
             x: {"AF": float(y), "set": "alt"} for x, y in alt_rows["AF"].to_dict().items()
         }
-        alt_combinations = list(
+        hap_combinations = list(
             product(*[[(k, alt_dict[k]), (k, ref_dict[k])] for k in alt_dict.keys()])
         )
-        combined_dictlist = [dict(comb) for comb in alt_combinations]
-        # Keep together only those codons that fulfill certain similarity rules
+        hap_comb_dict = [dict(comb) for comb in hap_combinations]
 
-        consec_series = self.merge_rule_check(combined_dictlist)
+        # Keep together only those haplotypes that fulfill certain similarity rules
+        valid_hap = self.merge_rule_check(hap_comb_dict)
+
+        if self.consensus_merge:
+            valid_hap_fil = []
+
+            # Process each candidate haplotype in the list
+            for hap in valid_hap:
+                # Keep the haplotype whole only if every variant reaches the consensus AF threshold
+                valid = all(value['AF'] > self.consensus_af for value in hap.values())
+                if valid:
+                    valid_hap_fil.append(hap)
+                else:
+                    # Create a list of single-variant dicts for items that do not meet the threshold
+                    separate_dicts = [{key: value} for key, value in hap.items() if value['AF'] <= self.consensus_af]
+
+                    # Extend valid_hap_fil with separate_dicts, skipping duplicates
+                    for sep in separate_dicts:
+                        if sep not in valid_hap_fil:
+                            valid_hap_fil.append(sep)
+        else:
+            valid_hap_fil = valid_hap
         clean_rows_list = []
-        for rowdict in consec_series:
+        for hap in valid_hap_fil:
             consec_rowsdict = {}
-            for key, vals in rowdict.items():
+            for key, vals in hap.items():
                 if vals["set"] == "alt":
                     consec_rowsdict[key] = alt_rows.loc[int(key)]
                 else:
                     consec_rowsdict[key] = ref_rows.loc[int(key)]
             consec_df = pd.DataFrame.from_dict(consec_rowsdict, orient="index")
-            if self.consensus_merge is True:
-                if any(x >= self.consensus_af for x in consec_df["AF"].astype(float)):
-                    consensus_dfs = consec_df.groupby(
-                        consec_df["AF"].astype(float) >= self.consensus_af
-                    )
-                    for af_in_consensus, df in consensus_dfs:
-                        df = df.drop("AF", axis=1)
-                        if af_in_consensus is True:
-                            if not self.find_consecutive(df).empty:
-                                clean_rows_list.append(df)
-                            else:
-                                for _, row in df.groupby("POS"):
-                                    clean_rows_list.append(row)
-                        else:
-                            for _, row in df.groupby("POS"):
-                                clean_rows_list.append(row)
-                else:
-                    for _, row in consec_df.drop("AF", axis=1).groupby("POS"):
-                        clean_rows_list.append(row)
-            else:
-                clean_loc_df = consec_df.drop("AF", axis=1)
-                if not clean_loc_df.empty:
-                    clean_rows_list.append(clean_loc_df)
+
+            clean_loc_df = consec_df.drop("AF", axis=1)
+            if not clean_loc_df.empty:
+                clean_rows_list.append(clean_loc_df)
         return clean_rows_list
 
     def remove_edge_ref(self, clean_rows_list):
@@ -593,24 +594,6 @@ def indexes_are_consecutive(idx_list):
             """Returns True if ints in list are consecutive, or just 1 element"""
             return sorted(idx_list) == list(range(min(idx_list), max(idx_list) + 1))
 
-        def remove_subsets(cleaned_ref_rows_list):
-            """Remove those dataframes which are subsets of another one in the list"""
-
-            def is_subset(df1, df2):
-                """Returns True if df1 is a subset of df2"""
-                return len(df1.merge(df2)) == len(df1)
-
-            max_length = max(len(df) for df in cleaned_ref_rows_list)
-            largest_dfs = [df for df in cleaned_ref_rows_list if len(df) == max_length]
-            other_dfs = [df for df in cleaned_ref_rows_list if len(df) < max_length]
-            if not other_dfs:
-                return largest_dfs
-            final_ref_rows_list = largest_dfs
-            for smalldf in other_dfs:
-                if not any(is_subset(smalldf, bigdf) for bigdf in largest_dfs):
-                    final_ref_rows_list.append(smalldf)
-            return final_ref_rows_list
-
         cleaned_ref_rows_list = []
         for df in clean_rows_list:
             ref_col = df["REF"]
@@ -626,9 +609,8 @@ def is_subset(df1, df2):
                df = df.drop(idx_matches, axis=0)
            if not df.empty:
                cleaned_ref_rows_list.append(df)
-        final_ref_rows_list = remove_subsets(cleaned_ref_rows_list)
 
-        return final_ref_rows_list
+        return cleaned_ref_rows_list
 
     def process_vcf_df(self, vcf_df):
         """Merge rows with consecutive SNPs that passed all filters and without NAs
@@ -652,26 +634,26 @@ def include_rows(vcf_df, last_index, rows_to_merge):
 
         # Clean ivar tsv duplicates due to duplicated annotation
         vcf_df = vcf_df.drop_duplicates(subset=["REGION", "POS", "REF", "ALT"])
-        clean_vcf_df = vcf_df[vcf_df["INFO"] == "TYPE=SNP"]
-        clean_vcf_df = clean_vcf_df[clean_vcf_df["FILTER"] == "PASS"].dropna()
-        consecutive_df = self.find_consecutive(clean_vcf_df)
+        vcf_df_fil = vcf_df[vcf_df["INFO"] == "TYPE=SNP"]
+        vcf_df_fil = vcf_df_fil[vcf_df_fil["FILTER"] == "PASS"].dropna()
+        consecutive_df = self.find_consecutive(vcf_df_fil)
         if consecutive_df.empty:
             return vcf_df
-        splitted_groups = self.split_non_consecutive(consecutive_df)
-        same_codon_consecutive = self.get_same_codon(splitted_groups)
-        split_rows_dict = self.split_by_codon(same_codon_consecutive)
-        for _, row_set in sorted(split_rows_dict.items()):
+        consecutive_split = self.split_non_consecutive(consecutive_df)
+        codon_split = self.get_same_codon(consecutive_split)
+        codon_split_dict = self.split_by_codon(codon_split)
+        for _, row_set in sorted(codon_split_dict.items()):
             last_index = vcf_df.tail(1).index[0] + 1
             vcf_df = vcf_df.drop(row_set.Index)
             # Redundant "Index" column is generated by Itertuples in split_by_codon()
             row_set = row_set.drop(["Index"], axis=1)
             if self.find_duplicates(row_set):
-                rows_to_merge = self.handle_dup_rows(row_set.copy())
+                haps_merged = self.handle_dup_rows(row_set.copy())
             else:
-                clean_rows_list = self.merge_ref_alt(row_set.copy())
-                cleaned_ref_rows_list = self.remove_edge_ref(clean_rows_list)
-                rows_to_merge = self.create_merge_rowlist(cleaned_ref_rows_list.copy())
-            vcf_df = include_rows(vcf_df, last_index, rows_to_merge)
+                hap_rows = self.get_haplotypes(row_set.copy())
+                hap_rows_noref = self.remove_edge_ref(hap_rows)
+                haps_merged = self.merge_hap_vcf(hap_rows_noref.copy())
+            vcf_df = include_rows(vcf_df, last_index, haps_merged)
         vcf_df = vcf_df[vcf_df["REF"] != vcf_df["ALT"]]
         vcf_df = vcf_df.sort_index().reset_index(drop=True)
         processed_vcf_df = vcf_df.drop_duplicates().sort_values("POS")
@@ -752,7 +734,7 @@ def export_vcf(vcf_table, consensus=True):
                filepath = self.file_out
            else:
                basename = os.path.splitext(os.path.basename(self.file_out))[0]
-               filename = str(basename) + "_merge_annot.vcf"
+               filename = str(basename) + "_all_hap.vcf"
                filepath = os.path.join(os.path.dirname(self.file_out), filename)
            with open(filepath, "w") as file_out:
                file_out.write(vcf_header + "\n")
@@ -781,6 +763,7 @@ def make_dir(path):
 
 def main(args=None):
     args = parse_args(args)
+
     ivar_to_vcf = IvarVariants(
         file_in=args.file_in,
         file_out=args.file_out,
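
The consensus handling that this patch moves into get_haplotypes() comes down to one rule: a candidate haplotype is kept whole only when every variant in it exceeds the consensus allele-frequency threshold; otherwise its sub-threshold variants are carried forward individually. A minimal standalone sketch of that rule, assuming the same haplotype layout as the hunk above (dict of row index to a per-variant dict containing "AF") and using a hypothetical helper name that is not part of the script:

from typing import Dict, List

# One candidate haplotype: {row index: {"AF": allele frequency, ...}}
# (the script also stores a "set" key per variant, which this filter ignores)
Haplotype = Dict[int, Dict[str, float]]


def filter_consensus_haplotypes(haplotypes: List[Haplotype], consensus_af: float) -> List[Haplotype]:
    """Keep a haplotype intact only if all of its variants reach consensus_af;
    otherwise emit its sub-threshold variants as single-variant haplotypes."""
    filtered: List[Haplotype] = []
    for hap in haplotypes:
        if all(variant["AF"] > consensus_af for variant in hap.values()):
            filtered.append(hap)
        else:
            # Mirroring the hunk above: only the sub-threshold variants are
            # re-added here; above-threshold ones survive in other combinations.
            for idx, variant in hap.items():
                if variant["AF"] <= consensus_af:
                    single = {idx: variant}
                    if single not in filtered:
                        filtered.append(single)
    return filtered


if __name__ == "__main__":
    candidates = [
        {100: {"AF": 0.96}, 101: {"AF": 0.94}},  # all above threshold: kept whole
        {100: {"AF": 0.96}, 101: {"AF": 0.20}},  # mixed: only position 101 is split out
    ]
    print(filter_consensus_haplotypes(candidates, consensus_af=0.75))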