Add a column to the peptide table indicating whether it passes the overlap filter
matsengrp · Jul 15, 2024 · 52e4cdf · 52e4cdf
1 parent 7faf8c9
commit 52e4cdf
Showing 1 changed file with 63 additions and 25 deletions.
diff --git a/templates/aggregate_organisms.py b/templates/aggregate_organisms.py
@@ -92,6 +92,10 @@ def __init__(self):
         self.logger.info("Grouping replicates by sample")
         self.sample_table = self.group_replicates()
 
+        # Apply the max_overlap filter
+        # (setting the column 'passes_filter' to True if the peptide passes)
+        self.sample_table = self.apply_max_overlap_filter()
+
         # Save to CSV
         self.sample_table.to_csv("!{sample_id}.peptide.ebs.csv.gz", index=None)
 
@@ -298,34 +302,28 @@ def classify_edgeR_hit(self, r):
         else:
             return "DISCORDANT"
 
-    def group_organisms(self) -> pd.DataFrame:
-        """Group together the results by organism."""
+    def apply_max_overlap_filter(self) -> pd.DataFrame:
+        """Apply the max_overlap filter to each sample/organism."""
 
-        # Analyze each organism independently
+        # Analyze each sample/organism independently
         df = pd.concat([
-            self.group_sample_organisms(d, sample, organism)
-            for (sample, organism), d in self.sample_table.assign(
+            self.apply_max_overlap_filter_sub(d)
+            for _, d in self.sample_table.assign(
                 organism=lambda d: d["peptide"].apply(
                     self.peptide_mapping["organism"].get
                 )
             ).groupby(
                 ["sample", "organism"]
             )
-        ]).fillna(
-            0
-        )
+        ])
 
         return df
 
-    def group_sample_organisms(
+    def apply_max_overlap_filter_sub(
         self,
-        df: pd.DataFrame,
-        sample: str,
-        organism: str
+        df: pd.DataFrame
     ) -> pd.DataFrame:
 
-        """Analyze the data for a single sample, single organism."""
-
         # Add the sequence information for each peptide
         df = df.assign(
             seq=df["peptide"].apply(
@@ -341,31 +339,71 @@ def group_sample_organisms(
         # Keep track of the peptide kmers which have been observed so far
         kmers_seen = set()
 
-        # Make a list of the indices which will be dropped
-        to_drop = list()
+        # Make a list of booleans recording whether each row passes the filter
+        passes_filter = list()
 
         # Go down the list, starting with the tightest binders
-        for i, r in df.iterrows():
+        for _, r in df.iterrows():
 
             # Get the kmers by this peptide
             row_kmers = set([
                 r["seq"][n:(n + self.max_overlap)]
                 for n in range(len(r["seq"]) - self.max_overlap)
             ])
 
-            # If any of those kmers have been seen before
-            if len(row_kmers & kmers_seen) > 0:
-
-                # Drop the row
-                to_drop.append(i)
+            # If none of those kmers have been seen before,
+            # it passes the filter
+            passes_filter.append(len(row_kmers & kmers_seen) == 0)
 
-            # If not
-            else:
+            # If it passes
+            if passes_filter[-1]:
 
                 # Add the covered positions
                 kmers_seen |= row_kmers
 
-        df = df.drop(index=to_drop)
+        # Add a column to the table indicating
+        # whether the peptide passes the filter
+        df = df.assign(
+            passes_filter=passes_filter
+        )
+
+        # Drop the sequence column
+        return (
+            df
+            .drop(columns=["seq"])
+            .sort_index()
+        )
+
+    def group_organisms(self) -> pd.DataFrame:
+        """Group together the results by organism."""
+
+        # Analyze each organism independently
+        df = pd.concat([
+            self.group_sample_organisms(d, sample, organism)
+            for (sample, organism), d in self.sample_table.assign(
+                organism=lambda d: d["peptide"].apply(
+                    self.peptide_mapping["organism"].get
+                )
+            ).groupby(
+                ["sample", "organism"]
+            )
+        ]).fillna(
+            0
+        )
+
+        return df
+
+    def group_sample_organisms(
+        self,
+        df: pd.DataFrame,
+        sample: str,
+        organism: str
+    ) -> pd.DataFrame:
+
+        """Analyze the data for a single sample, single organism."""
+
+        # For this summary, drop peptides which don't pass the filter
+        df = df.query("passes_filter")
 
         # Return the number of hits, etc. for all and just public epitopes
         dat = pd.DataFrame([{