Merge pull request #75 from matsengrp/74_add_outputs_aggregate_organisms

Add outputs to the aggregate organisms module
matsengrp · Jul 15, 2024 · 14a43fc · 14a43fc
2 parents 0a36357 + 52e4cdf
commit 14a43fc
Show file tree

Hide file tree

Showing 2 changed files with 91 additions and 29 deletions.
diff --git a/templates/aggregate_organisms.py b/templates/aggregate_organisms.py
@@ -4,6 +4,7 @@
 from typing import List
 import pandas as pd
 import logging
+from scipy.stats import gmean
 
 # APPROACH
 
@@ -18,7 +19,7 @@
 #       marked as FALSE if both replicates are below the threshold Z-score
 #       marked as DISCORDANT if some but not all replicates are above the threshold Z-score
 #   Public: marked as TRUE if the epitope was included in the input list of public epitopes
- 
+
 # 3. To combine the virus-level data for each sample, only keep the highest-scoring
 # set of epitopes which do not overlap with any other epitope by more than 7aa.
 # To identify overlaps, use an exact alignment approach using k-mers. Note that this
@@ -34,7 +35,7 @@
 #   Max EBS across public epitopes
 #   Mean EBS across all epitopes
 #   Mean EBS across public epitopes
- 
+
 
 # INPUTS
 
@@ -91,6 +92,10 @@ def __init__(self):
         self.logger.info("Grouping replicates by sample")
         self.sample_table = self.group_replicates()
 
+        # Apply the max_overlap filter
+        # (setting the column 'passes_filter' to True if the peptide passes)
+        self.sample_table = self.apply_max_overlap_filter()
+
         # Save to CSV
         self.sample_table.to_csv("!{sample_id}.peptide.ebs.csv.gz", index=None)
 
@@ -246,7 +251,11 @@ def group_replicates(self) -> pd.DataFrame:
             n_replicates=len(replicates),
             EBS=df.mean(axis=1),
             hit=df.apply(self.classify_hit, axis=1),
-            edgeR_hit=df.apply(self.classify_edgeR_hit, axis=1) if self.edgeR_hits is not None else None,
+            edgeR_hit=(
+                df.apply(self.classify_edgeR_hit, axis=1)
+                if self.edgeR_hits is not None
+                else None
+            ),
             sample='!{sample_id}'
         ).reset_index(
         ).rename(
@@ -267,7 +276,8 @@ def group_replicates(self) -> pd.DataFrame:
     def classify_hit(self, r):
         """Determine whether a peptide is a hit, or discordant."""
 
-        # Get the vector of whether each replicate is above the z-score threshold
+        # Get the vector of whether each replicate
+        # is above the z-score threshold
         hit_vec = r > self.zscore_threshold
 
         # Determine the hit type
@@ -279,7 +289,10 @@ def classify_hit(self, r):
             return "DISCORDANT"
 
     def classify_edgeR_hit(self, r):
-        """Determine whether a peptide is a hit, or discordant - based on edgeR hits."""
+        """
+        Determine whether a peptide is a hit, or discordant - 
+        based on edgeR hits.
+        """
 
         # Determine the hit type
         if r.all():
@@ -289,27 +302,27 @@ def classify_edgeR_hit(self, r):
         else:
             return "DISCORDANT"
 
-    def group_organisms(self) -> pd.DataFrame:
-        """Group together the results by organism."""
+    def apply_max_overlap_filter(self) -> pd.DataFrame:
+        """Apply the max_overlap filter to each sample/organism."""
 
-        # Analyze each organism independently
+        # Analyze each sample/organism independently
         df = pd.concat([
-            self.group_sample_organisms(d, sample, organism)
-            for (sample, organism), d in self.sample_table.assign(
+            self.apply_max_overlap_filter_sub(d)
+            for _, d in self.sample_table.assign(
                 organism=lambda d: d["peptide"].apply(
                     self.peptide_mapping["organism"].get
                 )
             ).groupby(
                 ["sample", "organism"]
             )
-        ]).fillna(
-            0
-        )
+        ])
 
         return df
 
-    def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> pd.DataFrame:
-        """Analyze the data for a single sample, single organism."""
+    def apply_max_overlap_filter_sub(
+        self,
+        df: pd.DataFrame
+    ) -> pd.DataFrame:
 
         # Add the sequence information for each peptide
         df = df.assign(
@@ -326,31 +339,71 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
         # Keep track of the peptide kmers which have been observed so far
         kmers_seen = set()
 
-        # Make a list of the indices which will be dropped
-        to_drop = list()
+        # Make a list of the indices pass the filter
+        passes_filter = list()
 
         # Go down the list, starting with the tightest binders
-        for i, r in df.iterrows():
+        for _, r in df.iterrows():
 
             # Get the kmers by this peptide
             row_kmers = set([
                 r["seq"][n:(n + self.max_overlap)]
                 for n in range(len(r["seq"]) - self.max_overlap)
             ])
 
-            # If any of those kmers have been seen before
-            if len(row_kmers & kmers_seen) > 0:
-
-                # Drop the row
-                to_drop.append(i)
+            # If none of those kmers have been seen before,
+            # it passes the filter
+            passes_filter.append(len(row_kmers & kmers_seen) == 0)
 
-            # If not
-            else:
+            # If it passes
+            if passes_filter[-1]:
 
                 # Add the covered positions
                 kmers_seen |= row_kmers
 
-        df = df.drop(index=to_drop)
+        # Add a column to the table indicating
+        # whether the peptide passes the filter
+        df = df.assign(
+            passes_filter=passes_filter
+        )
+
+        # Drop the sequence column
+        return (
+            df
+            .drop(columns=["seq"])
+            .sort_index()
+        )
+
+    def group_organisms(self) -> pd.DataFrame:
+        """Group together the results by organism."""
+
+        # Analyze each organism independently
+        df = pd.concat([
+            self.group_sample_organisms(d, sample, organism)
+            for (sample, organism), d in self.sample_table.assign(
+                organism=lambda d: d["peptide"].apply(
+                    self.peptide_mapping["organism"].get
+                )
+            ).groupby(
+                ["sample", "organism"]
+            )
+        ]).fillna(
+            0
+        )
+
+        return df
+
+    def group_sample_organisms(
+        self,
+        df: pd.DataFrame,
+        sample: str,
+        organism: str
+    ) -> pd.DataFrame:
+
+        """Analyze the data for a single sample, single organism."""
+
+        # For this summary, drop peptides which don't pass the filter
+        df = df.query("passes_filter")
 
         # Return the number of hits, etc. for all and just public epitopes
         dat = pd.DataFrame([{
@@ -370,9 +423,15 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
                     (f"n_edgeR_hits_{label}", (d["edgeR_hit"] == "TRUE").sum()),
                     (f"n_edgeR_discordant_{label}", (d["edgeR_hit"] == "DISCORDANT").sum()),
                     (f"max_ebs_{label}", d["EBS"].max()),
-                    (f"mean_ebs_{label}", d["EBS"].mean())
+                    (f"mean_ebs_{label}", d["EBS"].mean()),
+                    (f"gmean_ebs_{label}", gmean(d["EBS"]))
+                ]
+                if k not in [
+                    "n_hits_hits",
+                    "n_discordant_hits",
+                    "gmean_ebs_all",
+                    "gmean_ebs_public"
                 ]
-                if k not in ["n_hits_hits", "n_discordant_hits"]
             }
         }])
 

diff --git a/workflows/alignment.nf b/workflows/alignment.nf
@@ -139,7 +139,10 @@ workflow ALIGN {
                 tuple(
                     "peptide_ref",
                     row.sample_id,
-                    file("$params.reads_prefix/$row.fastq_filepath")
+                    file(
+                        "$params.reads_prefix/$row.fastq_filepath",
+                        checkIfExists:true
+                    )
                 ) 
             }
             .set { samples_ch }