Skip to content

Commit

Permalink
Merge pull request #75 from matsengrp/74_add_outputs_aggregate_organisms
Browse files Browse the repository at this point in the history
Add outputs to the aggregate organisms module
  • Loading branch information
sminot authored Jul 15, 2024
2 parents 0a36357 + 52e4cdf commit 14a43fc
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 29 deletions.
115 changes: 87 additions & 28 deletions templates/aggregate_organisms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List
import pandas as pd
import logging
from scipy.stats import gmean

# APPROACH

Expand All @@ -18,7 +19,7 @@
# marked as FALSE if both replicates are below the threshold Z-score
# marked as DISCORDANT if some but not all replicates are above the threshold Z-score
# Public: marked as TRUE if the epitope was included in the input list of public epitopes

# 3. To combine the virus-level data for each sample, only keep the highest-scoring
# set of epitopes which do not overlap with any other epitope by more than 7aa.
# To identify overlaps, use an exact alignment approach using k-mers. Note that this
Expand All @@ -34,7 +35,7 @@
# Max EBS across public epitopes
# Mean EBS across all epitopes
# Mean EBS across public epitopes


# INPUTS

Expand Down Expand Up @@ -91,6 +92,10 @@ def __init__(self):
self.logger.info("Grouping replicates by sample")
self.sample_table = self.group_replicates()

# Apply the max_overlap filter
# (setting the column 'passes_filter' to True if the peptide passes)
self.sample_table = self.apply_max_overlap_filter()

# Save to CSV
self.sample_table.to_csv("!{sample_id}.peptide.ebs.csv.gz", index=None)

Expand Down Expand Up @@ -246,7 +251,11 @@ def group_replicates(self) -> pd.DataFrame:
n_replicates=len(replicates),
EBS=df.mean(axis=1),
hit=df.apply(self.classify_hit, axis=1),
edgeR_hit=df.apply(self.classify_edgeR_hit, axis=1) if self.edgeR_hits is not None else None,
edgeR_hit=(
df.apply(self.classify_edgeR_hit, axis=1)
if self.edgeR_hits is not None
else None
),
sample='!{sample_id}'
).reset_index(
).rename(
Expand All @@ -267,7 +276,8 @@ def group_replicates(self) -> pd.DataFrame:
def classify_hit(self, r):
"""Determine whether a peptide is a hit, or discordant."""

# Get the vector of whether each replicate is above the z-score threshold
# Get the vector of whether each replicate
# is above the z-score threshold
hit_vec = r > self.zscore_threshold

# Determine the hit type
Expand All @@ -279,7 +289,10 @@ def classify_hit(self, r):
return "DISCORDANT"

def classify_edgeR_hit(self, r):
"""Determine whether a peptide is a hit, or discordant - based on edgeR hits."""
"""
Determine whether a peptide is a hit, or discordant -
based on edgeR hits.
"""

# Determine the hit type
if r.all():
Expand All @@ -289,27 +302,27 @@ def classify_edgeR_hit(self, r):
else:
return "DISCORDANT"

def group_organisms(self) -> pd.DataFrame:
"""Group together the results by organism."""
def apply_max_overlap_filter(self) -> pd.DataFrame:
"""Apply the max_overlap filter to each sample/organism."""

# Analyze each organism independently
# Analyze each sample/organism independently
df = pd.concat([
self.group_sample_organisms(d, sample, organism)
for (sample, organism), d in self.sample_table.assign(
self.apply_max_overlap_filter_sub(d)
for _, d in self.sample_table.assign(
organism=lambda d: d["peptide"].apply(
self.peptide_mapping["organism"].get
)
).groupby(
["sample", "organism"]
)
]).fillna(
0
)
])

return df

def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> pd.DataFrame:
"""Analyze the data for a single sample, single organism."""
def apply_max_overlap_filter_sub(
self,
df: pd.DataFrame
) -> pd.DataFrame:

# Add the sequence information for each peptide
df = df.assign(
Expand All @@ -326,31 +339,71 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
# Keep track of the peptide kmers which have been observed so far
kmers_seen = set()

# Make a list of the indices which will be dropped
to_drop = list()
# Make a list of the indices pass the filter
passes_filter = list()

# Go down the list, starting with the tightest binders
for i, r in df.iterrows():
for _, r in df.iterrows():

# Get the kmers by this peptide
row_kmers = set([
r["seq"][n:(n + self.max_overlap)]
for n in range(len(r["seq"]) - self.max_overlap)
])

# If any of those kmers have been seen before
if len(row_kmers & kmers_seen) > 0:

# Drop the row
to_drop.append(i)
# If none of those kmers have been seen before,
# it passes the filter
passes_filter.append(len(row_kmers & kmers_seen) == 0)

# If not
else:
# If it passes
if passes_filter[-1]:

# Add the covered positions
kmers_seen |= row_kmers

df = df.drop(index=to_drop)
# Add a column to the table indicating
# whether the peptide passes the filter
df = df.assign(
passes_filter=passes_filter
)

# Drop the sequence column
return (
df
.drop(columns=["seq"])
.sort_index()
)

def group_organisms(self) -> pd.DataFrame:
"""Group together the results by organism."""

# Analyze each organism independently
df = pd.concat([
self.group_sample_organisms(d, sample, organism)
for (sample, organism), d in self.sample_table.assign(
organism=lambda d: d["peptide"].apply(
self.peptide_mapping["organism"].get
)
).groupby(
["sample", "organism"]
)
]).fillna(
0
)

return df

def group_sample_organisms(
self,
df: pd.DataFrame,
sample: str,
organism: str
) -> pd.DataFrame:

"""Analyze the data for a single sample, single organism."""

# For this summary, drop peptides which don't pass the filter
df = df.query("passes_filter")

# Return the number of hits, etc. for all and just public epitopes
dat = pd.DataFrame([{
Expand All @@ -370,9 +423,15 @@ def group_sample_organisms(self, df:pd.DataFrame, sample:str, organism:str) -> p
(f"n_edgeR_hits_{label}", (d["edgeR_hit"] == "TRUE").sum()),
(f"n_edgeR_discordant_{label}", (d["edgeR_hit"] == "DISCORDANT").sum()),
(f"max_ebs_{label}", d["EBS"].max()),
(f"mean_ebs_{label}", d["EBS"].mean())
(f"mean_ebs_{label}", d["EBS"].mean()),
(f"gmean_ebs_{label}", gmean(d["EBS"]))
]
if k not in [
"n_hits_hits",
"n_discordant_hits",
"gmean_ebs_all",
"gmean_ebs_public"
]
if k not in ["n_hits_hits", "n_discordant_hits"]
}
}])

Expand Down
5 changes: 4 additions & 1 deletion workflows/alignment.nf
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,10 @@ workflow ALIGN {
tuple(
"peptide_ref",
row.sample_id,
file("$params.reads_prefix/$row.fastq_filepath")
file(
"$params.reads_prefix/$row.fastq_filepath",
checkIfExists:true
)
)
}
.set { samples_ch }
Expand Down

0 comments on commit 14a43fc

Please sign in to comment.