Skip to content

Commit

Permalink
Add a column to the peptide table indicating whether it passes the ov…
Browse files Browse the repository at this point in the history
…erlap filter
  • Loading branch information
sminot committed Jul 15, 2024
1 parent 7faf8c9 commit 52e4cdf
Showing 1 changed file with 63 additions and 25 deletions.
88 changes: 63 additions & 25 deletions templates/aggregate_organisms.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ def __init__(self):
self.logger.info("Grouping replicates by sample")
self.sample_table = self.group_replicates()

# Apply the max_overlap filter
# (setting the column 'passes_filter' to True if the peptide passes)
self.sample_table = self.apply_max_overlap_filter()

# Save to CSV
self.sample_table.to_csv("!{sample_id}.peptide.ebs.csv.gz", index=None)

Expand Down Expand Up @@ -298,34 +302,28 @@ def classify_edgeR_hit(self, r):
else:
return "DISCORDANT"

def group_organisms(self) -> pd.DataFrame:
"""Group together the results by organism."""
def apply_max_overlap_filter(self) -> pd.DataFrame:
"""Apply the max_overlap filter to each sample/organism."""

# Analyze each organism independently
# Analyze each sample/organism independently
df = pd.concat([
self.group_sample_organisms(d, sample, organism)
for (sample, organism), d in self.sample_table.assign(
self.apply_max_overlap_filter_sub(d)
for _, d in self.sample_table.assign(
organism=lambda d: d["peptide"].apply(
self.peptide_mapping["organism"].get
)
).groupby(
["sample", "organism"]
)
]).fillna(
0
)
])

return df

def group_sample_organisms(
def apply_max_overlap_filter_sub(
self,
df: pd.DataFrame,
sample: str,
organism: str
df: pd.DataFrame
) -> pd.DataFrame:

"""Analyze the data for a single sample, single organism."""

# Add the sequence information for each peptide
df = df.assign(
seq=df["peptide"].apply(
Expand All @@ -341,31 +339,71 @@ def group_sample_organisms(
# Keep track of the peptide kmers which have been observed so far
kmers_seen = set()

# Make a list of the indices which will be dropped
to_drop = list()
# Make a list of flags recording whether each row passes the filter
passes_filter = list()

# Go down the list, starting with the tightest binders
for i, r in df.iterrows():
for _, r in df.iterrows():

# Get the set of kmers contained in this peptide
row_kmers = set([
r["seq"][n:(n + self.max_overlap)]
for n in range(len(r["seq"]) - self.max_overlap)
])

# If any of those kmers have been seen before
if len(row_kmers & kmers_seen) > 0:

# Drop the row
to_drop.append(i)
# If none of those kmers have been seen before,
# it passes the filter
passes_filter.append(len(row_kmers & kmers_seen) == 0)

# If not
else:
# If it passes
if passes_filter[-1]:

# Record this peptide's kmers as seen
kmers_seen |= row_kmers

df = df.drop(index=to_drop)
# Add a column to the table indicating
# whether the peptide passes the filter
df = df.assign(
passes_filter=passes_filter
)

# Drop the sequence column and sort the rows by index
return (
df
.drop(columns=["seq"])
.sort_index()
)

def group_organisms(self) -> pd.DataFrame:
    """Build the per-sample, per-organism summary table.

    Every row of ``self.sample_table`` is labelled with an organism by
    looking up its ``peptide`` value in ``self.peptide_mapping["organism"]``.
    The labelled table is then split on ``(sample, organism)``, each piece
    is passed to ``self.group_sample_organisms``, and the resulting frames
    are concatenated. Missing values in the combined frame are set to 0.

    Returns:
        The concatenated summaries, with missing values replaced by 0.
    """

    def lookup_organism(table: pd.DataFrame) -> pd.Series:
        # Resolved lazily, when ``assign`` invokes this callable
        return table["peptide"].apply(self.peptide_mapping["organism"].get)

    # Label every peptide with its organism
    labelled = self.sample_table.assign(organism=lookup_organism)

    # Summarize each sample/organism combination on its own
    summaries = []
    for (sample, organism), subset in labelled.groupby(["sample", "organism"]):
        summaries.append(
            self.group_sample_organisms(subset, sample, organism)
        )

    # Stack the summaries and fill in any gaps with zero
    return pd.concat(summaries).fillna(0)

def group_sample_organisms(
self,
df: pd.DataFrame,
sample: str,
organism: str
) -> pd.DataFrame:

"""Analyze the data for a single sample, single organism."""

# For this summary, drop peptides which don't pass the filter
df = df.query("passes_filter")

# Return the number of hits, etc. for all and just public epitopes
dat = pd.DataFrame([{
Expand Down

0 comments on commit 52e4cdf

Please sign in to comment.