Skip to content

Commit

Permalink
Merge pull request #510 from Proteobench/peptidoform-module-ProteomeD…
Browse files Browse the repository at this point in the history
…iscoverer

Peptidoform output from Proteome Discoverer
  • Loading branch information
RobbinBouwmeester authored Dec 19, 2024
2 parents 8136cc5 + 010fc0f commit 392cb3c
Show file tree
Hide file tree
Showing 8 changed files with 7,995 additions and 17 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Parse settings for Proteome Discoverer peptidoform output.
# NOTE(review): the per-section notes below are inferred from the key/value
# names in this file; confirm against the code that consumes these settings.

# Input column name (left) paired with the column name used downstream (right)
# -- presumably a rename map; TODO confirm.
[mapper]
"Protein Accessions" = "Proteins"
"Sequence" = "sequence"
"Modifications" = "modifications"

# Each normalized-abundance column (F1-F6) is assigned a condition label:
# F1-F3 -> "A", F4-F6 -> "B".
[condition_mapper]
"Abundances (Normalized): F1: Sample, ConditionA" = "A"
"Abundances (Normalized): F2: Sample, ConditionA" = "A"
"Abundances (Normalized): F3: Sample, ConditionA" = "A"
"Abundances (Normalized): F4: Sample, ConditionB" = "B"
"Abundances (Normalized): F5: Sample, ConditionB" = "B"
"Abundances (Normalized): F6: Sample, ConditionB" = "B"

# The same six abundance columns as in [condition_mapper], each assigned a
# unique run name.
[run_mapper]
"Abundances (Normalized): F1: Sample, ConditionA" = "Condition_A_Sample_Alpha_01"
"Abundances (Normalized): F2: Sample, ConditionA" = "Condition_A_Sample_Alpha_02"
"Abundances (Normalized): F3: Sample, ConditionA" = "Condition_A_Sample_Alpha_03"
"Abundances (Normalized): F4: Sample, ConditionB" = "Condition_B_Sample_Alpha_01"
"Abundances (Normalized): F5: Sample, ConditionB" = "Condition_B_Sample_Alpha_02"
"Abundances (Normalized): F6: Sample, ConditionB" = "Condition_B_Sample_Alpha_03"

# Marker string (left) paired with a species label (right) -- presumably the
# marker is searched for in protein identifiers; TODO confirm.
[species_mapper]
"_YEAST" = "YEAST"
"_ECOLI" = "ECOLI"
"_HUMAN" = "HUMAN"

# NOTE(review): contaminant_flag is a string while decoy_flag is a boolean;
# confirm the consumer expects these differing types.
[general]
"contaminant_flag" = "Cont_"
"decoy_flag" = true
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

[quant_lfq_peptidoform_DDA]
"WOMBAT" = "parse_settings_wombat.toml"
"Proteome Discoverer" = "parse_settings_proteomediscoverer.toml"
"Custom" = "parse_settings_custom.toml"

[quant_lfq_ion_DIA_AIF]
Expand Down
95 changes: 82 additions & 13 deletions proteobench/io/parsing/parse_peptidoform.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
pd.DataFrame: The loaded dataframe with the required columns added (like "proforma").
"""
input_data_frame: pd.DataFrame
if input_format == "Proteome Discoverer":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
input_data_frame["Modifications"].fillna("", inplace=True)
input_data_frame["proforma"] = input_data_frame.apply(
lambda x: aggregate_modification_column(x["Sequence"], x["Modifications"]),
axis=1,
)
if input_format == "WOMBAT":
input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
input_data_frame["proforma"] = input_data_frame["modified_peptide"]
Expand All @@ -34,11 +41,17 @@ def aggregate_modification_column(
"Any C-term": -1,
"Protein N-term": 0,
"Protein C-term": -1,
"N-Term": 0, # Added to handle "N-Term"
"C-Term": -1, # If you also expect "C-Term"
},
) -> str:
"""
Aggregate modifications into a string representing the modified sequence.
This version handles both:
- Original format (e.g. "Methylation (C11)" or "Carbamidomethyl (Any N-term)")
- New format (e.g. "1xCarbamidomethyl [C11]", "1xOxidation [M4]", "1xAcetyl [N-Term]")
Args:
input_string_seq (str): The input sequence string.
input_string_modifications (str): The modifications applied to the sequence.
Expand All @@ -47,25 +60,81 @@ def aggregate_modification_column(
Returns:
str: The modified sequence string with aggregated modifications.
"""

# If no modifications, return the original sequence unchanged
if not input_string_modifications.strip():
return input_string_seq

# Split modifications by ';' to handle multiple modifications
raw_mods = [x.strip() for x in input_string_modifications.split(";") if x.strip()]

all_mods = []
for m in input_string_modifications.split("; "):
if len(m) == 0:
continue
m_stripped = m.split(" (")[1].rstrip(")")
m_name = m.split(" (")[0]

if m_stripped in special_locations.keys():
if special_locations[m_stripped] == -1:
all_mods.append((m_name, len(input_string_seq)))
else:
all_mods.append((m_name, special_locations[m_stripped]))
continue

all_mods.append((m_name, int(m_stripped[1:])))
for m in raw_mods:
# Detect format by checking for '(' or '['
if "(" in m and "[" not in m:
# Original format (e.g. "Carbamidomethyl (C11)" or "Methylation (Any N-term)")
parts = m.split(" (")
if len(parts) < 2:
continue
m_name = parts[0].strip()
m_stripped = parts[1].rstrip(")")

# Check if this is a special location
if m_stripped in special_locations:
loc = special_locations[m_stripped]
if loc == -1:
loc = len(input_string_seq) # C-term
all_mods.append((m_name, loc))
else:
# Assume format like C11 means position 11
loc = int(m_stripped[1:])
all_mods.append((m_name, loc))

else:
# New format, e.g. "1xCarbamidomethyl [C11]", "1xAcetyl [N-Term]"
# Remove any count prefix like "1x"
entry = re.sub(r"\d+x", "", m).strip()

# Extract modification name and bracketed portion
mod_name_match = re.match(r"([A-Za-z]+)\s*\[(.+)\]", entry)
if not mod_name_match:
continue

mod_name = mod_name_match.group(1)
positions_str = mod_name_match.group(2).strip()

# Positions could be multiple (e.g. "C10; C13")
pos_parts = [p.strip() for p in positions_str.split(";") if p.strip()]
if not pos_parts:
# If there's nothing after the brackets, skip
continue

for pos_part in pos_parts:
# Check if pos_part is a known special location (e.g. "N-Term")
if pos_part in special_locations:
loc = special_locations[pos_part]
if loc == -1:
loc = len(input_string_seq)
all_mods.append((mod_name, loc))
else:
# Otherwise, assume format like C11 or M4
if len(pos_part) > 1:
loc = int(pos_part[1:])
all_mods.append((mod_name, loc))

# Sort modifications by descending position so we insert from the end
all_mods.sort(key=lambda x: x[1], reverse=True)

for name, loc in all_mods:
# Insert the modification into the sequence.
# 'loc' is a 1-based index if it's a residue position.
# For terminal modifications, special_locations will have adjusted it.
# If loc is -1 or at sequence end, we've already resolved it to len(sequence).

# Insert the modification brackets at position 'loc'.
# Note: If loc == 0 (N-term), insert at start of sequence.
# If loc == len(sequence), insert at end (C-term).
input_string_seq = input_string_seq[:loc] + f"[{name}]" + input_string_seq[loc:]

return input_string_seq
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(
"..",
"..",
"..",
"..",
"io",
"parsing",
"io_parse_settings",
Expand Down
4 changes: 4 additions & 0 deletions proteobench/modules/quant/quant_base/quant_base_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ class QuantModule:
"FragPipe (DIA-NN quant)": extract_params_fragger,
"MSAID": extract_params_msaid,
"Spectronaut": extract_params_spectronaut,
# TODO: needs to be replaced with a dedicated parameter extraction function
"WOMBAT": extract_params_spectronaut,
# TODO: needs to be replaced with a dedicated parameter extraction function
"Proteome Discoverer": extract_params_spectronaut,
}

def __init__(
Expand Down
1 change: 1 addition & 0 deletions proteobench/plotting/plot_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def plot_metric(
"Spectronaut": "#bcbd22",
"FragPipe (DIA-NN quant)": "#ff7f00",
"MSAID": "#afff57",
"Proteome Discoverer": "#8c564b",
},
mapping: Dict[str, int] = {"old": 10, "new": 20},
highlight_color: str = "#d30067",
Expand Down
Loading

0 comments on commit 392cb3c

Please sign in to comment.