Merge pull request #510 from Proteobench/peptidoform-module-ProteomeDiscoverer

Peptidoform output from Proteome Discoverer
Proteobench · Dec 19, 2024 · 392cb3c · 392cb3c
2 parents 8136cc5 + 010fc0f
commit 392cb3c
Show file tree

Hide file tree

Showing 8 changed files with 7,995 additions and 17 deletions.
diff --git a/...arsing/io_parse_settings/Quant/lfq/peptidoform/DDA/parse_settings_proteomediscoverer.toml b/...arsing/io_parse_settings/Quant/lfq/peptidoform/DDA/parse_settings_proteomediscoverer.toml
@@ -0,0 +1,29 @@
+[mapper]
+"Protein Accessions" = "Proteins"
+"Sequence" = "sequence"
+"Modifications" = "modifications"
+
+[condition_mapper]
+"Abundances (Normalized): F1: Sample, ConditionA" = "A"
+"Abundances (Normalized): F2: Sample, ConditionA" = "A"
+"Abundances (Normalized): F3: Sample, ConditionA" = "A"
+"Abundances (Normalized): F4: Sample, ConditionB" = "B"
+"Abundances (Normalized): F5: Sample, ConditionB" = "B"
+"Abundances (Normalized): F6: Sample, ConditionB" = "B"
+
+[run_mapper]
+"Abundances (Normalized): F1: Sample, ConditionA" = "Condition_A_Sample_Alpha_01"
+"Abundances (Normalized): F2: Sample, ConditionA" = "Condition_A_Sample_Alpha_02"
+"Abundances (Normalized): F3: Sample, ConditionA" = "Condition_A_Sample_Alpha_03"
+"Abundances (Normalized): F4: Sample, ConditionB" = "Condition_B_Sample_Alpha_01"
+"Abundances (Normalized): F5: Sample, ConditionB" = "Condition_B_Sample_Alpha_02"
+"Abundances (Normalized): F6: Sample, ConditionB" = "Condition_B_Sample_Alpha_03"
+
+[species_mapper]
+"_YEAST" = "YEAST"
+"_ECOLI" = "ECOLI"
+"_HUMAN" = "HUMAN"
+
+[general]
+"contaminant_flag" = "Cont_"
+"decoy_flag" = true
diff --git a/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml b/proteobench/io/parsing/io_parse_settings/parse_settings_files.toml
@@ -10,6 +10,7 @@
 
 [quant_lfq_peptidoform_DDA]
 "WOMBAT" = "parse_settings_wombat.toml"
+"Proteome Discoverer" = "parse_settings_proteomediscoverer.toml"
 "Custom" = "parse_settings_custom.toml"
 
 [quant_lfq_ion_DIA_AIF]

diff --git a/proteobench/io/parsing/parse_peptidoform.py b/proteobench/io/parsing/parse_peptidoform.py
@@ -16,6 +16,13 @@ def load_input_file(input_csv: str, input_format: str) -> pd.DataFrame:
         pd.DataFrame: The loaded dataframe with the required columns added (like "proforma").
     """
     input_data_frame: pd.DataFrame
+    if input_format == "Proteome Discoverer":
+        input_data_frame = pd.read_csv(input_csv, low_memory=False, sep="\t")
+        input_data_frame["Modifications"].fillna("", inplace=True)
+        input_data_frame["proforma"] = input_data_frame.apply(
+            lambda x: aggregate_modification_column(x["Sequence"], x["Modifications"]),
+            axis=1,
+        )
     if input_format == "WOMBAT":
         input_data_frame = pd.read_csv(input_csv, low_memory=False, sep=",")
         input_data_frame["proforma"] = input_data_frame["modified_peptide"]
@@ -34,11 +41,17 @@ def aggregate_modification_column(
         "Any C-term": -1,
         "Protein N-term": 0,
         "Protein C-term": -1,
+        "N-Term": 0,  # Added to handle "N-Term"
+        "C-Term": -1,  # If you also expect "C-Term"
     },
 ) -> str:
     """
     Aggregate modifications into a string representing the modified sequence.
 
+    This version handles both:
+    - Original format (e.g. "Methylation (C11)" or "Carbamidomethyl (Any N-term)")
+    - New format (e.g. "1xCarbamidomethyl [C11]", "1xOxidation [M4]", "1xAcetyl [N-Term]")
+
     Args:
         input_string_seq (str): The input sequence string.
         input_string_modifications (str): The modifications applied to the sequence.
@@ -47,25 +60,81 @@ def aggregate_modification_column(
     Returns:
         str: The modified sequence string with aggregated modifications.
     """
+
+    # If no modifications, return the original sequence unchanged
+    if not input_string_modifications.strip():
+        return input_string_seq
+
+    # Split modifications by ';' to handle multiple modifications
+    raw_mods = [x.strip() for x in input_string_modifications.split(";") if x.strip()]
+
     all_mods = []
-    for m in input_string_modifications.split("; "):
-        if len(m) == 0:
-            continue
-        m_stripped = m.split(" (")[1].rstrip(")")
-        m_name = m.split(" (")[0]
-
-        if m_stripped in special_locations.keys():
-            if special_locations[m_stripped] == -1:
-                all_mods.append((m_name, len(input_string_seq)))
-            else:
-                all_mods.append((m_name, special_locations[m_stripped]))
-            continue
 
-        all_mods.append((m_name, int(m_stripped[1:])))
+    for m in raw_mods:
+        # Detect format by checking for '(' or '['
+        if "(" in m and "[" not in m:
+            # Original format (e.g. "Carbamidomethyl (C11)" or "Methylation (Any N-term)")
+            parts = m.split(" (")
+            if len(parts) < 2:
+                continue
+            m_name = parts[0].strip()
+            m_stripped = parts[1].rstrip(")")
+
+            # Check if this is a special location
+            if m_stripped in special_locations:
+                loc = special_locations[m_stripped]
+                if loc == -1:
+                    loc = len(input_string_seq)  # C-term
+                all_mods.append((m_name, loc))
+            else:
+                # Assume format like C11 means position 11
+                loc = int(m_stripped[1:])
+                all_mods.append((m_name, loc))
 
+        else:
+            # New format, e.g. "1xCarbamidomethyl [C11]", "1xAcetyl [N-Term]"
+            # Remove any count prefix like "1x"
+            entry = re.sub(r"\d+x", "", m).strip()
+
+            # Extract modification name and bracketed portion
+            mod_name_match = re.match(r"([A-Za-z]+)\s*\[(.+)\]", entry)
+            if not mod_name_match:
+                continue
+
+            mod_name = mod_name_match.group(1)
+            positions_str = mod_name_match.group(2).strip()
+
+            # Positions could be multiple (e.g. "C10; C13")
+            pos_parts = [p.strip() for p in positions_str.split(";") if p.strip()]
+            if not pos_parts:
+                # If there's nothing after the brackets, skip
+                continue
+
+            for pos_part in pos_parts:
+                # Check if pos_part is a known special location (e.g. "N-Term")
+                if pos_part in special_locations:
+                    loc = special_locations[pos_part]
+                    if loc == -1:
+                        loc = len(input_string_seq)
+                    all_mods.append((mod_name, loc))
+                else:
+                    # Otherwise, assume format like C11 or M4
+                    if len(pos_part) > 1:
+                        loc = int(pos_part[1:])
+                        all_mods.append((mod_name, loc))
+
+    # Sort modifications by descending position so we insert from the end
     all_mods.sort(key=lambda x: x[1], reverse=True)
 
     for name, loc in all_mods:
+        # Insert the modification into the sequence.
+        # 'loc' is a 1-based index if it's a residue position.
+        # For terminal modifications, special_locations will have adjusted it.
+        # If loc is -1 or at sequence end, we've already resolved it to len(sequence).
+
+        # Insert the modification brackets at position 'loc'.
+        # Note: If loc == 0 (N-term), insert at start of sequence.
+        #       If loc == len(sequence), insert at end (C-term).
         input_string_seq = input_string_seq[:loc] + f"[{name}]" + input_string_seq[loc:]
 
     return input_string_seq

diff --git a/proteobench/modules/quant/lfq/peptidoform/DDA/quant_lfq_peptidoform_DDA.py b/proteobench/modules/quant/lfq/peptidoform/DDA/quant_lfq_peptidoform_DDA.py
@@ -37,6 +37,7 @@ def __init__(
                 "..",
                 "..",
                 "..",
+                "..",
                 "io",
                 "parsing",
                 "io_parse_settings",

diff --git a/proteobench/modules/quant/quant_base/quant_base_module.py b/proteobench/modules/quant/quant_base/quant_base_module.py
@@ -71,6 +71,10 @@ class QuantModule:
         "FragPipe (DIA-NN quant)": extract_params_fragger,
         "MSAID": extract_params_msaid,
         "Spectronaut": extract_params_spectronaut,
+        # TODO: needs to be replaced with a dedicated parameter extraction function
+        "WOMBAT": extract_params_spectronaut,
+        # TODO: needs to be replaced with a dedicated parameter extraction function
+        "Proteome Discoverer": extract_params_spectronaut,
     }
 
     def __init__(

diff --git a/proteobench/plotting/plot_quant.py b/proteobench/plotting/plot_quant.py
@@ -87,6 +87,7 @@ def plot_metric(
             "Spectronaut": "#bcbd22",
             "FragPipe (DIA-NN quant)": "#ff7f00",
             "MSAID": "#afff57",
+            "Proteome Discoverer": "#8c564b",
         },
         mapping: Dict[str, int] = {"old": 10, "new": 20},
         highlight_color: str = "#d30067",
-Original file line number
+Diff line change
@@ Expand Up / @@ -37,6 +37,7 @@ def __init__( @@
                     "..",
                     "..",
                     "..",
+                    "..",
                     "io",
                     "parsing",
                     "io_parse_settings",
@@ Expand Down @@