From fdceebac3d306a0630ba992343b3fb2c9f9a0286 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Sat, 24 Feb 2024 15:48:00 +0100
Subject: [PATCH 01/23] initial commit

---
 ms2rescore/feature_generators/ms2.py | 161 +++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 ms2rescore/feature_generators/ms2.py

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
new file mode 100644
index 00000000..8e144574
--- /dev/null
+++ b/ms2rescore/feature_generators/ms2.py
@@ -0,0 +1,161 @@
+"""
+MS2-based feature generator.
+
+"""
+
+import logging
+import re
+from typing import List, Optional, Union
+from itertools import chain
+from collections import defaultdict
+
+
+import numpy as np
+from psm_utils import PSMList
+from pyteomics.mass import calculate_mass
+
+from ms2rescore.feature_generators.base import FeatureGeneratorBase
+from ms2rescore.utils import infer_spectrum_path
+
+logger = logging.getLogger(__name__)
+
+
+class MS2FeatureGenerator(FeatureGeneratorBase):
+    """DeepLC retention time-based feature generator."""
+
+    def __init__(
+        self,
+        *args,
+        spectrum_path: Optional[str] = None,
+        spectrum_id_pattern: str = "(.*)",
+        **kwargs,
+    ) -> None:
+        """
+        Generate MS2-based features for rescoring.
+
+        DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to
+        disable retraining.
+
+        Parameters
+        ----------
+
+        Attributes
+        ----------
+        feature_names: list[str]
+            Names of the features that will be added to the PSMs.
+
+        """
+        super().__init__(*args, **kwargs)
+        self.spectrum_path = spectrum_path
+        self.spectrum_id_pattern = spectrum_id_pattern
+
+    @property
+    def feature_names(self) -> List[str]:
+        return [
+            "ln_explained_intensity",
+            "ln_total_intensity",
+            "ln_explained_intensity_ratio",
+            "ln_explained_b_ion_ratio",
+            "ln_explained_y_ion_ratio",
+            "longest_b_ion_sequence",
+            "longest_y_ion_sequence",
+            "matched_b_ions",
+            "matched_b_ions_pct",
+            "matched_y_ions",
+            "matched_y_ions_pct",
+            "matched_ions_pct",
+        ]
+
+    def add_features(self, psm_list: PSMList) -> None:
+        """Add DeepLC-derived features to PSMs."""
+
+        logger.info("Adding MS2-derived features to PSMs.")
+        psm_dict = psm_list.get_psm_dict()
+        current_run = 1
+        total_runs = sum(len(runs) for runs in psm_dict.values())
+
+        for runs in psm_dict.values():
+            for run, psms in runs.items():
+                logger.info(
+                    f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..."
+                )
+                psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+                spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
+                logger.debug(f"Using spectrum file `{spectrum_filename}`")
+                self._add_features_run(psm_list_run, spectrum_filename)
+                current_run += 1
+
+    def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None:
+        pass
+
+
+def longest_sequence_of_trues(lst):
+    max_sequence = 0
+    current_sequence = 0
+
+    for value in lst:
+        current_sequence = current_sequence + 1 if value else 0
+        max_sequence = max(max_sequence, current_sequence)
+
+    return max_sequence
+
+
+def extract_spectrum_features(annotated_spectrum, peptidoform):
+    features = defaultdict(list)
+    b_ions_matched = [False] * (len(peptidoform.sequence))
+    y_ions_matched = [False] * (len(peptidoform.sequence))
+
+    pseudo_count = 0.00001
+
+    for annotated_peak in annotated_spectrum.spectrum:
+        features["total_intensity"].append(annotated_peak.intensity)
+
+        if annotated_peak.annotation:
+            features["matched_intensity"].append(annotated_peak.intensity)
+            for matched_ion in annotated_peak.annotation:
+                if "y" in matched_ion.ion:
+                    features["y_ion_matched"].append(annotated_peak.intensity)
+                    y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+                elif "b" in matched_ion.ion:
+                    features["b_ion_matched"].append(annotated_peak.intensity)
+                    b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+
+    return {
+        "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count),
+        "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count),
+        "ln_explained_intensity_ratio": np.log(
+            np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"])
+            + pseudo_count
+        ),
+        "ln_explained_b_ion_ratio": np.log(
+            np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"])
+            + pseudo_count
+        ),
+        "ln_explained_y_ion_ratio": np.log(
+            np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"])
+            + pseudo_count
+        ),
+        "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched),
+        "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched),
+        "matched_b_ions": sum(b_ions_matched),
+        "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched),
+        "matched_y_ions": sum(y_ions_matched),
+        "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched),
+        "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched))
+        / (len(b_ions_matched) + len(y_ions_matched)),
+    }
+
+
+# TODO: keep this here?
+def modification_evidence():
+    return
+
+
+# TODO: move to basic feature generator
+def extract_psm_features(psm):
+    return {
+        "peptide_len": len(psm.peptidoform.sequence),
+        "expmass": (psm.precursor_mz * psm.get_precursor_charge())
+        - (psm.get_precursor_charge() * 1.007276),  # TODO: replace by constant
+        "calcmass": calculate_mass(psm.peptidoform.composition),
+    }

From 5374ed83aa56c075bcd2cf2da1004e049d13ee9d Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Sun, 25 Feb 2024 16:44:54 +0100
Subject: [PATCH 02/23] finalize ms2 feature generation

---
 ms2rescore/core.py                        |  25 +--
 ms2rescore/exceptions.py                  |   6 +
 ms2rescore/feature_generators/__init__.py |   2 +
 ms2rescore/feature_generators/ms2.py      | 218 +++++++++++++++-------
 4 files changed, 171 insertions(+), 80 deletions(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index e5ae26f1..f8ff38d7 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -54,18 +54,18 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
 
-    # TODO: avoid hard coding feature generators in some way
-    rt_required = ("deeplc" in config["feature_generators"]) and (
-        None in psm_list["retention_time"]
-    )
-    im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and (
-        None in psm_list["ion_mobility"]
-    )
-    logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
-
-    if rt_required or im_required:
-        logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
-        fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+    # # TODO: avoid hard coding feature generators in some way
+    # rt_required = ("deeplc" in config["feature_generators"]) and (
+    #     None in psm_list["retention_time"]
+    # )
+    # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and (
+    #     None in psm_list["ion_mobility"]
+    # )
+    # logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
+
+    # if rt_required or im_required:
+    #     logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+    #     fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
 
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
@@ -82,6 +82,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         fgen.add_features(psm_list)
         logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}")
         feature_names[fgen_name] = set(fgen.feature_names)
+    exit()
 
     # Filter out psms that do not have all added features
     all_feature_names = {f for fgen in feature_names.values() for f in fgen}
diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py
index f8d1fa01..9b429a30 100644
--- a/ms2rescore/exceptions.py
+++ b/ms2rescore/exceptions.py
@@ -23,3 +23,9 @@ class ModificationParsingError(IDFileParsingError):
     """Identification file parsing error."""
 
     pass
+
+
+class ParseSpectrumError(MS2RescoreError):
+    """Error parsing spectrum file."""
+
+    pass
diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py
index 52440a5f..7a10fd4c 100644
--- a/ms2rescore/feature_generators/__init__.py
+++ b/ms2rescore/feature_generators/__init__.py
@@ -8,6 +8,7 @@
 from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
 from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
 from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
+from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator
 
 FEATURE_GENERATORS = {
     "basic": BasicFeatureGenerator,
@@ -16,4 +17,5 @@
     "maxquant": MaxQuantFeatureGenerator,
     "ionmob": IonMobFeatureGenerator,
     "im2deep": IM2DeepFeatureGenerator,
+    "ms2": MS2FeatureGenerator,
 }
diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index 8e144574..013bcb44 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -12,13 +12,27 @@
 
 import numpy as np
 from psm_utils import PSMList
-from pyteomics.mass import calculate_mass
+from pyteomics import mass, mzml, mgf
+from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode
 
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
 from ms2rescore.utils import infer_spectrum_path
+from ms2rescore.exceptions import ParseSpectrumError
 
 logger = logging.getLogger(__name__)
 
+FRAGMENTATION_MODELS = {
+    "cidhcd": FragmentationModel.CidHcd,
+    "etcid": FragmentationModel.Etcid,
+    "etd": FragmentationModel.Etd,
+    "ethcd": FragmentationModel.Ethcd,
+    "all": FragmentationModel.All,
+}
+MASS_MODES = {
+    "average": MassMode.Average,
+    "monoisotopic": MassMode.Monoisotopic,
+}
+
 
 class MS2FeatureGenerator(FeatureGeneratorBase):
     """DeepLC retention time-based feature generator."""
@@ -28,14 +42,13 @@ def __init__(
         *args,
         spectrum_path: Optional[str] = None,
         spectrum_id_pattern: str = "(.*)",
+        fragmentationmodel: str = "All",
+        massmode: str = "Monoisotopic",
         **kwargs,
     ) -> None:
         """
         Generate MS2-based features for rescoring.
 
-        DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to
-        disable retraining.
-
         Parameters
         ----------
 
@@ -48,6 +61,8 @@ def __init__(
         super().__init__(*args, **kwargs)
         self.spectrum_path = spectrum_path
         self.spectrum_id_pattern = spectrum_id_pattern
+        self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()]
+        self.mass_mode = MASS_MODES[massmode.lower()]
 
     @property
     def feature_names(self) -> List[str]:
@@ -67,7 +82,7 @@ def feature_names(self) -> List[str]:
         ]
 
     def add_features(self, psm_list: PSMList) -> None:
-        """Add DeepLC-derived features to PSMs."""
+        """Add MS2-derived features to PSMs."""
 
         logger.info("Adding MS2-derived features to PSMs.")
         psm_dict = psm_list.get_psm_dict()
@@ -82,68 +97,128 @@ def add_features(self, psm_list: PSMList) -> None:
                 psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
                 spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
                 logger.debug(f"Using spectrum file `{spectrum_filename}`")
-                self._add_features_run(psm_list_run, spectrum_filename)
-                current_run += 1
-
-    def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None:
-        pass
-
-
-def longest_sequence_of_trues(lst):
-    max_sequence = 0
-    current_sequence = 0
-
-    for value in lst:
-        current_sequence = current_sequence + 1 if value else 0
-        max_sequence = max(max_sequence, current_sequence)
-
-    return max_sequence
-
 
-def extract_spectrum_features(annotated_spectrum, peptidoform):
-    features = defaultdict(list)
-    b_ions_matched = [False] * (len(peptidoform.sequence))
-    y_ions_matched = [False] * (len(peptidoform.sequence))
-
-    pseudo_count = 0.00001
-
-    for annotated_peak in annotated_spectrum.spectrum:
-        features["total_intensity"].append(annotated_peak.intensity)
-
-        if annotated_peak.annotation:
-            features["matched_intensity"].append(annotated_peak.intensity)
-            for matched_ion in annotated_peak.annotation:
-                if "y" in matched_ion.ion:
-                    features["y_ion_matched"].append(annotated_peak.intensity)
-                    y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
-                elif "b" in matched_ion.ion:
-                    features["b_ion_matched"].append(annotated_peak.intensity)
-                    b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+                annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename)
+                self._calculate_features(psm_list_run, annotated_spectra)
+                current_run += 1
 
-    return {
-        "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count),
-        "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count),
-        "ln_explained_intensity_ratio": np.log(
-            np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"])
-            + pseudo_count
-        ),
-        "ln_explained_b_ion_ratio": np.log(
-            np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"])
-            + pseudo_count
-        ),
-        "ln_explained_y_ion_ratio": np.log(
-            np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"])
-            + pseudo_count
-        ),
-        "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched),
-        "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched),
-        "matched_b_ions": sum(b_ions_matched),
-        "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched),
-        "matched_y_ions": sum(y_ions_matched),
-        "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched),
-        "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched))
-        / (len(b_ions_matched) + len(y_ions_matched)),
-    }
+    def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None:
+        """Calculate features from annotated spectra and add them to the PSMs."""
+        for psm, spectrum in zip(psm_list, annotated_spectra):
+            psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform))
+
+    def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List:
+        """Retrieve annotated spectra for all psms."""
+
+        annotated_spectra = []
+        spectrum_reader = self._read_spectrum_file(spectrum_file)
+
+        spectrum_id_pattern = re.compile(
+            self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
+        )
+        logger.info(spectrum_id_pattern)
+        logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys())
+
+        try:
+            mapper = {
+                spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
+                for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys()
+            }
+        except AttributeError:
+            raise ParseSpectrumError(
+                "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
+            )
+
+        for psm in psm_list:
+            pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id])
+            annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum)
+            annotated_spectra.append(annotated_spectrum)
+
+        return annotated_spectra
+
+    @staticmethod
+    def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
+
+        if spectrum_filepath.suffix == ".mzML":
+            return mzml.PreIndexedMzML(str(spectrum_filepath))
+        elif spectrum_filepath.suffix == ".mgf":
+            return mgf.IndexedMGF(str(spectrum_filepath))
+
+    @staticmethod
+    def _longest_ion_sequence(lst):
+        max_sequence = 0
+        current_sequence = 0
+
+        for value in lst:
+            current_sequence = current_sequence + 1 if value else 0
+            max_sequence = max(max_sequence, current_sequence)
+
+        return max_sequence
+
+    def extract_spectrum_features(self, annotated_spectrum, peptidoform):
+        features = defaultdict(list)
+        b_ions_matched = [False] * (len(peptidoform.sequence))
+        y_ions_matched = [False] * (len(peptidoform.sequence))
+
+        pseudo_count = 0.00001
+
+        for annotated_peak in annotated_spectrum.spectrum:
+            features["total_intensity"].append(annotated_peak.intensity)
+
+            if annotated_peak.annotation:
+                features["matched_intensity"].append(annotated_peak.intensity)
+                for matched_ion in annotated_peak.annotation:
+                    if "y" in matched_ion.ion:
+                        features["y_ion_matched"].append(annotated_peak.intensity)
+                        y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+                    elif "b" in matched_ion.ion:
+                        features["b_ion_matched"].append(annotated_peak.intensity)
+                        b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+
+        return {
+            "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count),
+            "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count),
+            "ln_explained_intensity_ratio": np.log(
+                np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"])
+                + pseudo_count
+            ),
+            "ln_explained_b_ion_ratio": np.log(
+                np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"])
+                + pseudo_count
+            ),
+            "ln_explained_y_ion_ratio": np.log(
+                np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"])
+                + pseudo_count
+            ),
+            "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched),
+            "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched),
+            "matched_b_ions": sum(b_ions_matched),
+            "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched),
+            "matched_y_ions": sum(y_ions_matched),
+            "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched),
+            "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched))
+            / (len(b_ions_matched) + len(y_ions_matched)),
+        }
+
+    def _annotate_spectrum(self, psm, spectrum):
+
+        spectrum = RawSpectrum(
+            title=psm.spectrum_id,
+            num_scans=1,
+            rt=psm.retention_time,
+            precursor_charge=psm.get_precursor_charge(),
+            precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition),
+            mz_array=spectrum["m/z array"],
+            intensity_array=spectrum["intensity array"],
+        )
+
+        annotated_spectrum = spectrum.annotate(
+            peptide=LinearPeptide(psm.peptidoform.proforma),
+            model=self.fragmentationmodel,
+            mode=self.massmode,
+        )
+
+        return annotated_spectrum
 
 
 # TODO: keep this here?
@@ -153,9 +228,16 @@ def modification_evidence():
 
 # TODO: move to basic feature generator
 def extract_psm_features(psm):
+
+    expmass = (psm.precursor_mz * psm.get_precursor_charge()) - (
+        psm.get_precursor_charge() * 1.007276
+    )  # TODO: replace by constant
+    calcmass = mass.calculate_mass(psm.peptidoform.composition)
+    delta_mass = expmass - calcmass
+
     return {
         "peptide_len": len(psm.peptidoform.sequence),
-        "expmass": (psm.precursor_mz * psm.get_precursor_charge())
-        - (psm.get_precursor_charge() * 1.007276),  # TODO: replace by constant
-        "calcmass": calculate_mass(psm.peptidoform.composition),
+        "expmass": expmass,
+        "calcmass": calcmass,
+        "delta_mass": delta_mass,
     }

From 60207a390f08cba34730889543a356fd1cb9b06b Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Sun, 25 Feb 2024 16:45:09 +0100
Subject: [PATCH 03/23] add rustyms

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index be5295ba..beb6b440 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,7 @@ dependencies = [
 
 [project.optional-dependencies]
 ionmob = ["ionmob>=0.2", "tensorflow"]
+ms2 = ["rustyms"]
 dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"]
 docs = [
     "sphinx",

From ae39844a7b46b9e4622c5f66458aae5146e3776b Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Mon, 26 Feb 2024 11:04:37 +0100
Subject: [PATCH 04/23] remove exit statement fixed IM required value

---
 ms2rescore/core.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index f8ff38d7..6e0e547c 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -54,18 +54,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
 
-    # # TODO: avoid hard coding feature generators in some way
-    # rt_required = ("deeplc" in config["feature_generators"]) and (
-    #     None in psm_list["retention_time"]
-    # )
-    # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and (
-    #     None in psm_list["ion_mobility"]
-    # )
-    # logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
-
-    # if rt_required or im_required:
-    #     logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
-    #     fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+    # TODO: avoid hard coding feature generators in some way
+    rt_required = ("deeplc" in config["feature_generators"]) and (
+        None in psm_list["retention_time"]
+    )
+    im_required = (
+        "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
+    ) and (None in psm_list["ion_mobility"])
+
+    logger.info(f"RT required: {rt_required}, IM required: {im_required}")
+
+    if rt_required or im_required:
+        logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+        fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
 
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
@@ -82,7 +83,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         fgen.add_features(psm_list)
         logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}")
         feature_names[fgen_name] = set(fgen.feature_names)
-    exit()
 
     # Filter out psms that do not have all added features
     all_feature_names = {f for fgen in feature_names.values() for f in fgen}

From 9b98c4df3b13d0ab935172a5cfeca8bdbb6159f4 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Mon, 26 Feb 2024 15:05:23 +0100
Subject: [PATCH 05/23] change logger.info to debug

---
 ms2rescore/core.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index 6e0e547c..032b1463 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -11,10 +11,12 @@
 from ms2rescore.parse_spectra import fill_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
+from ms2rescore.utils import profile
 
 logger = logging.getLogger(__name__)
 
 
+@profile
 def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     """
     Run full MS²Rescore workflow with passed configuration.
@@ -62,7 +64,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
     ) and (None in psm_list["ion_mobility"])
 
-    logger.info(f"RT required: {rt_required}, IM required: {im_required}")
+    logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
 
     if rt_required or im_required:
         logger.info("Parsing missing retention time and/or ion mobility values from spectra...")

From 5e45756726582fa54e63f2d572c400332bcaa683 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Mon, 26 Feb 2024 15:06:34 +0100
Subject: [PATCH 06/23] added profile decorator to get timings for functions

---
 ms2rescore/utils.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 70417b56..982664b5 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -4,6 +4,9 @@
 from glob import glob
 from pathlib import Path
 from typing import Optional, Union
+import cProfile
+import pstats
+import io
 
 from ms2rescore.exceptions import MS2RescoreConfigurationError
 
@@ -76,3 +79,22 @@ def infer_spectrum_path(
             )
 
     return Path(resolved_path)
+
+
+def profile(fnc):
+    """A decorator that uses cProfile to profile a function"""
+
+    def inner(*args, **kwargs):
+
+        pr = cProfile.Profile()
+        pr.enable()
+        retval = fnc(*args, **kwargs)
+        pr.disable()
+        s = io.StringIO()
+        sortby = "time"
+        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+        ps.print_stats()
+        logger.info(s.getvalue())
+        return retval
+
+    return inner

From 304777cfa171cf5b7abf5cd14640e971f6510167 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Mon, 26 Feb 2024 15:07:13 +0100
Subject: [PATCH 07/23] removed profile as standard rescore debug statement

---
 ms2rescore/core.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index 032b1463..d0a52a4e 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -11,12 +11,10 @@
 from ms2rescore.parse_spectra import fill_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore.utils import profile
 
 logger = logging.getLogger(__name__)
 
 
-@profile
 def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     """
     Run full MS²Rescore workflow with passed configuration.

From 95ee4755325cd85990473cf15313c92e8c2f567a Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Mon, 26 Feb 2024 15:07:21 +0100
Subject: [PATCH 08/23] added new basic features

---
 ms2rescore/feature_generators/basic.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py
index bc2222ee..df005167 100644
--- a/ms2rescore/feature_generators/basic.py
+++ b/ms2rescore/feature_generators/basic.py
@@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None:
         charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list])
         precursor_mzs = psm_list["precursor_mz"]
         scores = psm_list["score"]
+        peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list])
 
         has_charge = None not in charge_states
         has_mz = None not in precursor_mzs and has_charge
@@ -74,6 +75,14 @@ def add_features(self, psm_list: PSMList) -> None:
         if has_score:
             self._feature_names.append("search_engine_score")
 
+        if has_mz and has_charge:
+            experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812)
+            theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812)
+            mass_error = experimental_mass - theoretical_mass
+            self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"])
+
+        self._feature_names.append("pep_len")
+
         for i, psm in enumerate(psm_list):
             psm.rescoring_features.update(
                 dict(
@@ -81,6 +90,10 @@ def add_features(self, psm_list: PSMList) -> None:
                     **charge_one_hot[i] if has_charge else {},
                     **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {},
                     **{"search_engine_score": scores[i]} if has_score else {},
+                    **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {},
+                    **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {},
+                    **{"mass_error": mass_error[i]} if has_mz and has_charge else {},
+                    **{"pep_len": peptide_lengths[i]},
                 )
             )
 

From 73f4573331a82ea3984457333d3c701736f4d86d Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Mon, 26 Feb 2024 15:07:47 +0100
Subject: [PATCH 09/23] fixes for ms2 feature generator, removed
 multiprocessing

---
 ms2rescore/feature_generators/ms2.py | 149 +++++++++++++++------------
 1 file changed, 81 insertions(+), 68 deletions(-)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index 013bcb44..997accb2 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -4,6 +4,7 @@
 """
 
 import logging
+import multiprocessing
 import re
 from typing import List, Optional, Union
 from itertools import chain
@@ -14,6 +15,7 @@
 from psm_utils import PSMList
 from pyteomics import mass, mzml, mgf
 from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode
+from rich.progress import track
 
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
 from ms2rescore.utils import infer_spectrum_path
@@ -42,8 +44,9 @@ def __init__(
         *args,
         spectrum_path: Optional[str] = None,
         spectrum_id_pattern: str = "(.*)",
-        fragmentationmodel: str = "All",
-        massmode: str = "Monoisotopic",
+        fragmentation_model: str = "All",
+        mass_mode: str = "Monoisotopic",
+        processes: int = 1,
         **kwargs,
     ) -> None:
         """
@@ -61,8 +64,9 @@ def __init__(
         super().__init__(*args, **kwargs)
         self.spectrum_path = spectrum_path
         self.spectrum_id_pattern = spectrum_id_pattern
-        self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()]
-        self.mass_mode = MASS_MODES[massmode.lower()]
+        self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()]
+        self.mass_mode = MASS_MODES[mass_mode.lower()]
+        self.processes = processes
 
     @property
     def feature_names(self) -> List[str]:
@@ -98,26 +102,17 @@ def add_features(self, psm_list: PSMList) -> None:
                 spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
                 logger.debug(f"Using spectrum file `{spectrum_filename}`")
 
-                annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename)
-                self._calculate_features(psm_list_run, annotated_spectra)
+                self._calculate_features(psm_list_run, spectrum_filename)
                 current_run += 1
 
-    def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None:
-        """Calculate features from annotated spectra and add them to the PSMs."""
-        for psm, spectrum in zip(psm_list, annotated_spectra):
-            psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform))
-
-    def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List:
+    def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
         """Retrieve annotated spectra for all psms."""
 
-        annotated_spectra = []
         spectrum_reader = self._read_spectrum_file(spectrum_file)
 
         spectrum_id_pattern = re.compile(
             self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
         )
-        logger.info(spectrum_id_pattern)
-        logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys())
 
         try:
             mapper = {
@@ -128,20 +123,43 @@ def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List:
             raise ParseSpectrumError(
                 "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
             )
-
-        for psm in psm_list:
-            pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id])
-            annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum)
-            annotated_spectra.append(annotated_spectrum)
-
-        return annotated_spectra
+        annotated_spectra = [
+            self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id]))
+            for psm in psm_list
+        ]
+        for psm, annotated_spectrum in zip(psm_list, annotated_spectra):
+            psm.rescoring_features.update(
+                self._calculate_spectrum_features(psm, annotated_spectrum)
+            )
+        # with multiprocessing.Pool(self.processes) as pool:
+        #     counts_failed = 0
+        #     for psm, features in zip(
+        #         psm_list,
+        #         track(
+        #             pool.imap(
+        #                 self._calculate_spectrum_features_wrapper,
+        #                 zip(psm_list, annotated_spectra),
+        #                 chunksize=1000,
+        #             ),
+        #             total=len(psm_list),
+        #             description="Calculating MS2 features...",
+        #             transient=True,
+        #         ),
+        #     ):
+        #         if features:
+        #             psm.rescoring_features.update(features)
+
+        #         else:
+        #             counts_failed += 1
+        # if counts_failed > 0:
+        #     logger.warning(f"Failed to calculate features for {counts_failed} PSMs")
 
     @staticmethod
     def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
 
-        if spectrum_filepath.suffix == ".mzML":
+        if spectrum_filepath.suffix.lower() == ".mzml":
             return mzml.PreIndexedMzML(str(spectrum_filepath))
-        elif spectrum_filepath.suffix == ".mgf":
+        elif spectrum_filepath.suffix.lower() == ".mgf":
             return mgf.IndexedMGF(str(spectrum_filepath))
 
     @staticmethod
@@ -155,40 +173,48 @@ def _longest_ion_sequence(lst):
 
         return max_sequence
 
-    def extract_spectrum_features(self, annotated_spectrum, peptidoform):
+    def _calculate_spectrum_features(self, psm, annotated_spectrum):
+
         features = defaultdict(list)
-        b_ions_matched = [False] * (len(peptidoform.sequence))
-        y_ions_matched = [False] * (len(peptidoform.sequence))
+        b_ions_matched = [False] * (len(psm.peptidoform.sequence))
+        y_ions_matched = [False] * (len(psm.peptidoform.sequence))
 
         pseudo_count = 0.00001
+        ion_fragment_regex = re.compile(r"\d+")
 
-        for annotated_peak in annotated_spectrum.spectrum:
-            features["total_intensity"].append(annotated_peak.intensity)
+        for peak in annotated_spectrum:
+            features["total_intensity"].append(peak.intensity)
 
-            if annotated_peak.annotation:
-                features["matched_intensity"].append(annotated_peak.intensity)
-                for matched_ion in annotated_peak.annotation:
+            if peak.annotation:
+                features["matched_intensity"].append(peak.intensity)
+                for matched_ion in peak.annotation:
                     if "y" in matched_ion.ion:
-                        features["y_ion_matched"].append(annotated_peak.intensity)
-                        y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+                        features["y_ion_matched"].append(peak.intensity)
+                        y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = (
+                            True
+                        )
                     elif "b" in matched_ion.ion:
-                        features["b_ion_matched"].append(annotated_peak.intensity)
-                        b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True
+                        features["b_ion_matched"].append(peak.intensity)
+                        b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = (
+                            True
+                        )
+
+        total_intensity_sum = np.sum(features["total_intensity"])
+        matched_intensity_sum = np.sum(features["matched_intensity"])
+        b_ion_matched_sum = np.sum(features["b_ion_matched"])
+        y_ion_matched_sum = np.sum(features["y_ion_matched"])
 
         return {
-            "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count),
-            "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count),
+            "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count),
+            "ln_total_intensity": np.log(total_intensity_sum + pseudo_count),
             "ln_explained_intensity_ratio": np.log(
-                np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"])
-                + pseudo_count
+                matched_intensity_sum / total_intensity_sum + pseudo_count
             ),
             "ln_explained_b_ion_ratio": np.log(
-                np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"])
-                + pseudo_count
+                b_ion_matched_sum / matched_intensity_sum + pseudo_count
             ),
             "ln_explained_y_ion_ratio": np.log(
-                np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"])
-                + pseudo_count
+                y_ion_matched_sum / matched_intensity_sum + pseudo_count
             ),
             "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched),
             "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched),
@@ -200,44 +226,31 @@ def extract_spectrum_features(self, annotated_spectrum, peptidoform):
             / (len(b_ions_matched) + len(y_ions_matched)),
         }
 
-    def _annotate_spectrum(self, psm, spectrum):
+    def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple):
+        psm, spectrum = psm_spectrum_tuple
+        return self._calculate_spectrum_features(psm, spectrum)
+
+    def _annotate_spectrum(self, psm, pyteomics_spectrum):
 
         spectrum = RawSpectrum(
             title=psm.spectrum_id,
             num_scans=1,
             rt=psm.retention_time,
             precursor_charge=psm.get_precursor_charge(),
-            precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition),
-            mz_array=spectrum["m/z array"],
-            intensity_array=spectrum["intensity array"],
+            precursor_mass=mass.calculate_mass(psm.peptidoform.composition),
+            mz_array=pyteomics_spectrum["m/z array"],
+            intensity_array=pyteomics_spectrum["intensity array"],
         )
 
         annotated_spectrum = spectrum.annotate(
             peptide=LinearPeptide(psm.peptidoform.proforma),
-            model=self.fragmentationmodel,
-            mode=self.massmode,
+            model=self.fragmentation_model,
+            mode=self.mass_mode,
         )
 
-        return annotated_spectrum
+        return annotated_spectrum.spectrum
 
 
 # TODO: keep this here?
 def modification_evidence():
     return
-
-
-# TODO: move to basic feature generator
-def extract_psm_features(psm):
-
-    expmass = (psm.precursor_mz * psm.get_precursor_charge()) - (
-        psm.get_precursor_charge() * 1.007276
-    )  # TODO: replace by constant
-    calcmass = mass.calculate_mass(psm.peptidoform.composition)
-    delta_mass = expmass - calcmass
-
-    return {
-        "peptide_len": len(psm.peptidoform.sequence),
-        "expmass": expmass,
-        "calcmass": calcmass,
-        "delta_mass": delta_mass,
-    }

From 947233ee0039757676ea5b62daaacf2e1059089f Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Wed, 28 Feb 2024 11:12:32 +0100
Subject: [PATCH 10/23] return empty list on parsing error with rustyms,
 removed multiprocessing

---
 ms2rescore/feature_generators/ms2.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index 997accb2..b292d51a 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -4,7 +4,6 @@
 """
 
 import logging
-import multiprocessing
 import re
 from typing import List, Optional, Union
 from itertools import chain
@@ -15,7 +14,6 @@
 from psm_utils import PSMList
 from pyteomics import mass, mzml, mgf
 from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode
-from rich.progress import track
 
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
 from ms2rescore.utils import infer_spectrum_path
@@ -62,6 +60,7 @@ def __init__(
 
         """
         super().__init__(*args, **kwargs)
+
         self.spectrum_path = spectrum_path
         self.spectrum_id_pattern = spectrum_id_pattern
         self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()]
@@ -100,7 +99,6 @@ def add_features(self, psm_list: PSMList) -> None:
                 )
                 psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
                 spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
-                logger.debug(f"Using spectrum file `{spectrum_filename}`")
 
                 self._calculate_features(psm_list_run, spectrum_filename)
                 current_run += 1
@@ -175,6 +173,9 @@ def _longest_ion_sequence(lst):
 
     def _calculate_spectrum_features(self, psm, annotated_spectrum):
 
+        if not annotated_spectrum:
+            return {}
+
         features = defaultdict(list)
         b_ions_matched = [False] * (len(psm.peptidoform.sequence))
         y_ions_matched = [False] * (len(psm.peptidoform.sequence))
@@ -241,12 +242,14 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
             mz_array=pyteomics_spectrum["m/z array"],
             intensity_array=pyteomics_spectrum["intensity array"],
         )
-
-        annotated_spectrum = spectrum.annotate(
-            peptide=LinearPeptide(psm.peptidoform.proforma),
-            model=self.fragmentation_model,
-            mode=self.mass_mode,
-        )
+        try:
+            annotated_spectrum = spectrum.annotate(
+                peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]),
+                model=self.fragmentation_model,
+                mode=self.mass_mode,
+            )
+        except:  # noqa E722
+            return []
 
         return annotated_spectrum.spectrum
 

From 24ce565542fb42cea534b81b53093435dba2a36c Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <Arthur.Declercq@UGent.be>
Date: Fri, 15 Mar 2024 09:47:02 +0100
Subject: [PATCH 11/23] add deeplc_calibration psm set

---
 ms2rescore/feature_generators/deeplc.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
index 30bea716..207f8ba1 100644
--- a/ms2rescore/feature_generators/deeplc.py
+++ b/ms2rescore/feature_generators/deeplc.py
@@ -25,6 +25,7 @@
 
 import numpy as np
 from psm_utils import PSMList
+from psm_utils.io import read_file
 
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
 
@@ -40,6 +41,7 @@ def __init__(
         *args,
         lower_score_is_better: bool = False,
         calibration_set_size: Union[int, float, None] = None,
+        calibration_set: Union[str, None] = None,
         processes: int = 1,
         **kwargs,
     ) -> None:
@@ -71,6 +73,7 @@ def __init__(
 
         self.lower_psm_score_better = lower_score_is_better
         self.calibration_set_size = calibration_set_size
+        self.calibration_set = calibration_set
         self.processes = processes
         self.deeplc_kwargs = kwargs or {}
 
@@ -120,7 +123,9 @@ def add_features(self, psm_list: PSMList) -> None:
         # Run DeepLC for each spectrum file
         current_run = 1
         total_runs = sum(len(runs) for runs in psm_dict.values())
-
+        if self.calibration_set:
+            logger.info("Loading calibration set...")
+            self.calibration_set = read_file(self.calibration_set, filetype="tsv")
         for runs in psm_dict.values():
             # Reset DeepLC predictor for each collection of runs
             self.deeplc_predictor = None
@@ -138,13 +143,19 @@ def add_features(self, psm_list: PSMList) -> None:
                 )
 
                 # Disable wild logging to stdout by Tensorflow, unless in debug mode
-                with contextlib.redirect_stdout(
-                    open(os.devnull, "w")
-                ) if not self._verbose else contextlib.nullcontext():
+                with (
+                    contextlib.redirect_stdout(open(os.devnull, "w"))
+                    if not self._verbose
+                    else contextlib.nullcontext()
+                ):
                     # Make new PSM list for this run (chain PSMs per spectrum to flat list)
                     psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-
-                    psm_list_calibration = self._get_calibration_psms(psm_list_run)
+                    if self.calibration_set:
+                        psm_list_calibration = self.calibration_set[
+                            self.calibration_set["run"] == run
+                        ]
+                    else:
+                        psm_list_calibration = self._get_calibration_psms(psm_list_run)
                     logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
                     self.deeplc_predictor = self.DeepLC(
                         n_jobs=self.processes,

From 33c38b04e304e6b865714adb26b813c647911628 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Wed, 17 Apr 2024 17:10:38 +0200
Subject: [PATCH 12/23] remove unused import

---
 ms2rescore/utils.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 28249c51..3515893a 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -4,9 +4,6 @@
 from glob import glob
 from pathlib import Path
 from typing import Optional, Union
-import cProfile
-import pstats
-import io
 
 from ms2rescore.exceptions import MS2RescoreConfigurationError
 

From 11fdc51ad70c225393553f5964f7485a60b7ae19 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Wed, 21 Aug 2024 13:25:10 +0200
Subject: [PATCH 13/23] integrate mumble into ms2branch

---
 ms2rescore/exceptions.py                    |  6 +++
 ms2rescore/package_data/config_default.json |  1 +
 ms2rescore/package_data/config_schema.json  | 42 +++++++++++++++++++++
 ms2rescore/parse_psms.py                    | 14 +++++++
 pyproject.toml                              |  1 +
 5 files changed, 64 insertions(+)

diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py
index ba5492a2..68b1b43d 100644
--- a/ms2rescore/exceptions.py
+++ b/ms2rescore/exceptions.py
@@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError):
     """Error while rescoring PSMs."""
 
     pass
+
+
+class ParseSpectrumError(MS2RescoreError):
+    """Error while rescoring PSMs."""
+
+    pass
diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json
index 733ea707..126fde8b 100644
--- a/ms2rescore/package_data/config_default.json
+++ b/ms2rescore/package_data/config_default.json
@@ -12,6 +12,7 @@
             },
             "maxquant": {}
         },
+        "psm_generator": {},
         "rescoring_engine": {
             "mokapot": {
                 "train_fdr": 0.01,
diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
index ab77f3b7..40471714 100644
--- a/ms2rescore/package_data/config_schema.json
+++ b/ms2rescore/package_data/config_schema.json
@@ -62,6 +62,18 @@
                         "mokapot": {}
                     }
                 },
+                "psm_generator": {
+                    "description": "PSM generator and their configuration.",
+                    "type": "object",
+                    "minProperties": 0,
+                    "maxProperties": 1,
+                    "patternProperties": {
+                        ".*": { "$ref": "#/definitions/psm_generator" },
+                        "mumble": {
+                            "$ref": "#/definitions/mumble"
+                        }
+                    }
+                },
                 "config_file": {
                     "description": "Path to configuration file",
                     "oneOf": [{ "type": "string" }, { "type": "null" }]
@@ -179,6 +191,7 @@
                     "type": "boolean",
                     "default": false
                 }
+                
             }
         }
     },
@@ -193,6 +206,11 @@
             "type": "object",
             "additionalProperties": true
         },
+        "psm_generator": {
+            "description": "Additional PSM feature generator configuration",
+            "type": "object",
+            "additionalProperties": true
+        },
         "basic": {
             "$ref": "#/definitions/feature_generator",
             "description": "Basic feature generator configuration",
@@ -273,6 +291,30 @@
                 }
             }
         },
+        "mumble": {
+            "$ref": "#/definitions/psm_generator",
+            "description": "Mumble PSM generator configuration using Mubmle",
+            "type": "object",
+            "additionalProperties": true,
+            "properties": {
+                "aa_combinations": {
+                    "description": "Additional amino acid combinations to consider as mass shift",
+                    "type": "integer",
+                    "default": 0
+                },
+                "fasta_file": {
+                    "description": "Maximum number of modifications per peptide",
+                    "oneOf": [{ "type": "string" }, { "type": "null" }],
+                    "default": false
+                },
+                "mass_error": {
+                    "description": "mass error in Da",
+                    "type": "number",
+                    "default": 0.2
+                }
+            }
+        },
+
         "mokapot": {
             "$ref": "#/definitions/rescoring_engine",
             "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.",
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 6a3c6065..ae28b519 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -5,6 +5,7 @@
 import numpy as np
 import psm_utils.io
 from psm_utils import PSMList
+from mumble import PSMHandler
 
 from ms2rescore.exceptions import MS2RescoreConfigurationError
 
@@ -90,6 +91,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
         new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
         psm_list["spectrum_id"] = new_ids
 
+    # Addition of Modifications for mass shifts in the PSMs with Mumble
+    if "mumble" in config["psm_generator"]:
+        logger.debug("Applying modifications for mass shifts using Mumble...")
+        mumble_config = config["psm_generator"]["mumble"]
+        psm_handler = PSMHandler(
+            **mumble_config,  # TODO how do we store config for mumble?
+        )
+        psm_list = psm_handler.add_modified_psms(
+            psm_list, generate_modified_decoys=True, keep_original=True
+        )
+        if mumble_config["output_file"]:
+            psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"])
+
     return psm_list
 
 
diff --git a/pyproject.toml b/pyproject.toml
index f57ad800..7c1b0a9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ dependencies = [
     "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
+    "mumble"
 ]
 
 [project.optional-dependencies]

From 883169a90cf7c57970dac1085ba54defbf1ffdb9 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Fri, 27 Sep 2024 09:58:27 +0200
Subject: [PATCH 14/23] temp removal of sage features before rescoring

---
 ms2rescore/parse_psms.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index ae28b519..91c94168 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -95,6 +95,15 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
         mumble_config = config["psm_generator"]["mumble"]
+        for psm in psm_list:
+            psm["rescoring_features"].pop("expmass", None)
+            psm["rescoring_features"].pop("calcmass", None)
+            psm["rescoring_features"].pop("peptide_len", None)
+            psm["rescoring_features"].pop("matched_peaks", None)
+            psm["rescoring_features"].pop("longest_b", None)
+            psm["rescoring_features"].pop("longest_y", None)
+            psm["rescoring_features"].pop("longest_y_pct", None)
+            psm["rescoring_features"].pop("matched_intensity_pct", None)
         psm_handler = PSMHandler(
             **mumble_config,  # TODO how do we store config for mumble?
         )

From da39ae81d052b4391ada17fd24bb5a856078e8a8 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Fri, 8 Nov 2024 15:41:26 +0100
Subject: [PATCH 15/23] remove psm_file features when rescoring with mumble

---
 ms2rescore/parse_psms.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 91c94168..309c1c96 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -95,15 +95,14 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
         mumble_config = config["psm_generator"]["mumble"]
-        for psm in psm_list:
-            psm["rescoring_features"].pop("expmass", None)
-            psm["rescoring_features"].pop("calcmass", None)
-            psm["rescoring_features"].pop("peptide_len", None)
-            psm["rescoring_features"].pop("matched_peaks", None)
-            psm["rescoring_features"].pop("longest_b", None)
-            psm["rescoring_features"].pop("longest_y", None)
-            psm["rescoring_features"].pop("longest_y_pct", None)
-            psm["rescoring_features"].pop("matched_intensity_pct", None)
+
+        # Check if psm_list[0].rescoring_features is empty or not
+        if psm_list[0].rescoring_features:
+            logger.debug("Removing psm_file rescoring features from PSMs...")
+            # psm_list.remove_rescoring_features() # TODO add this to psm_utils
+            for psm in psm_list:
+                psm.rescoring_features = {}
+
         psm_handler = PSMHandler(
             **mumble_config,  # TODO how do we store config for mumble?
         )

From 37fff28ebc03fcf66f98be5a5c51ca7e3b68e97b Mon Sep 17 00:00:00 2001
From: SamvPy <sam.vanpuyenbroeck@gmail.com>
Date: Tue, 19 Nov 2024 16:10:43 +0100
Subject: [PATCH 16/23] linting

---
 ms2rescore/core.py                         |  5 ++++-
 ms2rescore/feature_generators/__init__.py  |  4 ++--
 ms2rescore/feature_generators/deeplc.py    |  8 +++++---
 ms2rescore/feature_generators/ionmob.py    | 11 +++++++++--
 ms2rescore/feature_generators/ms2pip.py    |  5 ++++-
 ms2rescore/gui/__main__.py                 |  2 +-
 ms2rescore/parse_psms.py                   |  2 +-
 ms2rescore/rescoring_engines/percolator.py |  2 +-
 8 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index a02ab4f6..48e125bc 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -14,7 +14,10 @@
 from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
+from ms2rescore.rescoring_engines.mokapot import (
+    add_peptide_confidence,
+    add_psm_confidence,
+)
 
 logger = logging.getLogger(__name__)
 
diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py
index 7a10fd4c..9eff1b5c 100644
--- a/ms2rescore/feature_generators/__init__.py
+++ b/ms2rescore/feature_generators/__init__.py
@@ -4,11 +4,11 @@
 
 from ms2rescore.feature_generators.basic import BasicFeatureGenerator
 from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator
+from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
 from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
 from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
-from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
-from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
 from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator
+from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
 
 FEATURE_GENERATORS = {
     "basic": BasicFeatureGenerator,
diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
index 206d6a21..e3113544 100644
--- a/ms2rescore/feature_generators/deeplc.py
+++ b/ms2rescore/feature_generators/deeplc.py
@@ -143,9 +143,11 @@ def add_features(self, psm_list: PSMList) -> None:
                 )
 
                 # Disable wild logging to stdout by Tensorflow, unless in debug mode
-                with contextlib.redirect_stdout(
-                    open(os.devnull, "w", encoding="utf-8")
-                ) if not self._verbose else contextlib.nullcontext():
+                with (
+                    contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8"))
+                    if not self._verbose
+                    else contextlib.nullcontext()
+                ):
                     # Make new PSM list for this run (chain PSMs per spectrum to flat list)
                     psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
                     if self.calibration_set:
diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py
index 7fa0c0a1..e9b51026 100644
--- a/ms2rescore/feature_generators/ionmob.py
+++ b/ms2rescore/feature_generators/ionmob.py
@@ -23,12 +23,19 @@
 import tensorflow as tf
 from psm_utils import Peptidoform, PSMList
 
-from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
+from ms2rescore.feature_generators.base import (
+    FeatureGeneratorBase,
+    FeatureGeneratorException,
+)
 
 try:
     from ionmob import __file__ as ionmob_file
     from ionmob.preprocess.data import to_tf_dataset_inference
-    from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs
+    from ionmob.utilities.chemistry import (
+        VARIANT_DICT,
+        calculate_mz,
+        reduced_mobility_to_ccs,
+    )
     from ionmob.utilities.tokenization import tokenizer_from_json
     from ionmob.utilities.utility import get_ccs_shift
 except ImportError:
diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py
index 3882b3fc..34d427e2 100644
--- a/ms2rescore/feature_generators/ms2pip.py
+++ b/ms2rescore/feature_generators/ms2pip.py
@@ -37,7 +37,10 @@
 from psm_utils import PSMList
 from rich.progress import track
 
-from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException
+from ms2rescore.feature_generators.base import (
+    FeatureGeneratorBase,
+    FeatureGeneratorException,
+)
 from ms2rescore.utils import infer_spectrum_path
 
 logger = logging.getLogger(__name__)
diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py
index 429e117f..57cee3c6 100644
--- a/ms2rescore/gui/__main__.py
+++ b/ms2rescore/gui/__main__.py
@@ -1,8 +1,8 @@
 """Entrypoint for MS²Rescore GUI."""
 
+import contextlib
 import multiprocessing
 import os
-import contextlib
 
 from ms2rescore.gui.app import app
 
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 309c1c96..cc61926f 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -4,8 +4,8 @@
 
 import numpy as np
 import psm_utils.io
-from psm_utils import PSMList
 from mumble import PSMHandler
+from psm_utils import PSMList
 
 from ms2rescore.exceptions import MS2RescoreConfigurationError
 
diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py
index 192950bd..32d856c4 100644
--- a/ms2rescore/rescoring_engines/percolator.py
+++ b/ms2rescore/rescoring_engines/percolator.py
@@ -19,8 +19,8 @@
 
 import logging
 import subprocess
-from typing import Any, Dict, Optional
 from copy import deepcopy
+from typing import Any, Dict, Optional
 
 import psm_utils
 

From e8b59f3ca2c6f1fbaae4ec5cc6b531f25284fad7 Mon Sep 17 00:00:00 2001
From: SamvPy <sam.vanpuyenbroeck@gmail.com>
Date: Tue, 19 Nov 2024 16:10:56 +0100
Subject: [PATCH 17/23] add hyperscore calculation

---
 ms2rescore/feature_generators/ms2.py | 183 ++++++++++++++++++++++++++-
 1 file changed, 176 insertions(+), 7 deletions(-)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index b292d51a..490f4f47 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -5,19 +5,19 @@
 
 import logging
 import re
-from typing import List, Optional, Union
-from itertools import chain
 from collections import defaultdict
-
+from itertools import chain
+from typing import List, Optional, Union
 
 import numpy as np
-from psm_utils import PSMList
-from pyteomics import mass, mzml, mgf
-from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode
+import pyopenms as oms
+from psm_utils import PSM, Peptidoform, PSMList
+from pyteomics import mass, mgf, mzml
+from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum
 
+from ms2rescore.exceptions import ParseSpectrumError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
 from ms2rescore.utils import infer_spectrum_path
-from ms2rescore.exceptions import ParseSpectrumError
 
 logger = logging.getLogger(__name__)
 
@@ -45,6 +45,7 @@ def __init__(
         fragmentation_model: str = "All",
         mass_mode: str = "Monoisotopic",
         processes: int = 1,
+        calculate_hyperscore: bool = True,  # Allow optional ?
         **kwargs,
     ) -> None:
         """
@@ -66,6 +67,7 @@ def __init__(
         self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()]
         self.mass_mode = MASS_MODES[mass_mode.lower()]
         self.processes = processes
+        self.calculate_hyperscore = calculate_hyperscore
 
     @property
     def feature_names(self) -> List[str]:
@@ -82,6 +84,7 @@ def feature_names(self) -> List[str]:
             "matched_y_ions",
             "matched_y_ions_pct",
             "matched_ions_pct",
+            "hyperscore",
         ]
 
     def add_features(self, psm_list: PSMList) -> None:
@@ -129,6 +132,22 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
             psm.rescoring_features.update(
                 self._calculate_spectrum_features(psm, annotated_spectrum)
             )
+
+            if self.calculate_hyperscore:
+                # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?)
+                mz_list, intensity_list = _annotated_spectrum_to_mzint(
+                    annotated_spectrum=annotated_spectrum
+                )
+                psm.rescoring_features.update(
+                    {
+                        "hyperscore": calculate_hyperscore(
+                            psm=psm, observed_mz=mz_list, observed_intensity=intensity_list
+                        )
+                    }
+                )
+
+            else:
+                psm.rescoring_features.update({"hyperscore": np.nan})
         # with multiprocessing.Pool(self.processes) as pool:
         #     counts_failed = 0
         #     for psm, features in zip(
@@ -253,7 +272,157 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
 
         return annotated_spectrum.spectrum
 
+    def _calculate_hyperscore(self, psm, spectrum):
+        pass
+
+    def _calculate_fragmentation_features(self, psm, annotated_spectrum):
+        pass
+
 
 # TODO: keep this here?
 def modification_evidence():
     return
+
+
+def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]):
+
+    mz_list = []
+    intensity_list = []
+
+    for peak in annotated_spectrum:
+
+        annotations = peak.annotation
+        for fragment in annotations:
+            ion = fragment.ion
+            if ion[0] not in ion_types:
+                continue
+
+            mz_list.append(peak.experimental_mz)
+            intensity_list.append(peak.intensity)
+            break
+
+    return mz_list, intensity_list
+
+
+def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]:
+    """
+    Parse a peptidoform object to pyOpenMS compatible format.
+
+    Parameter
+    ---------
+    Peptidoform: Peptidoform
+        Peptide string in Peptidoform format
+
+    Returns
+    -------
+    AASequence (pyOpenMS):
+        A peptide sequence in pyOpenMS format
+    int:
+        charge of the peptide
+    """
+
+    n_term = peptidoform.properties["n_term"]
+    peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else ""
+
+    for aa, mods in peptidoform.parsed_sequence:
+        peptide_oms_str += aa
+        if isinstance(mods, list):
+            peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]"
+
+    c_term = peptidoform.properties["c_term"]
+    peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else ""
+
+    peptide_oms = oms.AASequence.fromString(peptide_oms_str)
+
+    return peptide_oms
+
+
+def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum:
+    """
+    Create a theoretical spectrum from a peptide sequence.
+
+    Parameter
+    ---------
+    peptide: str
+        Peptide sequence in proforma format
+    engine: str
+        The engine to use to create theoretical spectrum.
+        Can only be 'pyopenms' or 'spectrum-utils' (default)
+
+    Return
+    ------
+    MSSpectrum
+        Spectrum object in pyOpenMS format
+    """
+    # Reformat peptide sequence in pyOpenMS format
+    peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform)
+
+    # Initialize the required objects to create the spectrum
+    spectrum = oms.MSSpectrum()
+    tsg = oms.TheoreticalSpectrumGenerator()
+    p = oms.Param()
+
+    p.setValue("add_b_ions", "true")
+    p.setValue("add_metainfo", "true")
+    tsg.setParameters(param=p)
+
+    # Create the theoretical spectrum
+    tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2)
+    return spectrum
+
+
+def calculate_hyperscore(
+    psm: PSM,
+    observed_mz: List[float],
+    observed_intensity: List[float],
+    fragment_tol_mass=20,
+    fragment_tol_mode="ppm",
+):
+    """
+    Calculate the hyperscore as defined in the X!Tandem search engine.
+
+    It is a metric of how good two spectra match with each other (matching peaks).
+
+    Parameters
+    ----------
+    psm: psm_utils.PSM
+        The PSM used to extract 'spectrum_id' (for MGF spectrum extraction)
+        and 'Peptidoform' (the peptide sequence)
+    observed_mz: List[float]
+        List of observed mz values with matching order as observed intensity
+    observed_intensity: List[float]
+        List of observed intensity values
+    fragment_tol_mass: int
+        The allowed tolerance to match peaks
+    fragment_tol_mode: str
+        'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton.
+    Return
+    ------
+    int
+        The hyperscore
+    """
+    if fragment_tol_mode == "ppm":
+        fragment_mass_tolerance_unit_ppm = True
+    elif fragment_tol_mode == "Da":
+        fragment_mass_tolerance_unit_ppm = False
+    else:
+        raise Exception(
+            "fragment_tol_mode can only take 'Da' or 'ppm'. {} was provided.".format(
+                fragment_tol_mode
+            )
+        )
+    if len(observed_intensity) == 0:
+        logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.")
+        return 0.0
+
+    theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform)
+    observed_spectrum_oms = oms.MSSpectrum()
+    observed_spectrum_oms.set_peaks([observed_mz, observed_intensity])
+    hyperscore = oms.HyperScore()
+    result = hyperscore.compute(
+        fragment_mass_tolerance=fragment_tol_mass,
+        fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm,
+        exp_spectrum=observed_spectrum_oms,
+        theo_spectrum=theoretical_spectrum,
+    )
+    return result

From c51cd346f438b2afc97c2b171ea73bcdc60a64eb Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Thu, 21 Nov 2024 17:21:04 +0100
Subject: [PATCH 18/23] calibration fixes

---
 ms2rescore/feature_generators/deeplc.py  | 15 +++++----------
 ms2rescore/feature_generators/im2deep.py |  3 ++-
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
index e3113544..29a249eb 100644
--- a/ms2rescore/feature_generators/deeplc.py
+++ b/ms2rescore/feature_generators/deeplc.py
@@ -123,9 +123,6 @@ def add_features(self, psm_list: PSMList) -> None:
         # Run DeepLC for each spectrum file
         current_run = 1
         total_runs = sum(len(runs) for runs in psm_dict.values())
-        if self.calibration_set:
-            logger.info("Loading calibration set...")
-            self.calibration_set = read_file(self.calibration_set, filetype="tsv")
         for runs in psm_dict.values():
             # Reset DeepLC predictor for each collection of runs
             self.deeplc_predictor = None
@@ -150,12 +147,7 @@ def add_features(self, psm_list: PSMList) -> None:
                 ):
                     # Make new PSM list for this run (chain PSMs per spectrum to flat list)
                     psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-                    if self.calibration_set:
-                        psm_list_calibration = self.calibration_set[
-                            self.calibration_set["run"] == run
-                        ]
-                    else:
-                        psm_list_calibration = self._get_calibration_psms(psm_list_run)
+                    psm_list_calibration = self._get_calibration_psms(psm_list_run)
                     logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
                     self.deeplc_predictor = self.DeepLC(
                         n_jobs=self.processes,
@@ -204,7 +196,10 @@ def add_features(self, psm_list: PSMList) -> None:
 
     def _get_calibration_psms(self, psm_list: PSMList):
         """Get N best scoring target PSMs for calibration."""
-        psm_list_targets = psm_list[~psm_list["is_decoy"]]
+        psm_list_targets = psm_list[
+            ~psm_list["is_decoy"]
+            & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]]
+        ]
         if self.calibration_set_size:
             n_psms = self._get_number_of_calibration_psms(psm_list_targets)
             indices = np.argsort(psm_list_targets["score"])
diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py
index 772f6855..2afa4fcd 100644
--- a/ms2rescore/feature_generators/im2deep.py
+++ b/ms2rescore/feature_generators/im2deep.py
@@ -158,7 +158,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p
         identified_psms = psm_list_df[
             (psm_list_df["qvalue"] < 0.01)
             & (~psm_list_df["is_decoy"])
-            & (psm_list_df["charge"] < 5)  # predictions do not go higher for IM2Deep
+            & (psm_list_df["charge"] < 7)  # predictions do not go higher for IM2Deep
+            & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]])
         ]
         calibration_psms = identified_psms[
             identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)

From 295e37f20d2e344f46bf5fc9d911f6955210f184 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Thu, 21 Nov 2024 17:21:16 +0100
Subject: [PATCH 19/23] changes for mumble implementation

---
 ms2rescore/feature_generators/ms2.py | 38 ----------------------------
 ms2rescore/parse_psms.py             |  8 ++----
 pyproject.toml                       |  1 -
 3 files changed, 2 insertions(+), 45 deletions(-)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index 490f4f47..a3bd6d77 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -23,7 +23,6 @@
 
 FRAGMENTATION_MODELS = {
     "cidhcd": FragmentationModel.CidHcd,
-    "etcid": FragmentationModel.Etcid,
     "etd": FragmentationModel.Etd,
     "ethcd": FragmentationModel.Ethcd,
     "all": FragmentationModel.All,
@@ -148,28 +147,6 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
 
             else:
                 psm.rescoring_features.update({"hyperscore": np.nan})
-        # with multiprocessing.Pool(self.processes) as pool:
-        #     counts_failed = 0
-        #     for psm, features in zip(
-        #         psm_list,
-        #         track(
-        #             pool.imap(
-        #                 self._calculate_spectrum_features_wrapper,
-        #                 zip(psm_list, annotated_spectra),
-        #                 chunksize=1000,
-        #             ),
-        #             total=len(psm_list),
-        #             description="Calculating MS2 features...",
-        #             transient=True,
-        #         ),
-        #     ):
-        #         if features:
-        #             psm.rescoring_features.update(features)
-
-        #         else:
-        #             counts_failed += 1
-        # if counts_failed > 0:
-        #     logger.warning(f"Failed to calculate features for {counts_failed} PSMs")
 
     @staticmethod
     def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
@@ -246,10 +223,6 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum):
             / (len(b_ions_matched) + len(y_ions_matched)),
         }
 
-    def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple):
-        psm, spectrum = psm_spectrum_tuple
-        return self._calculate_spectrum_features(psm, spectrum)
-
     def _annotate_spectrum(self, psm, pyteomics_spectrum):
 
         spectrum = RawSpectrum(
@@ -272,17 +245,6 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
 
         return annotated_spectrum.spectrum
 
-    def _calculate_hyperscore(self, psm, spectrum):
-        pass
-
-    def _calculate_fragmentation_features(self, psm, annotated_spectrum):
-        pass
-
-
-# TODO: keep this here?
-def modification_evidence():
-    return
-
 
 def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]):
 
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index cc61926f..2f0ec3b0 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -104,13 +104,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
                 psm.rescoring_features = {}
 
         psm_handler = PSMHandler(
-            **mumble_config,  # TODO how do we store config for mumble?
+            **mumble_config,
         )
-        psm_list = psm_handler.add_modified_psms(
-            psm_list, generate_modified_decoys=True, keep_original=True
-        )
-        if mumble_config["output_file"]:
-            psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"])
+        psm_list = psm_handler.add_modified_psms(psm_list)
 
     return psm_list
 
diff --git a/pyproject.toml b/pyproject.toml
index 4285f1fa..985cc27e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,6 @@ dependencies = [
     "pyteomics>=4.7.2",
     "rich>=12",
     "tomli>=2; python_version < '3.11'",
-    "mumble"
 ]
 
 [project.optional-dependencies]

From 909860d9c6f81490f9af3f6de364b606ddd7fe9c Mon Sep 17 00:00:00 2001
From: SamvPy <sam.vanpuyenbroeck@gmail.com>
Date: Fri, 22 Nov 2024 11:51:23 +0100
Subject: [PATCH 20/23] change openms peptide formatting

---
 ms2rescore/feature_generators/ms2.py | 121 ++++++++++++++++++++++-----
 1 file changed, 98 insertions(+), 23 deletions(-)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index a3bd6d77..eb049ed8 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -12,8 +12,11 @@
 import numpy as np
 import pyopenms as oms
 from psm_utils import PSM, Peptidoform, PSMList
+from psm_utils.peptidoform import PeptidoformException
+from pyteomics.proforma import ProFormaError
 from pyteomics import mass, mgf, mzml
 from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum
+from pathlib import Path
 
 from ms2rescore.exceptions import ParseSpectrumError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
@@ -68,6 +71,7 @@ def __init__(
         self.processes = processes
         self.calculate_hyperscore = calculate_hyperscore
 
+
     @property
     def feature_names(self) -> List[str]:
         return [
@@ -137,19 +141,20 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
                 mz_list, intensity_list = _annotated_spectrum_to_mzint(
                     annotated_spectrum=annotated_spectrum
                 )
+                hyperscore = calculate_hyperscore(
+                    psm=psm, observed_mz=mz_list, observed_intensity=intensity_list
+                )
+                if hyperscore is None:
+                    continue
                 psm.rescoring_features.update(
                     {
-                        "hyperscore": calculate_hyperscore(
-                            psm=psm, observed_mz=mz_list, observed_intensity=intensity_list
-                        )
+                        "hyperscore": hyperscore
                     }
                 )
 
-            else:
-                psm.rescoring_features.update({"hyperscore": np.nan})
 
     @staticmethod
-    def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
+    def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
 
         if spectrum_filepath.suffix.lower() == ".mzml":
             return mzml.PreIndexedMzML(str(spectrum_filepath))
@@ -235,8 +240,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
             intensity_array=pyteomics_spectrum["intensity array"],
         )
         try:
+            linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0])
             annotated_spectrum = spectrum.annotate(
-                peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]),
+                peptide=linear_peptide,
                 model=self.fragmentation_model,
                 mode=self.mass_mode,
             )
@@ -246,6 +252,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
         return annotated_spectrum.spectrum
 
 
+############################
+### HYPERSCORE FUNCTIONS ###
+############################
 def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]):
 
     mz_list = []
@@ -268,35 +277,87 @@ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]):
 
 def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]:
     """
-    Parse a peptidoform object to pyOpenMS compatible format.
+    Parse a peptide sequence in proforma format to pyOpenMS compatible format.
+
+    Only supports UNIMOD format.
 
     Parameter
     ---------
-    Peptidoform: Peptidoform
-        Peptide string in Peptidoform format
+    peptide: str
+        Peptide string in proforma format
 
     Returns
     -------
     AASequence (pyOpenMS):
         A peptide sequence in pyOpenMS format
-    int:
-        charge of the peptide
     """
+    peptide = peptidoform.proforma
+
+    # Reformat unimod modifications
+    pattern_unimod = r"\[UNIMOD:(\d+)\]"
+
+    def replace_unimod(match):
+        return f"(UniMod:{match.group(1)})"
+
+    peptide_oms_str = re.sub(
+        pattern=pattern_unimod, repl=replace_unimod, string=peptide
+    )
+
+    # Parse N-terminal modifications
+    if ")-" in peptide_oms_str:
+        peptide_oms_list = peptide_oms_str.split(")-")
+        nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1]
+        nterm_modification += ")"
+        peptide_oms_str = "." + nterm_modification + peptide_oms_str + "."
+    elif "]-" in peptide_oms_str:
+        peptide_oms_list = peptide_oms_str.split("]-")
+        nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1]
+        nterm_modification += "]"
+        peptide_oms_str = "." + nterm_modification + peptide_oms_str + "."
 
-    n_term = peptidoform.properties["n_term"]
-    peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else ""
+    # Split the charge from the peptide string
+    if "/" in peptide_oms_str:
+        peptide_oms_str, _ = peptide_oms_str.split("/")
 
-    for aa, mods in peptidoform.parsed_sequence:
-        peptide_oms_str += aa
-        if isinstance(mods, list):
-            peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]"
+    try:
+        peptide_oms = oms.AASequence.fromString(peptide_oms_str)
+        return peptide_oms
 
-    c_term = peptidoform.properties["c_term"]
-    peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else ""
+    except:
+        return
 
-    peptide_oms = oms.AASequence.fromString(peptide_oms_str)
 
-    return peptide_oms
+# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]:
+#     """
+#     Parse a peptidoform object to pyOpenMS compatible format.
+
+#     Parameter
+#     ---------
+#     Peptidoform: Peptidoform
+#         Peptide string in Peptidoform format
+
+#     Returns
+#     -------
+#     AASequence (pyOpenMS):
+#         A peptide sequence in pyOpenMS format
+#     int:
+#         charge of the peptide
+#     """
+
+#     n_term = peptidoform.properties["n_term"]
+#     peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else ""
+
+#     for aa, mods in peptidoform.parsed_sequence:
+#         peptide_oms_str += aa
+#         if isinstance(mods, list):
+#             peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]"
+
+#     c_term = peptidoform.properties["c_term"]
+#     peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else ""
+
+#     peptide_oms = oms.AASequence.fromString(peptide_oms_str)
+
+#     return peptide_oms
 
 
 def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum:
@@ -318,6 +379,8 @@ def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum:
     """
     # Reformat peptide sequence in pyOpenMS format
     peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform)
+    if peptide_oms is None:
+        return
 
     # Initialize the required objects to create the spectrum
     spectrum = oms.MSSpectrum()
@@ -375,9 +438,16 @@ def calculate_hyperscore(
         )
     if len(observed_intensity) == 0:
         logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.")
-        return 0.0
+        return
 
     theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform)
+    # This is mainly the cause of the modification not being allowed according to pyopenms
+    # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids
+    # according to the unimod database
+    if theoretical_spectrum is None:
+        logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}')
+        return
+
     observed_spectrum_oms = oms.MSSpectrum()
     observed_spectrum_oms.set_peaks([observed_mz, observed_intensity])
     hyperscore = oms.HyperScore()
@@ -388,3 +458,8 @@ def calculate_hyperscore(
         theo_spectrum=theoretical_spectrum,
     )
     return result
+
+
+###########################
+### FRAG SITE FUNCTIONS ###
+###########################

From c5902c21c0c5c72e71bb64cafc17c15c28b50191 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Fri, 22 Nov 2024 13:36:41 +0100
Subject: [PATCH 21/23] add mumble psm filtering functionality

---
 ms2rescore/core.py  | 10 +++++++++-
 ms2rescore/utils.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index 48e125bc..f314bef6 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -18,6 +18,7 @@
     add_peptide_confidence,
     add_psm_confidence,
 )
+from ms2rescore.utils import filter_mumble_psms
 
 logger = logging.getLogger(__name__)
 
@@ -102,6 +103,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         )
     psm_list = psm_list[psms_with_features]
 
+    if "mumble" in config["psm_generator"]:
+        # Remove PSMS that have a less matched ions than the original hit
+        psm_list = filter_mumble_psms(psm_list)
+
     # Write feature names to file
     _write_feature_names(feature_names, output_file_root)
 
@@ -251,7 +256,10 @@ def _write_feature_names(feature_names, output_file_root):
 def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
     """Log #PSMs identified before rescoring."""
     id_psms_before = (
-        (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
+        (psm_list["qvalue"] <= 0.01)
+        & (psm_list["rank"] <= max_rank)
+        & (~psm_list["is_decoy"])
+        & ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]])
     ).sum()
     logger.info(
         f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 3515893a..33b1f0a1 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -4,8 +4,10 @@
 from glob import glob
 from pathlib import Path
 from typing import Optional, Union
+import numpy as np
 
 from ms2rescore.exceptions import MS2RescoreConfigurationError
+from psm_utils import PSMList
 
 logger = logging.getLogger(__name__)
 
@@ -93,3 +95,32 @@ def _is_minitdf(spectrum_file: str) -> bool:
     files = set(Path(spectrum_file).glob("*ms2spectrum.bin"))
     files.update(Path(spectrum_file).glob("*ms2spectrum.parquet"))
     return len(files) >= 2
+
+
+def filter_mumble_psms(psm_list: PSMList) -> PSMList:
+    """
+    Filter out PSMs with mumble in the peptide sequence.
+    """
+    keep = [None] * len(psm_list)
+    original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]])
+    spectrum_indices = np.array([psm.spectrum for psm in psm_list])
+    runs = np.array([psm.run for psm in psm_list])
+    if "matched_ions_pct" in psm_list[0].rescoring_features:
+        matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list]
+    else:
+        return psm_list
+    for i, psm in enumerate(psm_list):
+        if isinstance(keep[i], bool):
+            continue
+        elif original_hit[i]:
+            keep[i] = True
+        else:
+            original_matched_ions_pct = np.logical_and.reduce(
+                [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run]
+            )
+            if original_matched_ions_pct > matched_ions[i]:
+                keep[i] = False
+            else:
+                keep[i] = True
+    logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.")
+    return psm_list[keep]

From 5ce55f501d4a52fe6714a904d6402b90d8450613 Mon Sep 17 00:00:00 2001
From: SamvPy <sam.vanpuyenbroeck@gmail.com>
Date: Fri, 22 Nov 2024 15:05:03 +0100
Subject: [PATCH 22/23] remove pyopenms dependency for hyperscore calculation

---
 ms2rescore/feature_generators/ms2.py | 296 +++++++--------------------
 1 file changed, 75 insertions(+), 221 deletions(-)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
index eb049ed8..df12ca95 100644
--- a/ms2rescore/feature_generators/ms2.py
+++ b/ms2rescore/feature_generators/ms2.py
@@ -138,20 +138,14 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
 
             if self.calculate_hyperscore:
                 # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?)
-                mz_list, intensity_list = _annotated_spectrum_to_mzint(
-                    annotated_spectrum=annotated_spectrum
+                b, y = get_by_fragments(annotated_spectrum)
+                hs = calculate_hyperscore(
+                    n_y=len(y),
+                    n_b=len(b),
+                    y_ion_intensities=y,
+                    b_ion_intensities=b
                 )
-                hyperscore = calculate_hyperscore(
-                    psm=psm, observed_mz=mz_list, observed_intensity=intensity_list
-                )
-                if hyperscore is None:
-                    continue
-                psm.rescoring_features.update(
-                    {
-                        "hyperscore": hyperscore
-                    }
-                )
-
+                psm.rescoring_features.update({'hyperscore': hs})
 
     @staticmethod
     def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
@@ -251,215 +245,75 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
 
         return annotated_spectrum.spectrum
 
-
-############################
-### HYPERSCORE FUNCTIONS ###
-############################
-def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]):
-
-    mz_list = []
-    intensity_list = []
-
-    for peak in annotated_spectrum:
-
-        annotations = peak.annotation
-        for fragment in annotations:
-            ion = fragment.ion
-            if ion[0] not in ion_types:
-                continue
-
-            mz_list.append(peak.experimental_mz)
-            intensity_list.append(peak.intensity)
-            break
-
-    return mz_list, intensity_list
-
-
-def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]:
-    """
-    Parse a peptide sequence in proforma format to pyOpenMS compatible format.
-
-    Only supports UNIMOD format.
-
-    Parameter
-    ---------
-    peptide: str
-        Peptide string in proforma format
-
-    Returns
-    -------
-    AASequence (pyOpenMS):
-        A peptide sequence in pyOpenMS format
-    """
-    peptide = peptidoform.proforma
-
-    # Reformat unimod modifications
-    pattern_unimod = r"\[UNIMOD:(\d+)\]"
-
-    def replace_unimod(match):
-        return f"(UniMod:{match.group(1)})"
-
-    peptide_oms_str = re.sub(
-        pattern=pattern_unimod, repl=replace_unimod, string=peptide
-    )
-
-    # Parse N-terminal modifications
-    if ")-" in peptide_oms_str:
-        peptide_oms_list = peptide_oms_str.split(")-")
-        nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1]
-        nterm_modification += ")"
-        peptide_oms_str = "." + nterm_modification + peptide_oms_str + "."
-    elif "]-" in peptide_oms_str:
-        peptide_oms_list = peptide_oms_str.split("]-")
-        nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1]
-        nterm_modification += "]"
-        peptide_oms_str = "." + nterm_modification + peptide_oms_str + "."
-
-    # Split the charge from the peptide string
-    if "/" in peptide_oms_str:
-        peptide_oms_str, _ = peptide_oms_str.split("/")
-
-    try:
-        peptide_oms = oms.AASequence.fromString(peptide_oms_str)
-        return peptide_oms
-
-    except:
-        return
-
-
-# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]:
-#     """
-#     Parse a peptidoform object to pyOpenMS compatible format.
-
-#     Parameter
-#     ---------
-#     Peptidoform: Peptidoform
-#         Peptide string in Peptidoform format
-
-#     Returns
-#     -------
-#     AASequence (pyOpenMS):
-#         A peptide sequence in pyOpenMS format
-#     int:
-#         charge of the peptide
-#     """
-
-#     n_term = peptidoform.properties["n_term"]
-#     peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else ""
-
-#     for aa, mods in peptidoform.parsed_sequence:
-#         peptide_oms_str += aa
-#         if isinstance(mods, list):
-#             peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]"
-
-#     c_term = peptidoform.properties["c_term"]
-#     peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else ""
-
-#     peptide_oms = oms.AASequence.fromString(peptide_oms_str)
-
-#     return peptide_oms
-
-
-def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum:
+ 
+def factorial(n):
     """
-    Create a theoretical spectrum from a peptide sequence.
-
-    Parameter
-    ---------
-    peptide: str
-        Peptide sequence in proforma format
-    engine: str
-        The engine to use to create theoretical spectrum.
-        Can only be 'pyopenms' or 'spectrum-utils' (default)
-
-    Return
-    ------
-    MSSpectrum
-        Spectrum object in pyOpenMS format
+    Compute factorial of n using a loop.
+    Parameters:
+        n (int): Non-negative integer.
+    Returns:
+        int: Factorial of n.
     """
-    # Reformat peptide sequence in pyOpenMS format
-    peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform)
-    if peptide_oms is None:
-        return
-
-    # Initialize the required objects to create the spectrum
-    spectrum = oms.MSSpectrum()
-    tsg = oms.TheoreticalSpectrumGenerator()
-    p = oms.Param()
-
-    p.setValue("add_b_ions", "true")
-    p.setValue("add_metainfo", "true")
-    tsg.setParameters(param=p)
-
-    # Create the theoretical spectrum
-    tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2)
-    return spectrum
-
-
-def calculate_hyperscore(
-    psm: PSM,
-    observed_mz: List[float],
-    observed_intensity: List[float],
-    fragment_tol_mass=20,
-    fragment_tol_mode="ppm",
-):
+    result = 1
+    for i in range(1, n + 1):
+        result *= i
+    return result
+ 
+def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
     """
-    Calculate the hyperscore as defined in the X!Tandem search engine.
-
-    It is a metric of how good two spectra match with each other (matching peaks).
-
-    Parameters
-    ----------
-    psm: psm_utils.PSM
-        The PSM used to extract 'spectrum_id' (for MGF spectrum extraction)
-        and 'Peptidoform' (the peptide sequence)
-    observed_mz: List[float]
-        List of observed mz values with matching order as observed intensity
-    observed_intensity: List[float]
-        List of observed intensity values
-    fragment_tol_mass: int
-        The allowed tolerance to match peaks
-    fragment_tol_mode: str
-        'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton.
-    Return
-    ------
-    int
-        The hyperscore
+    Calculate the hyperscore for a peptide-spectrum match.
+    Parameters:
+        n_y (int): Number of matched y-ions.
+        n_b (int): Number of matched b-ions.
+        y_ion_intensities (list): Intensities of matched y-ions.
+        b_ion_intensities (list): Intensities of matched b-ions.
+    Returns:
+        float: Calculated hyperscore.
     """
-    if fragment_tol_mode == "ppm":
-        fragment_mass_tolerance_unit_ppm = True
-    elif fragment_tol_mode == "Da":
-        fragment_mass_tolerance_unit_ppm = False
-    else:
-        raise Exception(
-            "fragment_tol_mode can only take 'Da' or 'ppm'. {} was provided.".format(
-                fragment_tol_mode
-            )
-        )
-    if len(observed_intensity) == 0:
-        logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.")
-        return
-
-    theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform)
-    # This is mainly the cause of the modification not being allowed according to pyopenms
-    # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids
-    # according to the unimod database
-    if theoretical_spectrum is None:
-        logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}')
-        return
-
-    observed_spectrum_oms = oms.MSSpectrum()
-    observed_spectrum_oms.set_peaks([observed_mz, observed_intensity])
-    hyperscore = oms.HyperScore()
-    result = hyperscore.compute(
-        fragment_mass_tolerance=fragment_tol_mass,
-        fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm,
-        exp_spectrum=observed_spectrum_oms,
-        theo_spectrum=theoretical_spectrum,
-    )
-    return result
-
-
-###########################
-### FRAG SITE FUNCTIONS ###
-###########################
+    # Calculate the product of y-ion and b-ion intensities
+    product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1
+    product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1
+ 
+    # Calculate factorial using custom function
+    factorial_y = factorial(n_y)
+    factorial_b = factorial(n_b)
+ 
+    # Compute hyperscore
+    hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b))
+    return hyperscore
+
+def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):
+    ion = frag.ion
+
+    is_allowed = False
+    for allowed_ion_type in allow_ion_types:
+        if allowed_ion_type in ion:
+            is_allowed=True
+            break
+    
+    if not is_allowed:
+        return False
+    # Radicals
+    if "·" in ion:
+        return False
+    if frag.neutral_loss is not None:
+        return False
+    if frag.charge > 2:
+        return False
+    
+    return ion[0]
+
+def get_by_fragments(annotated_spectrum):
+    b_intensities = []
+    y_intensities = []
+    for peak in annotated_spectrum:
+        
+        for fragment in peak.annotation:
+
+            ion_type = infer_fragment_identity(fragment)
+            
+            if ion_type == 'b':
+                b_intensities.append(peak.intensity)
+            if ion_type == 'y':
+                y_intensities.append(peak.intensity)
+    return b_intensities, y_intensities
\ No newline at end of file

From 986c5f612bbd46d4f5f8f25e5d95425cde470e31 Mon Sep 17 00:00:00 2001
From: ArthurDeclercq <arre_declercq@hotmail.com>
Date: Fri, 22 Nov 2024 16:00:42 +0100
Subject: [PATCH 23/23] fix spectrum_id accession

---
 ms2rescore/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 33b1f0a1..7897b924 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -103,7 +103,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList:
     """
     keep = [None] * len(psm_list)
     original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]])
-    spectrum_indices = np.array([psm.spectrum for psm in psm_list])
+    spectrum_indices = np.array([psm.spectrum_id for psm in psm_list])
     runs = np.array([psm.run for psm in psm_list])
     if "matched_ions_pct" in psm_list[0].rescoring_features:
         matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list]
@@ -116,7 +116,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList:
             keep[i] = True
         else:
             original_matched_ions_pct = np.logical_and.reduce(
-                [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run]
+                [original_hit, spectrum_indices == psm.spectrum_id, runs == psm.run]
             )
             if original_matched_ions_pct > matched_ions[i]:
                 keep[i] = False