From 66fecc988bf51a2e84693b9ea25f626762f6da52 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Fri, 5 Apr 2024 16:36:19 +0200
Subject: [PATCH 1/4] Use Rust (ms2rescore_rs) for parsing spectrum files

---
 ms2rescore/core.py          |   2 +-
 ms2rescore/parse_spectra.py | 139 ++++++++----------------------------
 pyproject.toml              |   5 +-
 tests/test_data/test.mgf    |  13 ++++
 tests/test_parse_spectra.py |  23 ++++++
 5 files changed, 68 insertions(+), 114 deletions(-)
 create mode 100644 tests/test_data/test.mgf
 create mode 100644 tests/test_parse_spectra.py

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index f777d620..725a1d58 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -63,7 +63,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
     if rt_required or im_required:
         logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
-        get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+        get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
 
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
index 9ed199b9..1e197289 100644
--- a/ms2rescore/parse_spectra.py
+++ b/ms2rescore/parse_spectra.py
@@ -3,11 +3,9 @@
 import logging
 import re
 from itertools import chain
-from typing import Dict, Tuple
 
+from ms2rescore_rs import get_precursor_info
 from psm_utils import PSMList
-from pyteomics.mgf import MGF
-from pyteomics.mzml import MzML
 from rich.progress import track
 
 from ms2rescore.exceptions import MS2RescoreError
@@ -16,7 +14,9 @@
 logger = logging.getLogger(__name__)
 
 
-def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
+def get_missing_values(
+    psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
+):
     """Get missing RT/IM features from spectrum file."""
     logger.debug("Extracting missing RT/IM values from spectrum file(s).")
 
@@ -25,113 +25,30 @@ def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
         for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
             psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
             spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
-
-            if spectrum_file.suffix.lower() == ".mzml":
-                rt_dict, im_dict = _parse_values_from_mzml(
-                    spectrum_file, config, run, missing_rt, missing_im
-                )
-            elif spectrum_file.suffix.lower() == ".mgf":
-                rt_dict, im_dict = _parse_values_from_mgf(
-                    spectrum_file, config, run, missing_rt, missing_im
-                )
-
-            for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
-                if value_dict:
-                    try:
-                        psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run]
-                    except KeyError:
-                        raise ParsingError(
-                            f"Could not parse {value} values from spectrum file for run {run}."
-                        )
-
-
-def _parse_values_from_mgf(
-    spectrum_file, config, run, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
-    """
-    Parse retention time and/or ion mobility from an MGF file.
-
-    Notes
-    -----
-    - Extracting values (e.g., ion mobility) according to the Matrix documentation:
-      http://www.matrixscience.com/help/data_file_help.html
-
-    """
-    rt_dict = {}
-    im_dict = {}
-
-    spectrum_id_pattern = re.compile(
-        config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
-    )
-
-    for spectrum in MGF(str(spectrum_file)):
-        matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
-        if missing_rt:
-            try:
-                rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
-            except KeyError:
-                raise ParsingError(
-                    "Could not parse retention time (`rtinseconds`) from spectrum file for "
-                    f"run {run}. Please make sure that the retention time key is present in the "
-                    "spectrum file or disable the relevant feature generator."
-                )
-        if missing_im:
-            try:
-                im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
-            except KeyError:
-                raise ParsingError(
-                    "Could not parse ion mobility (`ion_mobility`) from spectrum file "
-                    f"for run {run}. Please make sure that the ion mobility key is present in the "
-                    "spectrum file or disable the relevant feature generator."
-                )
-
-    return rt_dict, im_dict
-
-
-def _parse_values_from_mzml(
-    spectrum_file, config, run, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
-    """Parse retention time and/or ion mobility from an mzML file."""
-    rt_dict = {}
-    im_dict = {}
-
-    spectrum_id_pattern = re.compile(
-        config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
-    )
-
-    for spectrum in MzML(str(spectrum_file)):
-        matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
-        if missing_rt:
-            try:
-                rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
-            except KeyError:
-                raise ParsingError(
-                    "Could not parse retention time (`scan start time`) from spectrum file for "
-                    f"run {run}. Please make sure that the retention time key is present in the "
-                    "spectrum file or disable the relevant feature generator."
-                )
-        if missing_im:
-            try:
-                im_dict[matched_id] = float(
-                    spectrum["scanList"]["scan"][0]["reverse ion mobility"]
-                )
-            except KeyError:
-                raise ParsingError(
-                    "Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
-                    f"for run {run}. Please make sure that the ion mobility key is present in the "
-                    "spectrum file or disable the relevant feature generator."
-                )
-
-    return rt_dict, im_dict
-
-
-class ParseMGFError(MS2RescoreError):
-    """Error parsing MGF file."""
-
-    pass
-
-
-class ParsingError(MS2RescoreError):
+            precursors = get_precursor_info(str(spectrum_file))
+
+            if config["spectrum_id_pattern"]:
+                spectrum_id_pattern = re.compile(config["spectrum_id_pattern"])
+                precursors = {
+                    spectrum_id_pattern.search(spectrum_id).group(1): precursor
+                    for spectrum_id, precursor in precursors.items()
+                }
+
+            for psm in psm_list_run:
+                try:
+                    if rt_required:
+                        psm.retention_time = precursors[psm.spectrum_id].rt
+                    if im_required:
+                        psm.ion_mobility = precursors[psm.spectrum_id].im
+                    if not psm.precursor_mz:
+                        psm.precursor_mz = precursors[psm.spectrum_id].mz
+                except KeyError as e:
+                    raise SpectrumParsingError(
+                        f"Could not extract missing RT/IM values from spectrum file for run {run}."
+                    ) from e
+
+
+class SpectrumParsingError(MS2RescoreError):
     """Error parsing retention time from spectrum file."""
 
     pass
diff --git a/pyproject.toml b/pyproject.toml
index 8f135149..18bbb8be 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,8 +32,9 @@ classifiers = [
 dynamic = ["version"]
 requires-python = ">=3.8"
 dependencies = [
+    "ms2rescore_rs",
     "numpy>=1.16.0; python_version != '3.11'",
-    "numpy==1.24.3; python_version == '3.11'",  # Incompatibility with sklearn, pygam, and TF...
+    "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
     "pandas>=1.0",
     "rich>=12",
     "pyteomics>=4.1.0",
@@ -47,7 +48,7 @@ dependencies = [
     "psm_utils>=0.4",
     "customtkinter>=5,<6",
     "mokapot>=0.9",
-    "pydantic>=1.8.2,<2",                # Fix compatibility with v2 in psm_utils
+    "pydantic>=1.8.2,<2",                      # Fix compatibility with v2 in psm_utils
     "jinja2>=3",
     "plotly>=5",
 ]
diff --git a/tests/test_data/test.mgf b/tests/test_data/test.mgf
new file mode 100644
index 00000000..e4899c08
--- /dev/null
+++ b/tests/test_data/test.mgf
@@ -0,0 +1,13 @@
+BEGIN IONS
+TITLE=peptide: peptide1
+CHARGE=2+
+PEPMASS=475.137295
+ION_MOBILITY=42.42
+RTINSECONDS=51.2
+72.04439 100
+148.06043 600
+232.07504 300
+263.08737 400
+347.10198 500
+423.11802 200
+END IONS
diff --git a/tests/test_parse_spectra.py b/tests/test_parse_spectra.py
new file mode 100644
index 00000000..4dffc9dc
--- /dev/null
+++ b/tests/test_parse_spectra.py
@@ -0,0 +1,23 @@
+import pytest
+from psm_utils import PSM, PSMList
+
+from ms2rescore.parse_spectra import get_missing_values
+
+
+def test_get_missing_values():
+    psm_list = PSMList(
+        psm_list=[
+            PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"),
+        ]
+    )
+    get_missing_values(
+        psm_list,
+        config={
+            "spectrum_path": "tests/test_data/test.mgf",
+            "spectrum_id_pattern": "peptide: (.*)",
+        },
+        rt_required=True,
+        im_required=True,
+    )
+    assert psm_list[0].retention_time == pytest.approx(0.853, 0.001)
+    assert psm_list[0].ion_mobility == pytest.approx(42.42, 0.01)

From c5677a3f013b332b22679d46aac258dc8ec898e6 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Sun, 7 Apr 2024 18:56:11 +0200
Subject: [PATCH 2/4] Use single quotes instead of backticks when logging
 strings

---
 ms2rescore/parse_psms.py    | 15 +++++++--------
 ms2rescore/parse_spectra.py |  5 ++---
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index b30539f3..a9855fda 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -1,6 +1,5 @@
 import logging
 import re
-from itertools import chain
 from typing import Dict, Union
 
 import psm_utils.io
@@ -60,9 +59,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
 
     if config["psm_id_pattern"]:
         pattern = re.compile(config["psm_id_pattern"])
-        logger.debug("Applying `psm_id_pattern`...")
+        logger.debug("Applying 'psm_id_pattern'...")
         logger.debug(
-            f"Parsing `{psm_list['spectrum_id'][0]}` to `{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`"
+            f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
         )
         new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
         psm_list["spectrum_id"] = new_ids
@@ -86,7 +85,7 @@ def _read_psms(config, psm_list):
         valid_psms = 0
         for psm_file in config["psm_file"]:
             logger.info(
-                f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
+                f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
             )
             try:
                 id_file_psm_list = psm_utils.io.read_file(
@@ -97,8 +96,8 @@ def _read_psms(config, psm_list):
                 )
             except psm_utils.io.PSMUtilsIOException:
                 raise MS2RescoreConfigurationError(
-                    "Error occurred while reading PSMs. Please check the `psm_file` and "
-                    "`psm_file_type` settings. See "
+                    "Error occurred while reading PSMs. Please check the 'psm_file' and "
+                    "'psm_file_type' settings. See "
                     "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
                     " for more information."
                 )
@@ -129,7 +128,7 @@ def _find_decoys(config, psm_list):
     if not any(psm_list["is_decoy"]):
         raise MS2RescoreConfigurationError(
             "No decoy PSMs found. Please check if decoys are present in the PSM file and that "
-            "the `id_decoy_pattern` option is correct. See "
+            "the 'id_decoy_pattern' option is correct. See "
             "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
             " for more information."
         )
@@ -150,7 +149,7 @@ def _match_psm_ids(old_id, regex_pattern):
         return match[1]
     except (TypeError, IndexError):
         raise MS2RescoreConfigurationError(
-            f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})."
+            f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})."
             " Ensure that the regex contains a capturing group?"
         )
 
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
index 1e197289..17b35d60 100644
--- a/ms2rescore/parse_spectra.py
+++ b/ms2rescore/parse_spectra.py
@@ -18,13 +18,12 @@ def get_missing_values(
     psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
 ):
     """Get missing RT/IM features from spectrum file."""
-    logger.debug("Extracting missing RT/IM values from spectrum file(s).")
-
     psm_dict = psm_list.get_psm_dict()
     for runs in psm_dict.values():
-        for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
+        for run, psms in runs.items():
             psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
             spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
+            logger.debug("Reading spectrum file: '%s'", spectrum_file)
             precursors = get_precursor_info(str(spectrum_file))
 
             if config["spectrum_id_pattern"]:

From ed6423b29ba81a3cbb6d43f60a2a8859acb64719 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Sun, 7 Apr 2024 18:56:47 +0200
Subject: [PATCH 3/4] mokapot: Fix writing of FlashLFQ output when PSMList run
 field is empty

---
 ms2rescore/rescoring_engines/mokapot.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
index f3927f47..cc7a336f 100644
--- a/ms2rescore/rescoring_engines/mokapot.py
+++ b/ms2rescore/rescoring_engines/mokapot.py
@@ -171,6 +171,10 @@ def convert_psm_list(
     feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
     combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)
 
+    # FlashLFQ txt output needs a filename: if any run is missing, set all runs to "ms_run"
+    if not combined_df["run"].notnull().all():
+        combined_df["run"] = "ms_run"
+
     feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None
 
     lin_psm_data = LinearPsmDataset(

From dc866495e18a33ee0dfabf58e4785ed7d7d1ec25 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Sun, 7 Apr 2024 18:57:24 +0200
Subject: [PATCH 4/4] Reactivate tests in CI

---
 .github/workflows/test.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 339aade3..dbe1019b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -37,9 +37,10 @@ jobs:
         run: |
           pip install .[dev]
 
-      # - name: Test with pytest
-      #   run: |
-      #     pytest
+      - name: Test with pytest
+        run: |
+          pytest
+
       - name: Test installation
         run: |
           ms2rescore --help