Merge branch 'main' of https://github.com/compomics/ms2rescore into t…

…imsRescore
compomics · Apr 8, 2024 · 86ba7b6 · 86ba7b6
2 parents 17a4e12 + 1fe9d9e
commit 86ba7b6
Show file tree

Hide file tree

Showing 12 changed files with 124 additions and 144 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -33,9 +33,10 @@ jobs:
         run: |
           pip install .[dev]
 
-      # - name: Test with pytest
-      #   run: |
-      #     pytest
+      - name: Test with pytest
+        run: |
+          pytest
+
       - name: Test installation
         run: |
           ms2rescore --help

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
@@ -8,7 +8,7 @@
 
 from ms2rescore.feature_generators import FEATURE_GENERATORS
 from ms2rescore.parse_psms import parse_psms
-from ms2rescore.parse_spectra import fill_missing_values
+from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
 from ms2rescore import exceptions
@@ -69,7 +69,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
 
     if rt_required or im_required:
         logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
-        fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+        get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
 
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
@@ -163,7 +163,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
                 output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
             )
         except exceptions.ReportGenerationError as e:
-            logger.error(e)
+            logger.exception(e)
 
 
 def _write_feature_names(feature_names, output_file_root):

diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py
@@ -53,6 +53,7 @@ def __init__(
         ms2_tolerance: float = 0.02,
         spectrum_path: Optional[str] = None,
         spectrum_id_pattern: str = "(.*)",
+        model_dir: Optional[str] = None,
         processes: 1,
         **kwargs,
     ) -> None:
@@ -71,6 +72,8 @@ def __init__(
         spectrum_id_pattern : str, optional
             Regular expression pattern to extract spectrum ID from spectrum file. Defaults to
             :py:const:`.*`.
+        model_dir
+            Directory containing MS²PIP models. Defaults to :py:const:`None` (use MS²PIP default).
         processes : int, optional
             Number of processes to use. Defaults to 1.
 
@@ -85,6 +88,7 @@ def __init__(
         self.ms2_tolerance = ms2_tolerance
         self.spectrum_path = spectrum_path
         self.spectrum_id_pattern = spectrum_id_pattern
+        self.model_dir = model_dir
         self.processes = processes
 
     @property
@@ -194,6 +198,7 @@ def add_features(self, psm_list: PSMList) -> None:
                         model=self.model,
                         ms2_tolerance=self.ms2_tolerance,
                         compute_correlations=False,
+                        model_dir=self.model_dir,
                         processes=self.processes,
                     )
                 except NoMatchingSpectraFound as e:

diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
@@ -1,6 +1,5 @@
 import logging
 import re
-from itertools import chain
 from typing import Dict, Union
 
 import psm_utils.io
@@ -64,10 +63,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
 
     if config["psm_id_pattern"]:
         pattern = re.compile(config["psm_id_pattern"])
-        logger.debug("Applying `psm_id_pattern`...")
+        logger.debug("Applying 'psm_id_pattern'...")
         logger.debug(
-            f"Parsing `{psm_list['spectrum_id'][0]}` to "
-            f"`{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`"
+            f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
         )
         new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
         psm_list["spectrum_id"] = new_ids
@@ -91,7 +89,7 @@ def _read_psms(config, psm_list):
         valid_psms = 0
         for psm_file in config["psm_file"]:
             logger.info(
-                f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
+                f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
             )
             try:
                 id_file_psm_list = psm_utils.io.read_file(
@@ -102,8 +100,8 @@ def _read_psms(config, psm_list):
                 )
             except psm_utils.io.PSMUtilsIOException:
                 raise MS2RescoreConfigurationError(
-                    "Error occurred while reading PSMs. Please check the `psm_file` and "
-                    "`psm_file_type` settings. See "
+                    "Error occurred while reading PSMs. Please check the 'psm_file' and "
+                    "'psm_file_type' settings. See "
                     "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
                     " for more information."
                 )
@@ -134,7 +132,7 @@ def _find_decoys(config, psm_list):
     if not any(psm_list["is_decoy"]):
         raise MS2RescoreConfigurationError(
             "No decoy PSMs found. Please check if decoys are present in the PSM file and that "
-            "the `id_decoy_pattern` option is correct. See "
+            "the 'id_decoy_pattern' option is correct. See "
             "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
             " for more information."
         )
@@ -155,7 +153,7 @@ def _match_psm_ids(old_id, regex_pattern):
         return match[1]
     except (TypeError, IndexError):
         raise MS2RescoreConfigurationError(
-            f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})."
+            f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})."
             " Ensure that the regex contains a capturing group?"
         )
 

diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
@@ -3,142 +3,50 @@
 import logging
 import re
 from itertools import chain
-from typing import Dict, Tuple
 
+from ms2rescore_rs import get_precursor_info
 from psm_utils import PSMList
-from pyteomics.mgf import IndexedMGF
-from pyteomics.mzml import PreIndexedMzML
-from rich.progress import track
 
 from ms2rescore.exceptions import MS2RescoreError
 from ms2rescore.utils import infer_spectrum_path
 
 logger = logging.getLogger(__name__)
 
 
-def fill_missing_values(config, psm_list, missing_rt=False, missing_im=False):
+def get_missing_values(
+    psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
+):
     """Get missing RT/IM features from spectrum file."""
-    logger.debug("Extracting missing RT/IM values from spectrum file(s).")
-
     psm_dict = psm_list.get_psm_dict()
     for runs in psm_dict.values():
-        for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
+        for run, psms in runs.items():
             psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
             spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
-
-            if missing_im or missing_rt:
-                if spectrum_file.suffix.lower() == ".mzml":
-                    _parse_values_from_mzml(
-                        psm_list_run, spectrum_file, config, missing_rt, missing_im
-                    )
-                elif spectrum_file.suffix.lower() == ".mgf":
-                    _parse_values_from_mgf(
-                        psm_list_run, spectrum_file, config, missing_rt, missing_im
-                    )
-                else:
-                    raise MS2RescoreError(
-                        f"Could not parse retention time and/or ion mobility from spectrum file for run {run}. "
-                        "Please make sure that the spectrum file is either in mzML or MGF format."
-                    )
-
-
-def _parse_values_from_mgf(
-    psm_list_run, spectrum_file, config, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
-    """Parse retention time and/or ion mobility from an mzML file."""
-
-    mgf = IndexedMGF(str(spectrum_file))
-    spectrum_id_pattern = re.compile(
-        config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
-    )
-
-    try:
-        mapper = {
-            spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
-            for spectrum_id in mgf._offset_index.mapping["spectrum"].keys()
-        }
-    except AttributeError:
-        raise ParseMGFError(
-            "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
-        )
-
-    spectra = {spectrum_id: mgf.get_by_id(spectrum_id) for spectrum_id in mapper.values()}
-
-    for psm in psm_list_run:
-        spectrum = spectra.get(mapper[psm.spectrum_id])
-        if spectrum is None:
-            raise ParsingError(f"Could not find spectrum with ID {psm.spectrum_id} in MGF file.")
-
-        if missing_rt and "params" in spectrum and "rtinseconds" in spectrum["params"]:
-            psm.retention_time = float(spectrum["params"]["rtinseconds"])
-
-        if missing_im and "params" in spectrum and "ion_mobility" in spectrum["params"]:
-            psm.ion_mobility = float(spectrum["params"]["ion_mobility"])
-
-
-def _parse_values_from_mzml(
-    psm_list_run, spectrum_file, config, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
-    """Parse retention time and/or ion mobility from an mzML file."""
-
-    mzml = PreIndexedMzML(str(spectrum_file))
-    spectrum_id_pattern = re.compile(
-        config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
-    )
-
-    try:
-        mapper = {
-            spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
-            for spectrum_id in mzml._offset_index.mapping["spectrum"].keys()
-        }
-    except AttributeError as e:
-        raise ParseMGFError(
-            "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
-        ) from e
-
-    spectra = {spectrum_id: mzml.get_by_id(spectrum_id) for spectrum_id in mapper.values()}
-
-    for psm in psm_list_run:
-        spectrum = spectra.get(mapper[psm.spectrum_id])
-        if spectrum is None:
-            raise ParsingError(f"Could not find spectrum with ID {psm.spectrum_id} in mzML file.")
-
-        if (
-            missing_rt
-            and "scanList" in spectrum
-            and "scan" in spectrum["scanList"]
-            and spectrum["scanList"]["scan"]
-        ):
-            psm.retention_time = float(spectrum["scanList"]["scan"][0].get("scan start time", 0))
-
-        if missing_im:
-            if (
-                "precursorList" in spectrum
-                and "precursor" in spectrum["precursorList"]
-                and spectrum["precursorList"]["precursor"]
-            ):
-                psm.ion_mobility = float(
-                    spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][
-                        0
-                    ].get("inverse reduced ion mobility", 0)
-                )
-            elif (
-                "scanList" in spectrum
-                and "scan" in spectrum["scanList"]
-                and spectrum["scanList"]["scan"]
-            ):
-                psm.ion_mobility = float(
-                    spectrum["scanList"]["scan"][0].get("reverse ion mobility", 0)
-                )
-
-
-class ParseMGFError(MS2RescoreError):
-    """Error parsing MGF file."""
-
-    pass
-
-
-class ParsingError(MS2RescoreError):
+            logger.debug("Reading spectrum file: '%s'", spectrum_file)
+            precursors = get_precursor_info(str(spectrum_file))
+
+            if config["spectrum_id_pattern"]:
+                spectrum_id_pattern = re.compile(config["spectrum_id_pattern"])
+                precursors = {
+                    spectrum_id_pattern.search(spectrum_id).group(1): precursor
+                    for spectrum_id, precursor in precursors.items()
+                }
+
+            for psm in psm_list_run:
+                try:
+                    if rt_required:
+                        psm.retention_time = precursors[psm.spectrum_id].rt
+                    if im_required:
+                        psm.ion_mobility = precursors[psm.spectrum_id].im
+                    if not psm.precursor_mz:
+                        psm.precursor_mz = precursors[psm.spectrum_id].mz
+                except KeyError as e:
+                    raise SpectrumParsingError(
+                        f"Could not extract missing RT/IM values from spectrum file for run {run}."
+                    ) from e
+
+
+class SpectrumParsingError(MS2RescoreError):
     """Error parsing retention time from spectrum file."""
 
     pass
diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py
@@ -214,6 +214,14 @@ def score_scatter_plot(
         Column with index for each PSM, peptide, or protein to use for merging data frames.
 
     """
+    if not before or not after:
+        figure = go.Figure()
+        figure.add_annotation(
+            text="No data available for comparison.",
+            showarrow=False,
+        )
+        return figure
+
     # Restructure data
     merge_columns = [indexer, "mokapot score", "mokapot q-value", "mokapot PEP"]
     ce_psms_targets = pd.merge(
@@ -288,6 +296,14 @@ def fdr_plot_comparison(
         Column with index for each PSM, peptide, or protein to use for merging dataframes.
 
     """
+    if not before or not after:
+        figure = go.Figure()
+        figure.add_annotation(
+            text="No data available for comparison.",
+            showarrow=False,
+        )
+        return figure
+
     # Prepare data
     ce_psms_targets_melted = (
         pd.merge(
@@ -336,7 +352,7 @@ def fdr_plot_comparison(
 def identification_overlap(
     before: mokapot.LinearConfidence,
     after: mokapot.LinearConfidence,
-) -> go.Figure():
+) -> go.Figure:
     """
     Plot stacked bar charts of removed, retained, and gained PSMs, peptides, and proteins.
 
@@ -348,6 +364,14 @@ def identification_overlap(
         Mokapot linear confidence results after rescoring.
 
     """
+    if not before or not after:
+        figure = go.Figure()
+        figure.add_annotation(
+            text="No data available for comparison.",
+            showarrow=False,
+        )
+        return figure
+
     levels = before.levels  # ["psms", "peptides", "proteins"] if all available
     indexers = ["index", "index", "mokapot protein group"]
 

diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py
@@ -166,6 +166,11 @@ def _get_stats_context(confidence_before, confidence_after):
     levels = ["psms", "peptides", "proteins"]
     level_names = ["PSMs", "Peptides", "Protein groups"]
     card_colors = ["card-bg-blue", "card-bg-green", "card-bg-red"]
+
+    # Cannot report stats if confidence estimates are not present
+    if not confidence_before or not confidence_after:
+        return stats
+
     for level, level_name, card_color in zip(levels, level_names, card_colors):
         try:
             before = confidence_before.accepted[level.lower()]

diff --git a/ms2rescore/report/utils.py b/ms2rescore/report/utils.py
@@ -80,10 +80,7 @@ def get_confidence_estimates(
 
         try:
             confidence[when] = lin_psm_dataset.assign_confidence()
-        except RuntimeError as e:
-            raise ReportGenerationError(
-                f"Error while assigning confidence estimates to PSMs ({when} rescoring). "
-                "Could not generate report."
-            ) from e
+        except RuntimeError:
+            confidence[when] = None
 
     return confidence["before"], confidence["after"]
diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
@@ -171,6 +171,10 @@ def convert_psm_list(
     feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
     combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)
 
+    # Ensure filename for FlashLFQ txt output
+    if not combined_df["run"].notnull().all():
+        combined_df["run"] = "ms_run"
+
     feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None
 
     lin_psm_data = LinearPsmDataset(