diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c03b1667..1137ef58 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -14,7 +14,7 @@ jobs:
- uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: "3.11"
@@ -29,7 +29,7 @@ jobs:
- name: Test built package
run: |
- pip install dist/ms2rescore-*.whl
+ pip install --only-binary :all: dist/ms2rescore-*.whl
# pytest
ms2rescore --help
@@ -47,14 +47,14 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install package and dependencies
run: |
python -m pip install --upgrade pip
- pip install . pyinstaller
+ pip install --only-binary :all: . pyinstaller
- name: Install Inno Setup
uses: crazy-max/ghaction-chocolatey@v3
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dbe1019b..8830c6e0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,18 +24,14 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install flake8
+ pip install ruff
- - name: Lint with flake8
- run: |
- # stop the build if there are Python syntax errors or undefined names
- flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
- # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
- flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+ - name: Run Ruff
+ run: ruff check --output-format=github .
- name: Build and install ms2rescore package
run: |
- pip install .[dev]
+ pip install --only-binary :all: .[dev]
- name: Test with pytest
run: |
diff --git a/Dockerfile b/Dockerfile
index 474f6f5d..7ac34e15 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,10 @@
-FROM ubuntu:focal
+FROM python:3.11
+
+# ARG DEBIAN_FRONTEND=noninteractive
LABEL name="ms2rescore"
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore
+# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore
ADD pyproject.toml /ms2rescore/pyproject.toml
ADD LICENSE /ms2rescore/LICENSE
@@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in
ADD ms2rescore /ms2rescore/ms2rescore
RUN apt-get update \
- && apt-get install --no-install-recommends -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \
- && rm -rf /var/lib/apt/lists/* \
- && pip3 install ms2rescore/
+    && apt-get install --no-install-recommends -y procps \
+ && pip install /ms2rescore --only-binary :all:
ENTRYPOINT [""]
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
index 2b523aa5..209bc009 100644
--- a/docs/source/config_schema.md
+++ b/docs/source/config_schema.md
@@ -10,6 +10,7 @@
- **`deeplc`**: Refer to *[#/definitions/deeplc](#definitions/deeplc)*.
- **`maxquant`**: Refer to *[#/definitions/maxquant](#definitions/maxquant)*.
- **`ionmob`**: Refer to *[#/definitions/ionmob](#definitions/ionmob)*.
+ - **`im2deep`**: Refer to *[#/definitions/im2deep](#definitions/im2deep)*.
- **`rescoring_engine`** *(object)*: Rescoring engine to use and its configuration. Leave empty to skip rescoring and write features to file. Default: `{"mokapot": {}}`.
- **`.*`**: Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`percolator`**: Refer to *[#/definitions/percolator](#definitions/percolator)*.
@@ -47,7 +48,17 @@
- **One of**
- *string*
- *null*
+ - **`psm_id_rt_pattern`**: Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group. Default: `null`.
+ - **One of**
+ - *string*
+ - *null*
+ - **`psm_id_im_pattern`**: Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group. Default: `null`.
+ - **One of**
+ - *string*
+ - *null*
- **`lower_score_is_better`** *(boolean)*: Bool indicating if lower score is better. Default: `false`.
+ - **`max_psm_rank_input`** *(number)*: Maximum rank of PSMs to use as input for rescoring. Minimum: `1`. Default: `10`.
+ - **`max_psm_rank_output`** *(number)*: Maximum rank of PSMs to return after rescoring, before final FDR calculation. Minimum: `1`. Default: `1`.
- **`modification_mapping`** *(object)*: Mapping of modification labels to each replacement label. Default: `{}`.
- **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Can contain additional properties. Default: `{}`.
- **`processes`** *(number)*: Number of parallel processes to use; -1 for all available. Minimum: `-1`. Default: `-1`.
@@ -57,6 +68,7 @@
- *string*
- *null*
- **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`.
+ - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`.
## Definitions
- **`feature_generator`** *(object)*: Feature generator configuration. Can contain additional properties.
@@ -75,7 +87,10 @@
- **`ionmob_model`** *(string)*: Path to Ionmob model directory. Default: `"GRUPredictor"`.
- **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`.
+- **`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*.
+ - **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
+ - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`.
- **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
- **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
- **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`.
diff --git a/docs/source/userguide/configuration.rst b/docs/source/userguide/configuration.rst
index 8b68ea45..e22b3ec1 100644
--- a/docs/source/userguide/configuration.rst
+++ b/docs/source/userguide/configuration.rst
@@ -240,6 +240,65 @@ expression pattern that extracts the decoy status from the protein name:
decoy_pattern = "DECOY_"
+Multi-rank rescoring
+====================
+
+Some search engines, such as MaxQuant, report multiple candidate PSMs for the same spectrum.
+MS²Rescore can rescore multiple candidate PSMs per spectrum. This allows for lower-ranking
+candidate PSMs to become the top-ranked PSM after rescoring. This behavior can be controlled with
+the ``max_psm_rank_input`` option.
+
+To ensure a correct FDR control after rescoring, MS²Rescore filters out lower-ranking PSMs before
+final FDR calculation and writing the output files. To allow for lower-ranking PSMs to be included
+in the final output - for instance, to consider chimeric spectra - the ``max_psm_rank_output``
+option can be used.
+
+For example, to rescore the top 5 PSMs per spectrum and output the best PSM after rescoring,
+the following configuration can be used:
+
+.. tab:: JSON
+
+ .. code-block:: json
+
+ "max_psm_rank_input": 5
+ "max_psm_rank_output": 1
+
+.. tab:: TOML
+
+ .. code-block:: toml
+
+ max_psm_rank_input = 5
+ max_psm_rank_output = 1
+
+
+Configuring rescoring engines
+=============================
+
+MS²Rescore supports multiple rescoring engines, such as Mokapot and Percolator. The rescoring
+engine can be selected and configured with the ``rescoring_engine`` option. For example, to use
+Mokapot with a custom train_fdr of 0.1%, the following configuration can be used:
+
+.. tab:: JSON
+
+ .. code-block:: json
+
+      "rescoring_engine": {
+        "mokapot": { "train_fdr": 0.001 }
+      }
+
+
+.. tab:: TOML
+
+ .. code-block:: toml
+
+ [ms2rescore.rescoring_engine.mokapot]
+ train_fdr = 0.001
+
+
+All options for the rescoring engines can be found in the :ref:`ms2rescore.rescoring_engines`
+section.
+
+
All configuration options
=========================
diff --git a/docs/source/userguide/input-files.rst b/docs/source/userguide/input-files.rst
index 08129b7f..b556b520 100644
--- a/docs/source/userguide/input-files.rst
+++ b/docs/source/userguide/input-files.rst
@@ -5,16 +5,13 @@ Input files
PSM file(s)
===========
-The peptide-spectrum match (PSM) file is generally the output from a proteomics search engine.
-This file serves as the main input to MS²Rescore. One or multiple PSM files can be provided at
-once. Note that merging PSMs from different MS runs could have an impact on the correctness of
-the FDR control.
+The **peptide-spectrum match (PSM) file** is generally the output from a proteomics search engine.
+This file serves as the main input to MS²Rescore.
-Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option.
-Check the list of :py:mod:`psm_utils` tags in the
-:external+psm_utils:ref:`supported file formats ` section. Depending on the
-file extension, the file type can also be inferred from the file name. In that case,
-``psm_file_type`` option can be set to ``infer``.
+The PSM file should contain **all putative identifications** made by the search engine, including
+both target and decoy PSMs. Ensure that the search engine was configured to include decoy entries
+in the search database and was operated with **target-decoy competition** enabled (i.e.,
+considering both target and decoy sequences simultaneously during the search).
.. attention::
As a general rule, MS²Rescore always needs access to **all target and decoy PSMs, without any
@@ -22,6 +19,17 @@ file extension, the file type can also be inferred from the file name. In that c
set to 100%.
+One or multiple PSM files can be provided at once. Note that merging PSMs from different MS runs
+could have an impact on the correctness of the FDR control. Combining multiple PSM files should
+generally only be done for LC-fractionated mass spectrometry runs.
+
+Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option.
+Check the list of :py:mod:`psm_utils` tags in the
+:external+psm_utils:ref:`supported file formats ` section. Depending on the
+file extension, the file type can also be inferred from the file name. In that case,
+``psm_file_type`` option can be set to ``infer``.
+
+
Spectrum file(s)
================
diff --git a/docs/source/userguide/output-files.rst b/docs/source/userguide/output-files.rst
index ee1a78f8..89c0363b 100644
--- a/docs/source/userguide/output-files.rst
+++ b/docs/source/userguide/output-files.rst
@@ -52,8 +52,8 @@ Rescoring engine files:
| ``..weights.txt`` | Feature weights, showing feature usage in the rescoring run |
+-------------------------------------------------------------+-------------------------------------------------------------+
-If no rescoring engine is selected (or if Percolator was selected), the following files will also
-be written:
+If no rescoring engine is selected, if Percolator was selected, or in DEBUG mode, the following
+files will also be written:
+-------------------------------------------------------------+-----------------------------------------------------------+
| File | Description |
diff --git a/docs/source/userguide/tims2Rescore.rst b/docs/source/userguide/tims2Rescore.rst
new file mode 100644
index 00000000..f9b9096f
--- /dev/null
+++ b/docs/source/userguide/tims2Rescore.rst
@@ -0,0 +1,61 @@
+.. _timsrescore:
+
+TIMS²Rescore User Guide
+=======================
+
+Introduction
+------------
+
+The `TIMS²Rescore` tool is a DDA-PASEF adapted version of `ms2rescore` that allows users to perform rescoring of peptide-spectrum matches (PSMs) acquired on Bruker instruments. This guide provides an overview of how to use `timsrescore` in `ms2rescore` effectively.
+
+Installation
+------------
+
+Before using `timsrescore`, ensure that you have `ms2rescore` installed on your system. You can install `ms2rescore` using the following command:
+
+.. code-block:: bash
+
+ pip install ms2rescore
+
+Usage
+-----
+
+To use `timsrescore`, follow these steps:
+
+1. Prepare your input files:
+   - Ensure that you have the necessary input files, including the PSM file and spectrum files.
+   - Make sure that the PSM file is in a format from a supported search engine or a standard format like .mzid (:external+psm_utils:ref:`supported file formats`).
+ - Spectrum files can directly be given as .d or minitdf files from Bruker instruments or first converted to .mzML format.
+
+2. Run `timsrescore`:
+ - Open a terminal or command prompt.
+ - Navigate to the directory where your input files are located.
+ - Execute the following command:
+
+ .. code-block:: bash
+
+      timsrescore -p <psm_file> -s <spectrum_file> -o <output_name>
+
+   Replace ``<psm_file>``, ``<spectrum_file>``, and ``<output_name>`` with the actual paths to your input and output files.
+   **Note:** By default, timsTOF-specific models will be used for predictions. Optionally, you can further configure settings through a configuration file. For more information on configuring `timsrescore`, refer to the :doc:`configuration` tab in the user guide.
+
+3. Review the results:
+   - Once the `timsrescore` process completes, you will find the rescoring results in the specified output file or, if not specified, in the same directory as the input files.
+   - If you want a detailed overview of the performance, you can either set `write_report` to `True` in the configuration file, use the `--write-report` option in the command line, or run the following command:
+
+ .. code-block:: bash
+
+      ms2rescore-report <output_prefix>
+
+   Replace ``<output_prefix>`` with the actual output prefix of the result files.
+
+Additional Options
+------------------
+
+`ms2rescore` provides additional options to customize the `timsrescore` process. You can explore these options by running the following command:
+
+.. code-block:: bash
+
+ timsrescore --help
+
+
diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py
index 16f7dfd4..606a1c8a 100644
--- a/ms2rescore/__init__.py
+++ b/ms2rescore/__init__.py
@@ -1,6 +1,6 @@
"""MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
-__version__ = "3.0.3"
+__version__ = "3.1.0-dev9"
from warnings import filterwarnings
diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py
index 4cac9122..70e384c2 100644
--- a/ms2rescore/__main__.py
+++ b/ms2rescore/__main__.py
@@ -1,6 +1,9 @@
"""MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs."""
import argparse
+import cProfile
+import importlib.resources
+import json
import logging
import sys
from pathlib import Path
@@ -10,7 +13,7 @@
from rich.logging import RichHandler
from rich.text import Text
-from ms2rescore import __version__
+from ms2rescore import __version__, package_data
from ms2rescore.config_parser import parse_configurations
from ms2rescore.core import rescore
from ms2rescore.exceptions import MS2RescoreConfigurationError
@@ -33,19 +36,26 @@
CONSOLE = Console(record=True)
-def _print_credits():
+def _print_credits(tims=False):
"""Print software credits to terminal."""
text = Text()
text.append("\n")
- text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
+ if tims:
+ text.append("TIMS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
+ else:
+ text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore")
text.append(f" (v{__version__})\n", style="bold")
+ if tims:
+ text.append("MS²Rescore tuned for Bruker timsTOF instruments.\n", style="italic")
text.append("Developed at CompOmics, VIB / Ghent University, Belgium.\n")
text.append("Please cite: ")
text.append(
- "Declercq et al. MCP (2022)", style="link https://doi.org/10.1016/j.mcpro.2022.100266"
+ "Buur & Declercq et al. JPR (2024)",
+ style="link https://doi.org/10.1021/acs.jproteome.3c00785",
)
text.append("\n")
- text.stylize("cyan")
+ if tims:
+ text.stylize("#006cb5")
CONSOLE.print(text)
@@ -130,6 +140,21 @@ def _argument_parser() -> argparse.ArgumentParser:
dest="fasta_file",
help="path to FASTA file",
)
+ parser.add_argument(
+ "--write-report",
+ # metavar="BOOL",
+ action="store_true",
+ dest="write_report",
+        help="boolean to enable writing of an HTML report with various QC metrics and charts",
+ )
+ parser.add_argument(
+ "--profile",
+ # metavar="BOOL",
+ action="store_true",
+ # type=bool,
+ # dest="profile",
+ help="boolean to enable profiling with cProfile",
+ )
return parser
@@ -152,18 +177,42 @@ def _setup_logging(passed_level: str, log_file: Union[str, Path]):
)
-def main():
+def profile(fnc, filepath):
+ """A decorator that uses cProfile to profile a function"""
+
+ def inner(*args, **kwargs):
+ with cProfile.Profile() as profiler:
+ return_value = fnc(*args, **kwargs)
+ profiler.dump_stats(filepath + ".profile.prof")
+ return return_value
+
+ return inner
+
+
+def main_tims():
+ """Run MS²Rescore command-line interface in TIMS²Rescore mode."""
+ main(tims=True)
+
+
+def main(tims=False):
"""Run MS²Rescore command-line interface."""
- _print_credits()
+ _print_credits(tims)
# Parse CLI arguments and configuration file
parser = _argument_parser()
cli_args = parser.parse_args()
+
+ configurations = []
+ if cli_args.config_file:
+ configurations.append(cli_args.config_file)
+ if tims:
+ configurations.append(
+ json.load(importlib.resources.open_text(package_data, "config_default_tims.json"))
+ )
+ configurations.append(cli_args)
+
try:
- if cli_args.config_file:
- config = parse_configurations([cli_args.config_file, cli_args])
- else:
- config = parse_configurations(cli_args)
+ config = parse_configurations(configurations)
except MS2RescoreConfigurationError as e:
LOGGER.critical(e)
sys.exit(1)
@@ -175,7 +224,11 @@ def main():
# Run MS²Rescore
try:
- rescore(configuration=config)
+ if cli_args.profile:
+ profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"])
+ profiled_rescore(configuration=config)
+ else:
+ rescore(configuration=config)
except Exception as e:
LOGGER.exception(e)
sys.exit(1)
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index d1dfe405..170f1038 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -3,15 +3,18 @@
from multiprocessing import cpu_count
from typing import Dict, Optional
+import numpy as np
import psm_utils.io
+from mokapot.dataset import LinearPsmDataset
from psm_utils import PSMList
+from ms2rescore import exceptions
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore import exceptions
+from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
logger = logging.getLogger(__name__)
@@ -44,7 +47,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
psm_list = parse_psms(config, psm_list)
# Log #PSMs identified before rescoring
- id_psms_before = _log_id_psms_before(psm_list)
+ id_psms_before = _log_id_psms_before(psm_list, max_rank=config["max_psm_rank_output"])
# Define feature names; get existing feature names from PSM file
feature_names = dict()
@@ -58,12 +61,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)
- # TODO: avoid hard coding feature generators in some way
- rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
- im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
- if rt_required or im_required:
- logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
- get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
+ # Add missing precursor info from spectrum file if needed
+ psm_list = _fill_missing_precursor_info(psm_list, config)
# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
@@ -107,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
- # If no rescoring engine is specified, write PSMs and features to PIN file
- if not config["rescoring_engine"]:
+ # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file
+ if not config["rescoring_engine"] or config["log_level"] == "debug":
logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin")
psm_utils.io.write_file(
psm_list,
@@ -116,35 +115,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
filetype="percolator",
feature_names=all_feature_names,
)
+
+ if not config["rescoring_engine"]:
+ logger.info("No rescoring engine specified. Skipping rescoring.")
return None
# Rescore PSMs
- if "percolator" in config["rescoring_engine"]:
- percolator.rescore(
- psm_list,
- output_file_root=output_file_root,
- log_level=config["log_level"],
- processes=config["processes"],
- percolator_kwargs=config["rescoring_engine"]["percolator"],
- )
- elif "mokapot" in config["rescoring_engine"]:
- if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
- config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
- if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
- protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
- else:
- protein_kwargs = dict()
-
- mokapot.rescore(
- psm_list,
- output_file_root=output_file_root,
- protein_kwargs=protein_kwargs,
- **config["rescoring_engine"]["mokapot"],
- )
- else:
- logger.info("No known rescoring engine specified. Skipping rescoring.")
+ try:
+ if "percolator" in config["rescoring_engine"]:
+ percolator.rescore(
+ psm_list,
+ output_file_root=output_file_root,
+ log_level=config["log_level"],
+ processes=config["processes"],
+ percolator_kwargs=config["rescoring_engine"]["percolator"],
+ )
+ elif "mokapot" in config["rescoring_engine"]:
+ if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
+ config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
+ if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
+ protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
+ else:
+ protein_kwargs = dict()
+
+ mokapot.rescore(
+ psm_list,
+ output_file_root=output_file_root,
+ protein_kwargs=protein_kwargs,
+ **config["rescoring_engine"]["mokapot"],
+ )
+ except exceptions.RescoringError as e:
+ # Write output
+ logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...")
+ psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
+
+ # Reraise exception
+ raise e
- _log_id_psms_after(psm_list, id_psms_before)
+ # Post-rescoring processing
+ if all(psm_list["pep"] == 1.0):
+ psm_list = _fix_constant_pep(psm_list)
+ psm_list = _filter_by_rank(psm_list, config["max_psm_rank_output"], False)
+ psm_list = _calculate_confidence(psm_list)
+ _ = _log_id_psms_after(psm_list, id_psms_before, max_rank=config["max_psm_rank_output"])
# Write output
logger.info(f"Writing output to {output_file_root}.psms.tsv...")
@@ -160,6 +173,59 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logger.exception(e)
+def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList:
+ """Fill missing precursor info from spectrum file if needed."""
+ # Check if required
+ # TODO: avoid hard coding feature generators in some way
+ rt_required = ("deeplc" in config["feature_generators"]) and any(
+ v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"]
+ )
+ im_required = (
+ "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
+ ) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"])
+ logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
+
+ # Add missing values
+ if rt_required or im_required:
+ logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+ get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)
+
+ # Check if values are now present
+ for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]:
+ if required and (
+ 0.0 in psm_list[value_name]
+ or None in psm_list[value_name]
+ or np.isnan(psm_list[value_name]).any()
+ ):
+ if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]):
+ raise exceptions.MissingValuesError(
+ f"Could not find any '{value_name}' values in PSM or spectrum files. Disable "
+ f"feature generators that require '{value_name}' or ensure that the values are "
+ "present in the input files."
+ )
+ else:
+ missing_value_psms = psm_list[
+ [v is None or np.isnan(v) for v in psm_list[value_name]]
+ ]
+ logger.warning(
+ f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. "
+ "These PSMs will be removed."
+ )
+ psm_list = psm_list[
+ [v is not None and not np.isnan(v) for v in psm_list[value_name]]
+ ]
+
+ return psm_list
+
+
+def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList:
+ """Filter PSMs by rank."""
+ psm_list.set_ranks(lower_score_better=lower_score_better)
+ rank_filter = psm_list["rank"] <= max_rank
+    logger.info(f"Removed {sum(~rank_filter)} PSMs with rank > {max_rank}.")
+ return psm_list[rank_filter]
+
+
def _write_feature_names(feature_names, output_file_root):
"""Write feature names to file."""
with open(output_file_root + ".feature_names.tsv", "w") as f:
@@ -169,25 +235,79 @@ def _write_feature_names(feature_names, output_file_root):
f.write(f"{fgen}\t{feature}\n")
-def _log_id_psms_before(psm_list):
+def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int:
"""Log #PSMs identified before rescoring."""
id_psms_before = (
- (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
+ (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
).sum()
- logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)
+ logger.info(
+ f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before "
+ "rescoring."
+ )
return id_psms_before
-def _log_id_psms_after(psm_list, id_psms_before):
+def _log_id_psms_after(
+ psm_list: PSMList, id_psms_before: int, fdr: float = 0.01, max_rank: int = 1
+) -> int:
"""Log #PSMs identified after rescoring."""
id_psms_after = (
- (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
+ (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"])
).sum()
diff = id_psms_after - id_psms_before
diff_perc = diff / id_psms_before if id_psms_before > 0 else None
diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff)
diff_word = "more" if diff > 0 else "less"
- logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
+ logger.info(
+ f"Identified {diff_numbers} {diff_word} PSMs with rank <= {max_rank} at {fdr} FDR after "
+ "rescoring."
+ )
return id_psms_after
+
+
+def _fix_constant_pep(psm_list: PSMList) -> PSMList:
+ """Workaround for broken PEP calculation if best PSM is decoy."""
+ logger.warning(
+ "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the "
+ "best target PSM."
+ )
+ max_target_score = psm_list["score"][~psm_list["is_decoy"]].max()
+ higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score)
+
+ if not higher_scoring_decoys.any():
+ logger.warning("No decoys scoring higher than the best target found. Skipping fix.")
+ else:
+ psm_list = psm_list[~higher_scoring_decoys]
+ logger.warning(f"Removed {higher_scoring_decoys.sum()} decoy PSMs.")
+
+ return psm_list
+
+
+def _calculate_confidence(psm_list: PSMList) -> PSMList:
+ """
+ Calculate scores, q-values, and PEPs for PSMs and peptides and add them to PSMList.
+ """
+ # Minimal conversion to LinearPsmDataset
+ psm_df = psm_list.to_dataframe()
+ psm_df = psm_df.reset_index(drop=True).reset_index()
+ psm_df["peptide"] = (
+ psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
+ )
+ psm_df["is_target"] = ~psm_df["is_decoy"]
+ lin_psm_data = LinearPsmDataset(
+ psms=psm_df[["index", "peptide", "is_target"]],
+ target_column="is_target",
+ spectrum_columns="index", # Use artificial index to allow multi-rank rescoring
+ peptide_column="peptide",
+ )
+
+ # Recalculate confidence
+ new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
+
+ # Add new confidence estimations to PSMList
+ add_psm_confidence(psm_list, new_confidence)
+ add_peptide_confidence(psm_list, new_confidence)
+
+ return psm_list
diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py
index f2b06323..ba5492a2 100644
--- a/ms2rescore/exceptions.py
+++ b/ms2rescore/exceptions.py
@@ -25,7 +25,19 @@ class ModificationParsingError(IDFileParsingError):
pass
+class MissingValuesError(MS2RescoreError):
+ """Missing values in PSMs and/or spectra."""
+
+ pass
+
+
class ReportGenerationError(MS2RescoreError):
"""Error while generating report."""
pass
+
+
+class RescoringError(MS2RescoreError):
+ """Error while rescoring PSMs."""
+
+ pass
diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py
index 9424448a..52440a5f 100644
--- a/ms2rescore/feature_generators/__init__.py
+++ b/ms2rescore/feature_generators/__init__.py
@@ -7,6 +7,7 @@
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
+from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
@@ -14,4 +15,5 @@
"deeplc": DeepLCFeatureGenerator,
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
+ "im2deep": IM2DeepFeatureGenerator,
}
diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
index 50b577ff..30bea716 100644
--- a/ms2rescore/feature_generators/deeplc.py
+++ b/ms2rescore/feature_generators/deeplc.py
@@ -21,12 +21,10 @@
from collections import defaultdict
from inspect import getfullargspec
from itertools import chain
-from typing import List, Optional, Union
+from typing import List, Union
import numpy as np
-import pandas as pd
from psm_utils import PSMList
-from psm_utils.io import peptide_record
from ms2rescore.feature_generators.base import FeatureGeneratorBase
@@ -41,8 +39,7 @@ def __init__(
self,
*args,
lower_score_is_better: bool = False,
- calibration_set_size: Union[int, float] = 0.15,
- spectrum_path: Optional[str] = None,
+ calibration_set_size: Union[int, float, None] = None,
processes: int = 1,
**kwargs,
) -> None:
@@ -59,9 +56,6 @@ def __init__(
calibration_set_size: int or float
Amount of best PSMs to use for DeepLC calibration. If this value is lower
than the number of available PSMs, all PSMs will be used. (default: 0.15)
- spectrum_path
- Path to spectrum file or directory with spectrum files. If None, inferred from `run`
- field in PSMs. Defaults to None.
processes: {int, None}
Number of processes to use in DeepLC. Defaults to 1.
kwargs: dict
@@ -77,7 +71,6 @@ def __init__(
self.lower_psm_score_better = lower_score_is_better
self.calibration_set_size = calibration_set_size
- self.spectrum_path = spectrum_path
self.processes = processes
self.deeplc_kwargs = kwargs or {}
@@ -151,17 +144,15 @@ def add_features(self, psm_list: PSMList) -> None:
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
- logger.debug("Calibrating DeepLC...")
psm_list_calibration = self._get_calibration_psms(psm_list_run)
+ logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)
- self.deeplc_predictor.calibrate_preds(
- seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
- )
+ self.deeplc_predictor.calibrate_preds(psm_list_calibration)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.selected_model:
@@ -174,11 +165,7 @@ def add_features(self, psm_list: PSMList) -> None:
)
logger.debug("Predicting retention times...")
- predictions = np.array(
- self.deeplc_predictor.make_preds(
- seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
- )
- )
+ predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run))
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)
@@ -204,25 +191,25 @@ def add_features(self, psm_list: PSMList) -> None:
)
current_run += 1
- # TODO: Remove when DeepLC supports PSMList directly
- @staticmethod
- def _psm_list_to_deeplc_peprec(psm_list: PSMList) -> pd.DataFrame:
- peprec = peptide_record.to_dataframe(psm_list)
- peprec = peprec.rename(
- columns={
- "observed_retention_time": "tr",
- "peptide": "seq",
- }
- )[["tr", "seq", "modifications"]]
- return peprec
-
def _get_calibration_psms(self, psm_list: PSMList):
"""Get N best scoring target PSMs for calibration."""
psm_list_targets = psm_list[~psm_list["is_decoy"]]
- n_psms = self._get_number_of_calibration_psms(psm_list_targets)
- indices = np.argsort(psm_list_targets["score"])
- indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
- return psm_list_targets[indices]
+ if self.calibration_set_size:
+ n_psms = self._get_number_of_calibration_psms(psm_list_targets)
+ indices = np.argsort(psm_list_targets["score"])
+ indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:]
+ return psm_list_targets[indices]
+ else:
+ identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01]
+ if len(identified_psms) == 0:
+ raise ValueError(
+ "No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc."
+ )
+            elif (len(identified_psms) < 500) and self.deeplc_kwargs.get("deeplc_retrain", False):
+                logger.warning(
+                    "Less than 500 target PSMs with q-value <= 0.01 found for retraining. "
+                    "Consider turning off deeplc_retrain, as this is likely not enough data for retraining."
+                )
+ return identified_psms
def _get_number_of_calibration_psms(self, psm_list):
"""Get number of calibration PSMs given `calibration_set_size` and total number of PSMs."""
diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py
new file mode 100644
index 00000000..552b7280
--- /dev/null
+++ b/ms2rescore/feature_generators/im2deep.py
@@ -0,0 +1,169 @@
+"""
+IM2Deep ion mobility-based feature generator.
+
+IM2Deep is a fully modification-aware peptide ion mobility predictor. It uses a deep convolutional
+neural network to predict collision cross section (CCS) values based on the atomic composition of
+the (modified) amino acid residues in the peptide. See
+`IM2Deep <https://github.com/compomics/IM2Deep>`_ for more information.
+
+"""
+
+import contextlib
+import logging
+import os
+from inspect import getfullargspec
+from itertools import chain
+from typing import List
+
+import numpy as np
+import pandas as pd
+from im2deep.calibrate import im2ccs
+from im2deep.im2deep import predict_ccs
+from psm_utils import PSMList
+
+from ms2rescore.feature_generators.base import FeatureGeneratorBase
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+logger = logging.getLogger(__name__)
+
+
+class IM2DeepFeatureGenerator(FeatureGeneratorBase):
+ """IM2Deep collision cross section feature generator."""
+
+ def __init__(
+ self,
+ *args,
+ processes: int = 1,
+ **kwargs,
+ ):
+ """
+ Initialize the IM2DeepFeatureGenerator.
+
+ Parameters
+ ----------
+ processes : int, optional
+ Number of parallel processes to use for IM2Deep predictions. Default is 1.
+ **kwargs : dict, optional
+ Additional keyword arguments to `im2deep.predict_ccs`.
+
+ """
+ super().__init__(*args, **kwargs)
+
+ self._verbose = logger.getEffectiveLevel() <= logging.DEBUG
+
+ # Remove any kwargs that are not IM2Deep arguments
+ self.im2deep_kwargs = kwargs or {}
+ self.im2deep_kwargs = {
+ k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args
+ }
+ self.im2deep_kwargs["n_jobs"] = processes
+
+ @property
+ def feature_names(self) -> List[str]:
+ return [
+ "ccs_observed_im2deep",
+ "ccs_predicted_im2deep",
+ "ccs_error_im2deep",
+ "abs_ccs_error_im2deep",
+ "perc_ccs_error_im2deep",
+ ]
+
+ def add_features(self, psm_list: PSMList) -> None:
+ """Add IM2Deep-derived features to PSMs"""
+ logger.info("Adding IM2Deep-derived features to PSMs")
+
+ # Get easy-access nested version of PSMlist
+ psm_dict = psm_list.get_psm_dict()
+
+ # Run IM2Deep for each spectrum file
+ current_run = 1
+ total_runs = sum(len(runs) for runs in psm_dict.values())
+
+ for runs in psm_dict.values():
+ # Reset IM2Deep predictor for each collection of runs
+ for run, psms in runs.items():
+ logger.info(
+ f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..."
+ )
+
+ # Disable wild logging to stdout by TensorFlow, unless in debug mode
+ with (
+ contextlib.redirect_stdout(open(os.devnull, "w"))
+ if not self._verbose
+ else contextlib.nullcontext()
+ ):
+ # Make new PSM list for this run (chain PSMs per spectrum to flat list)
+ psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+
+ logger.debug("Calibrating IM2Deep...")
+
+ # Convert ion mobility to CCS and calibrate CCS values
+ psm_list_run_df = psm_list_run.to_dataframe()
+ psm_list_run_df["charge"] = [
+ pep.precursor_charge for pep in psm_list_run_df["peptidoform"]
+ ]
+ psm_list_run_df["ccs_observed"] = im2ccs(
+ psm_list_run_df["ion_mobility"],
+ psm_list_run_df["precursor_mz"],
+ psm_list_run_df["charge"],
+ )
+
+ # Create dataframe with high confidence hits for calibration
+ cal_psm_df = self.make_calibration_df(psm_list_run_df)
+
+ # Make predictions with IM2Deep
+ logger.debug("Predicting CCS values...")
+ predictions = predict_ccs(
+ psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs
+ )
+
+ # Add features to PSMs
+ logger.debug("Adding features to PSMs...")
+ observations = psm_list_run_df["ccs_observed"]
+ ccs_diffs_run = np.abs(predictions - observations)
+ for i, psm in enumerate(psm_list_run):
+ psm["rescoring_features"].update(
+ {
+ "ccs_observed_im2deep": observations[i],
+ "ccs_predicted_im2deep": predictions[i],
+ "ccs_error_im2deep": ccs_diffs_run[i],
+ "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]),
+ "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i])
+ / observations[i]
+ * 100,
+ }
+ )
+
+ current_run += 1
+
+ @staticmethod
+ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame:
+ """
+ Make dataframe for calibration of IM2Deep predictions.
+
+ Parameters
+ ----------
+ psm_list_df
+ DataFrame with PSMs.
+        threshold
+            Fraction of highest scoring identified target PSMs to use for calibration,
+            default 0.25.
+
+ Returns
+ -------
+ pd.DataFrame
+ DataFrame with high confidence hits for calibration.
+
+ """
+ identified_psms = psm_list_df[
+ (psm_list_df["qvalue"] < 0.01)
+ & (~psm_list_df["is_decoy"])
+ & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep
+ ]
+ calibration_psms = identified_psms[
+ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold)
+ ]
+ logger.debug(
+ f"Number of high confidence hits for calculating shift: {len(calibration_psms)}"
+ )
+ return calibration_psms
diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py
index 63ed3b87..7fa0c0a1 100644
--- a/ms2rescore/feature_generators/ionmob.py
+++ b/ms2rescore/feature_generators/ionmob.py
@@ -165,6 +165,7 @@ def add_features(self, psm_list: PSMList) -> None:
)
]
+ # TODO: Use observed m/z?
psm_list_run_df["mz"] = psm_list_run_df.apply(
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
) # use precursor m/z from PSMs?
@@ -175,9 +176,8 @@ def add_features(self, psm_list: PSMList) -> None:
)
# calibrate CCS values
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
- psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
- lambda x: x["ccs_observed"] + shift_factor, axis=1
- )
+        psm_list_run_df["ccs_observed"] += shift_factor
+
# predict CCS values
tf_ds = to_tf_dataset_inference(
psm_list_run_df["mz"],
diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py
index 243bd794..3882b3fc 100644
--- a/ms2rescore/feature_generators/ms2pip.py
+++ b/ms2rescore/feature_generators/ms2pip.py
@@ -193,7 +193,7 @@ def add_features(self, psm_list: PSMList) -> None:
try:
ms2pip_results = correlate(
psms=psm_list_run,
- spectrum_file=spectrum_filename,
+ spectrum_file=str(spectrum_filename),
spectrum_id_pattern=self.spectrum_id_pattern,
model=self.model,
ms2_tolerance=self.ms2_tolerance,
diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py
index c39f2922..a077a458 100644
--- a/ms2rescore/gui/app.py
+++ b/ms2rescore/gui/app.py
@@ -360,15 +360,20 @@ def __init__(self, *args, **kwargs):
self.deeplc_config = DeepLCConfiguration(self)
self.deeplc_config.grid(row=2, column=0, pady=(0, 20), sticky="nsew")
+ self.im2deep_config = Im2DeepConfiguration(self)
+ self.im2deep_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
+
self.ionmob_config = IonmobConfiguration(self)
- self.ionmob_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
+ self.ionmob_config.grid(row=4, column=0, pady=(0, 20), sticky="nsew")
def get(self) -> Dict:
"""Return the configuration as a dictionary."""
basic_enabled, basic_config = self.basic_config.get()
ms2pip_enabled, ms2pip_config = self.ms2pip_config.get()
deeplc_enabled, deeplc_config = self.deeplc_config.get()
+ im2deep_enabled, im2deep_config = self.im2deep_config.get()
ionmob_enabled, ionmob_config = self.ionmob_config.get()
+
config = {}
if basic_enabled:
config["basic"] = basic_config
@@ -523,6 +528,27 @@ def get(self) -> Dict:
return enabled, config
+class Im2DeepConfiguration(ctk.CTkFrame):
+ def __init__(self, *args, **kwargs):
+ """IM2Deep configuration frame."""
+ super().__init__(*args, **kwargs)
+
+ self.configure(fg_color="transparent")
+ self.grid_columnconfigure(0, weight=1)
+
+ self.title = widgets.Heading(self, text="im2deep")
+ self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
+
+ self.enabled = widgets.LabeledSwitch(self, label="Enable im2deep", default=False)
+ self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
+
+ def get(self) -> Dict:
+ """Return the configuration as a dictionary."""
+ enabled = self.enabled.get()
+ config = {}
+ return enabled, config
+
+
class RescoringEngineConfig(ctk.CTkFrame):
def __init__(self, *args, **kwargs):
"""Rescoring engine configuration frame."""
diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json
index 71f704b5..733ea707 100644
--- a/ms2rescore/package_data/config_default.json
+++ b/ms2rescore/package_data/config_default.json
@@ -14,6 +14,7 @@
},
"rescoring_engine": {
"mokapot": {
+ "train_fdr": 0.01,
"write_weights": true,
"write_txt": true,
"write_flashlfq": true
@@ -29,12 +30,16 @@
"id_decoy_pattern": null,
"psm_id_pattern": null,
"spectrum_id_pattern": null,
+ "psm_id_rt_pattern": null,
+ "psm_id_im_pattern": null,
"lower_score_is_better": false,
+ "max_psm_rank_input": 10,
+ "max_psm_rank_output": 1,
"modification_mapping": {},
"fixed_modifications": {},
"processes": -1,
"rename_to_usi": false,
"fasta_file": null,
- "write_report": true
+ "write_report": false
}
}
diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json
new file mode 100644
index 00000000..2a77adf1
--- /dev/null
+++ b/ms2rescore/package_data/config_default_tims.json
@@ -0,0 +1,25 @@
+{
+ "$schema": "./config_schema.json",
+ "ms2rescore": {
+ "feature_generators": {
+ "basic": {},
+ "ms2pip": {
+ "model": "timsTOF",
+ "ms2_tolerance": 0.02
+ },
+ "deeplc": {
+ "deeplc_retrain": false
+ },
+ "im2deep": {},
+ "maxquant": {}
+ },
+ "rescoring_engine": {
+ "mokapot": {
+ "write_weights": true,
+ "write_txt": true,
+ "write_flashlfq": true
+ }
+ },
+ "psm_file": null
+ }
+}
diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
index a97c9a59..ab77f3b7 100644
--- a/ms2rescore/package_data/config_schema.json
+++ b/ms2rescore/package_data/config_schema.json
@@ -29,6 +29,9 @@
},
"ionmob": {
"$ref": "#/definitions/ionmob"
+ },
+ "im2deep": {
+ "$ref": "#/definitions/im2deep"
}
},
"default": {
@@ -65,7 +68,11 @@
},
"psm_file": {
"description": "Path to file with peptide-spectrum matches.",
- "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
+ "oneOf": [
+ { "type": "string" },
+ { "type": "null" },
+ { "type": "array", "items": { "type": "string" } }
+ ]
},
"psm_file_type": {
"description": "PSM file type. By default inferred from file extension.",
@@ -107,11 +114,35 @@
"default": "(.*)",
"format": "regex"
},
+ "psm_id_rt_pattern": {
+ "description": "Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group.",
+ "oneOf": [{ "type": "string" }, { "type": "null" }],
+ "default": null,
+ "format": "regex"
+ },
+ "psm_id_im_pattern": {
+ "description": "Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group.",
+ "oneOf": [{ "type": "string" }, { "type": "null" }],
+ "default": null,
+ "format": "regex"
+ },
"lower_score_is_better": {
"description": "Bool indicating if lower score is better",
"type": "boolean",
"default": false
},
+ "max_psm_rank_input": {
+ "description": "Maximum rank of PSMs to use as input for rescoring",
+ "type": "number",
+ "default": 10,
+ "minimum": 1
+ },
+ "max_psm_rank_output": {
+ "description": "Maximum rank of PSMs to return after rescoring, before final FDR calculation",
+ "type": "number",
+ "default": 1,
+ "minimum": 1
+ },
"modification_mapping": {
"description": "Mapping of modification labels to each replacement label.",
"type": "object",
@@ -142,6 +173,11 @@
"description": "Write an HTML report with various QC metrics and charts",
"type": "boolean",
"default": false
+ },
+ "profile": {
+ "description": "Write a txt report using cProfile for profiling",
+ "type": "boolean",
+ "default": false
}
}
}
@@ -224,12 +260,32 @@
}
}
},
+ "im2deep": {
+ "$ref": "#/definitions/feature_generator",
+ "description": "Ion mobility feature generator configuration using IM2Deep",
+ "type": "object",
+ "additionalProperties": true,
+ "properties": {
+ "reference_dataset": {
+ "description": "Path to IM2Deep reference dataset file",
+ "type": "string",
+ "default": "Meier_unimod.parquet"
+ }
+ }
+ },
"mokapot": {
"$ref": "#/definitions/rescoring_engine",
"description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.",
"type": "object",
"additionalProperties": true,
"properties": {
+ "train_fdr": {
+ "description": "FDR threshold for training Mokapot",
+ "type": "number",
+ "minimum": 0,
+ "maximum": 1,
+ "default": 0.01
+ },
"write_weights": {
"description": "Write Mokapot weights to a text file",
"type": "boolean",
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index a9855fda..12edd0e9 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -1,7 +1,8 @@
import logging
import re
-from typing import Dict, Union
+from typing import Dict, Optional, Union
+import numpy as np
import psm_utils.io
from psm_utils import PSMList
@@ -23,10 +24,30 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
"""
- # Read PSMs, find decoys, calculate q-values
- psm_list = _read_psms(config, psm_list)
- _find_decoys(config, psm_list)
- _calculate_qvalues(config, psm_list)
+ # Read PSMs
+ try:
+ psm_list = _read_psms(config, psm_list)
+ except psm_utils.io.PSMUtilsIOException:
+ raise MS2RescoreConfigurationError(
+ "Error occurred while reading PSMs. Please check the 'psm_file' and "
+ "'psm_file_type' settings. See "
+ "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
+ " for more information."
+ )
+
+ # Filter by PSM rank
+ psm_list.set_ranks(config["lower_score_is_better"])
+ rank_filter = psm_list["rank"] <= config["max_psm_rank_input"]
+ psm_list = psm_list[rank_filter]
+    logger.info(f"Removed {sum(~rank_filter)} PSMs with rank > {config['max_psm_rank_input']}.")
+
+ # Remove invalid AAs, find decoys, calculate q-values
+ psm_list = _remove_invalid_aa(psm_list)
+ _find_decoys(psm_list, config["id_decoy_pattern"])
+ _calculate_qvalues(psm_list, config["lower_score_is_better"])
+ if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]:
+ logger.debug("Parsing retention time and/or ion mobility from PSM identifier...")
+        _parse_values_from_spectrum_id(
+            psm_list, config["psm_id_rt_pattern"], config["psm_id_im_pattern"]
+        )
# Store scoring values for comparison later
for psm in psm_list:
@@ -51,7 +72,8 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
non_mapped_modifications = modifications_found - set(config["modification_mapping"].keys())
if non_mapped_modifications:
logger.warning(
- f"Non-mapped modifications found: {non_mapped_modifications}\nThis can be ignored if Unimod modification label"
+ f"Non-mapped modifications found: {non_mapped_modifications}\n"
+ "This can be ignored if they are Unimod modification labels."
)
psm_list.rename_modifications(config["modification_mapping"])
psm_list.add_fixed_modifications(config["fixed_modifications"])
@@ -66,10 +88,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids
- # TODO: Temporary fix until implemented in psm_utils
- # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
- psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
-
return psm_list
@@ -77,49 +95,30 @@ def _read_psms(config, psm_list):
if isinstance(psm_list, PSMList):
return psm_list
else:
- logger.info("Reading PSMs from file...")
- current_file = 1
total_files = len(config["psm_file"])
- valid_psms_list = []
- total_psms = 0
- valid_psms = 0
- for psm_file in config["psm_file"]:
+ psm_list = []
+ for current_file, psm_file in enumerate(config["psm_file"]):
logger.info(
- f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
+ f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..."
)
- try:
- id_file_psm_list = psm_utils.io.read_file(
+ psm_list.extend(
+ psm_utils.io.read_file(
psm_file,
filetype=config["psm_file_type"],
show_progressbar=True,
**config["psm_reader_kwargs"],
)
- except psm_utils.io.PSMUtilsIOException:
- raise MS2RescoreConfigurationError(
- "Error occurred while reading PSMs. Please check the 'psm_file' and "
- "'psm_file_type' settings. See "
- "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
- " for more information."
- )
-
- total_psms += len(id_file_psm_list.psm_list)
- for psm in id_file_psm_list.psm_list:
- if not _has_invalid_aminoacids(psm):
- valid_psms_list.append(psm)
- valid_psms += 1
- current_file += 1
- if total_psms - valid_psms > 0:
- logger.warning(
- f"{total_psms - valid_psms} PSMs with invalid amino acids were removed."
)
- return PSMList(psm_list=valid_psms_list)
+ logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.")
+ return PSMList(psm_list=psm_list)
-def _find_decoys(config, psm_list):
+
+def _find_decoys(psm_list: PSMList, id_decoy_pattern: Optional[str] = None):
"""Find decoys in PSMs, log amount, and raise error if none found."""
logger.debug("Finding decoys...")
- if config["id_decoy_pattern"]:
- psm_list.find_decoys(config["id_decoy_pattern"])
+ if id_decoy_pattern:
+ psm_list.find_decoys(id_decoy_pattern)
n_psms = len(psm_list)
percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
@@ -134,12 +133,12 @@ def _find_decoys(config, psm_list):
)
-def _calculate_qvalues(config, psm_list):
+def _calculate_qvalues(psm_list: PSMList, lower_score_is_better: bool):
"""Calculate q-values for PSMs if not present."""
# Calculate q-values if not present
if None in psm_list["qvalue"]:
logger.debug("Recalculating q-values...")
- psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])
+ psm_list.calculate_qvalues(reverse=not lower_score_is_better)
def _match_psm_ids(old_id, regex_pattern):
@@ -154,7 +153,45 @@ def _match_psm_ids(old_id, regex_pattern):
)
-def _has_invalid_aminoacids(psm):
- """Check if a PSM contains invalid amino acids."""
+def _parse_values_from_spectrum_id(
+ psm_list: PSMList,
+ psm_id_rt_pattern: Optional[str] = None,
+ psm_id_im_pattern: Optional[str] = None,
+):
+ """Parse retention time and or ion mobility values from the spectrum_id."""
+ for pattern, label, key in zip(
+ [psm_id_rt_pattern, psm_id_im_pattern],
+ ["retention time", "ion mobility"],
+ ["retention_time", "ion_mobility"],
+ ):
+ if pattern:
+            logger.debug(f"Parsing {label} from spectrum_id with regex pattern {pattern}")
+ try:
+ pattern = re.compile(pattern)
+ psm_list[key] = [
+ float(pattern.search(psm.spectrum_id).group(1)) for psm in psm_list
+ ]
+ except AttributeError:
+ raise MS2RescoreConfigurationError(
+ f"Could not parse {label} from spectrum_id with the "
+ f"{pattern} regex pattern. "
+ f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n. "
+ f"Please make sure the {label} key is present in the spectrum_id "
+ "and the value is in a capturing group or disable the relevant feature generator."
+ )
+
+
+def _remove_invalid_aa(psm_list: PSMList) -> PSMList:
+ """Remove PSMs with invalid amino acids."""
+ invalid_psms = np.array(
+ [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list]
+ )
- return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence)
+ if any(invalid_psms):
+ logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.")
+ return psm_list[~invalid_psms]
+ else:
+ logger.debug("No PSMs with invalid amino acids found.")
+ return psm_list
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
index 17b35d60..2b2c1b5f 100644
--- a/ms2rescore/parse_spectra.py
+++ b/ms2rescore/parse_spectra.py
@@ -6,7 +6,6 @@
from ms2rescore_rs import get_precursor_info
from psm_utils import PSMList
-from rich.progress import track
from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.utils import infer_spectrum_path
diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py
index 48d543a7..669bcc62 100644
--- a/ms2rescore/report/charts.py
+++ b/ms2rescore/report/charts.py
@@ -198,6 +198,7 @@ def score_scatter_plot(
after: mokapot.LinearConfidence,
level: str = "psms",
indexer: str = "index",
+ fdr_threshold: float = 0.01,
) -> go.Figure:
"""
Plot PSM scores before and after rescoring.
@@ -241,16 +242,22 @@ def score_scatter_plot(
ce_psms = pd.concat([ce_psms_targets, ce_psms_decoys], axis=0)
# Get score thresholds
- score_threshold_before = (
- ce_psms[ce_psms["mokapot q-value before"] <= 0.01]
- .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
- .iloc[0]
- )
- score_threshold_after = (
- ce_psms[ce_psms["mokapot q-value after"] <= 0.01]
- .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
- .iloc[0]
- )
+ try:
+ score_threshold_before = (
+ ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold]
+ .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
+ .iloc[0]
+ )
+ except IndexError: # No PSMs below threshold
+ score_threshold_before = None
+ try:
+ score_threshold_after = (
+ ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold]
+ .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
+ .iloc[0]
+ )
+ except IndexError: # No PSMs below threshold
+ score_threshold_after = None
# Plot
fig = px.scatter(
@@ -267,10 +274,12 @@ def score_scatter_plot(
},
)
# draw FDR thresholds
- fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1)
- fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1)
- fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1)
- fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2)
+ if score_threshold_before:
+ fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1)
+ fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1)
+ if score_threshold_after:
+ fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1)
+ fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2)
return fig
@@ -373,7 +382,7 @@ def identification_overlap(
return figure
levels = before.levels # ["psms", "peptides", "proteins"] if all available
- indexers = ["index", "index", "mokapot protein group"]
+ indexers = ["index", "peptide", "mokapot protein group"]
overlap_data = defaultdict(dict)
for level, indexer in zip(levels, indexers):
@@ -386,7 +395,7 @@ def identification_overlap(
set_after = set(df_after[df_after["mokapot q-value"] <= 0.01][indexer])
overlap_data["removed"][level] = -len(set_before - set_after)
- overlap_data["retained"][level] = len(set_before | set_after)
+ overlap_data["retained"][level] = len(set_after.intersection(set_before))
overlap_data["gained"][level] = len(set_after - set_before)
colors = ["#953331", "#316395", "#319545"]
diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py
index cc4b8aa9..91444bfe 100644
--- a/ms2rescore/report/generate.py
+++ b/ms2rescore/report/generate.py
@@ -82,7 +82,7 @@ def generate_report(
# Read config
config = json.loads(files["configuration"].read_text())
- logger.info("Recalculating confidence estimates...")
+ logger.debug("Recalculating confidence estimates...")
fasta_file = config["ms2rescore"]["fasta_file"]
confidence_before, confidence_after = get_confidence_estimates(psm_list, fasta_file)
@@ -139,19 +139,21 @@ def generate_report(
def _collect_files(output_path_prefix, use_txt_log=False):
"""Collect all files generated by MS²Rescore."""
- logger.info("Collecting files...")
+ logger.debug("Collecting files...")
files = {
"PSMs": Path(output_path_prefix + ".psms.tsv").resolve(),
"configuration": Path(output_path_prefix + ".full-config.json").resolve(),
"feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(),
"feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(),
- "log": Path(output_path_prefix + ".log.txt").resolve()
- if use_txt_log
- else Path(output_path_prefix + ".log.html").resolve(),
+ "log": (
+ Path(output_path_prefix + ".log.txt").resolve()
+ if use_txt_log
+ else Path(output_path_prefix + ".log.html").resolve()
+ ),
}
for file, path in files.items():
if Path(path).is_file():
- logger.info("✅ Found %s: '%s'", file, path.as_posix())
+ logger.debug("✅ Found %s: '%s'", file, path.as_posix())
else:
logger.warning("❌ %s: '%s'", file, path.as_posix())
files[file] = None
@@ -183,7 +185,7 @@ def _get_stats_context(confidence_before, confidence_after):
"item": level_name,
"card_color": card_color,
"number": after,
- "diff": f"{after - before:+}",
+ "diff": f"({after - before:+})",
"percentage": f"{increase:.1f}%",
"is_increase": increase > 0,
"bar_percentage": before / after * 100 if increase > 0 else after / before * 100,
@@ -321,16 +323,12 @@ def _get_features_context(
import deeplc.plot
scatter_chart = deeplc.plot.scatter(
- df=features[
- (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
- ], # noqa: E712
+ df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
predicted_column="predicted_retention_time_best",
observed_column="observed_retention_time_best",
)
baseline_chart = deeplc.plot.distribution_baseline(
- df=features[
- (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
- ], # noqa: E712
+ df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
predicted_column="predicted_retention_time_best",
observed_column="observed_retention_time_best",
)
@@ -343,6 +341,26 @@ def _get_features_context(
}
)
+ # IM2Deep specific charts
+ if "im2deep" in feature_names:
+ import deeplc.plot
+
+ scatter_chart = deeplc.plot.scatter(
+ df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)],
+ predicted_column="ccs_predicted_im2deep",
+ observed_column="ccs_observed_im2deep",
+ xaxis_label="Observed CCS",
+ yaxis_label="Predicted CCS",
+ plot_title="Predicted vs. observed CCS",
+ )
+
+ context["charts"].append(
+ {
+ "title": TEXTS["charts"]["im2deep_performance"]["title"],
+ "description": TEXTS["charts"]["im2deep_performance"]["description"],
+ "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS),
+ }
+ )
return context
diff --git a/ms2rescore/report/templates/texts.toml b/ms2rescore/report/templates/texts.toml
index 2d0840cf..52c9a230 100644
--- a/ms2rescore/report/templates/texts.toml
+++ b/ms2rescore/report/templates/texts.toml
@@ -105,3 +105,9 @@ bottom chart shows the distribution of RMAE values of DeepLC predictions on 460
datasets. The red line indicates the RMAE value for all target PSMs that passed the 1% FDR threshold
of the current dataset. A lower RMAE value indicates better performance.
"""
+
+[charts.im2deep_performance]
+title = "IM2Deep model performance"
+description = """
+IM2Deep model performance can be visualized by plotting the predicted CCS against the observed CCS.
+"""
diff --git a/ms2rescore/report/utils.py b/ms2rescore/report/utils.py
index 272bdb8b..069a641f 100644
--- a/ms2rescore/report/utils.py
+++ b/ms2rescore/report/utils.py
@@ -51,35 +51,25 @@ def get_confidence_estimates(
"was generated by MS²Rescore. Could not generate report."
) from e
+ score_after = psm_list["score"]
peptide = (
pd.Series(psm_list["peptidoform"]).astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
)
- psms = pd.DataFrame(
- {
- "peptide": peptide,
- "is_target": ~psm_list["is_decoy"],
- "before": score_before,
- "after": psm_list["score"],
- }
- ).reset_index()
-
+ psms = pd.DataFrame({"peptide": peptide, "is_target": ~psm_list["is_decoy"]}).reset_index()
+ lin_psm_dataset = LinearPsmDataset(
+ psms=psms,
+ target_column="is_target",
+ spectrum_columns="index",
+ peptide_column="peptide",
+ )
if fasta_file:
fasta = read_fasta(fasta_file)
+ lin_psm_dataset.add_proteins(fasta)
confidence = dict()
- for when in ["before", "after"]:
- lin_psm_dataset = LinearPsmDataset(
- psms=psms,
- target_column="is_target",
- spectrum_columns="index",
- feature_columns=[when],
- peptide_column="peptide",
- )
- if fasta_file:
- lin_psm_dataset.add_proteins(fasta)
-
+ for when, scores in [("before", score_before), ("after", score_after)]:
try:
- confidence[when] = lin_psm_dataset.assign_confidence()
+ confidence[when] = lin_psm_dataset.assign_confidence(scores=scores)
except RuntimeError:
confidence[when] = None
diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
index cc7a336f..f02d877f 100644
--- a/ms2rescore/rescoring_engines/mokapot.py
+++ b/ms2rescore/rescoring_engines/mokapot.py
@@ -20,7 +20,8 @@
"""
import logging
-from typing import Any, List, Optional, Tuple, Dict
+import re
+from typing import Any, Dict, List, Optional, Tuple
import mokapot
import numpy as np
@@ -28,15 +29,20 @@
import psm_utils
from mokapot.brew import brew
from mokapot.dataset import LinearPsmDataset
+from mokapot.model import PercolatorModel
from pyteomics.mass import nist_mass
+from ms2rescore.exceptions import RescoringError
+
logger = logging.getLogger(__name__)
+logging.getLogger("numba").setLevel(logging.WARNING)
def rescore(
psm_list: psm_utils.PSMList,
output_file_root: str = "ms2rescore",
fasta_file: Optional[str] = None,
+ train_fdr: float = 0.01,
write_weights: bool = False,
write_txt: bool = False,
write_flashlfq: bool = False,
@@ -63,6 +69,8 @@ def rescore(
fasta_file
Path to FASTA file with protein sequences to use for protein inference. Defaults to
``None``.
+ train_fdr
+ FDR to use for training the Mokapot model. Defaults to ``0.01``.
write_weights
Write model weights to a text file. Defaults to ``False``.
write_txt
@@ -89,27 +97,15 @@ def rescore(
# Rescore
logger.debug(f"Mokapot brew options: `{kwargs}`")
- confidence_results, models = brew(lin_psm_data, **kwargs)
-
- # Reshape confidence estimates to match PSMList
- mokapot_values_targets = (
- confidence_results.confidence_estimates["psms"]
- .set_index("index")
- .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]]
- )
- mokapot_values_decoys = (
- confidence_results.decoy_confidence_estimates["psms"]
- .set_index("index")
- .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]]
- )
- q = np.full((len(psm_list), 3), np.nan)
- q[mokapot_values_targets.index] = mokapot_values_targets.values
- q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+ try:
+ confidence_results, models = brew(
+ lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
+ )
+ except RuntimeError as e:
+ raise RescoringError("Mokapot could not be run. Please check the input data.") from e
- # Add Mokapot results to PSMList
- psm_list["score"] = q[:, 0]
- psm_list["qvalue"] = q[:, 1]
- psm_list["pep"] = q[:, 2]
+ add_psm_confidence(psm_list, confidence_results)
+ add_peptide_confidence(psm_list, confidence_results)
# Write results
if write_weights:
@@ -173,7 +169,7 @@ def convert_psm_list(
# Ensure filename for FlashLFQ txt output
if not combined_df["run"].notnull().all():
- combined_df["run"] = "ms_run"
+ combined_df["run"] = "na"
feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None
@@ -224,6 +220,58 @@ def save_model_weights(
)
+def add_psm_confidence(
+ psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+ """Add PSM-level confidence estimates to PSM list, updating score, qvalue, pep, and rank."""
+ # Reshape confidence estimates to match PSMList
+ keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+ mokapot_values_targets = (
+ confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
+ )
+ mokapot_values_decoys = (
+ confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
+ )
+ q = np.full((len(psm_list), 3), np.nan)
+ q[mokapot_values_targets.index] = mokapot_values_targets.values
+ q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+
+ # Add Mokapot results to PSMList
+ psm_list["score"] = q[:, 0]
+ psm_list["qvalue"] = q[:, 1]
+ psm_list["pep"] = q[:, 2]
+
+ # Reset ranks to match new scores
+ psm_list.set_ranks(lower_score_better=False)
+
+
+def add_peptide_confidence(
+ psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+ """Add Mokapot peptide-level confidence estimates to PSM list."""
+ keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+ peptide_info = pd.concat(
+ [
+ confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys],
+ confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys],
+ ],
+ axis=0,
+ ).to_dict(orient="index")
+
+ # Add peptide-level scores to PSM metadata
+ # run_key = "na" if not all(psm.run for psm in psm_list) else None
+ no_charge_pattern = re.compile(r"(/\d+$)")
+ for psm in psm_list:
+ peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
+ psm.metadata.update(
+ {
+ "peptide_score": peptide_scores["mokapot score"],
+ "peptide_qvalue": peptide_scores["mokapot q-value"],
+ "peptide_pep": peptide_scores["mokapot PEP"],
+ }
+ )
+
+
def _mz_to_mass(mz: float, charge: int) -> float:
"""Convert m/z to mass."""
return mz * charge - charge * nist_mass["H"][1][0]
diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py
index 5f7d4e5d..9952aed5 100644
--- a/ms2rescore/rescoring_engines/percolator.py
+++ b/ms2rescore/rescoring_engines/percolator.py
@@ -20,8 +20,8 @@
import logging
import subprocess
from typing import Any, Dict, Optional
+from copy import deepcopy
-import numpy as np
import psm_utils
from ms2rescore.exceptions import MS2RescoreError
@@ -103,8 +103,15 @@ def rescore(
# Need to be able to link back to original PSMs, so reindex spectrum IDs, but copy PSM list
# to avoid modifying original...
# TODO: Better approach for this?
- psm_list_reindexed = psm_list.copy()
- psm_list_reindexed["spectrum_id"] = np.arange(len(psm_list_reindexed))
+
+ psm_list_reindexed = deepcopy(psm_list)
+ psm_list_reindexed.set_ranks()
+ psm_list_reindexed["spectrum_id"] = [
+ f"{psm.get_usi(as_url=False)}_{psm.rank}" for psm in psm_list_reindexed
+ ]
+ spectrum_id_index = {
+ spectrum_id: index for index, spectrum_id in enumerate(psm_list_reindexed["spectrum_id"])
+ }
_write_pin_file(psm_list_reindexed, pin_filepath)
@@ -134,10 +141,13 @@ def rescore(
psm_list,
percolator_kwargs["results-psms"],
percolator_kwargs["decoy-results-psms"],
+ spectrum_id_index,
)
-def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str):
+def _update_psm_scores(
+ psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str, spectrum_id_index: dict
+):
"""
Update PSM scores with Percolator results.
@@ -150,7 +160,9 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout
psm_list_percolator = psm_utils.PSMList(psm_list=target_psms.psm_list + decoy_psms.psm_list)
# Sort by reindexed spectrum_id so order matches original PSM list
- psm_list_percolator[np.argsort(psm_list_percolator["spectrum_id"])]
+ psm_list_percolator = sorted(
+ psm_list_percolator, key=lambda psm: spectrum_id_index[psm["spectrum_id"]]
+ )
if not len(psm_list) == len(psm_list_percolator):
raise MS2RescoreError(
@@ -163,6 +175,8 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout
original_psm["qvalue"] = new_psm["qvalue"]
original_psm["pep"] = new_psm["pep"]
+ psm_list.set_ranks(lower_score_better=False)
+
def _write_pin_file(psm_list: psm_utils.PSMList, filepath: str):
"""Write PIN file for rescoring."""
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 70417b56..3515893a 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -35,44 +35,61 @@ def infer_spectrum_path(
"and no run name in PSM file found."
)
- # If passed path is directory, join with run name
- elif os.path.isdir(configured_path):
- if run_name:
- resolved_path = os.path.join(configured_path, run_name)
+ else:
+ is_bruker_dir = configured_path.endswith(".d") or _is_minitdf(configured_path)
+
+ # If passed path is directory (that is not Bruker raw), join with run name
+ if os.path.isdir(configured_path) and not is_bruker_dir:
+ if run_name:
+ resolved_path = os.path.join(configured_path, run_name)
+ else:
+ raise MS2RescoreConfigurationError(
+ "Could not resolve spectrum file name: Spectrum path is directory "
+ "but no run name in PSM file found."
+ )
+
+ # If passed path is file, use that, but warn if basename doesn't match expected
+ elif os.path.isfile(configured_path) or (os.path.isdir(configured_path) and is_bruker_dir):
+ if run_name and Path(configured_path).stem != Path(run_name).stem:
+ logger.warning(
+ "Passed spectrum path (`%s`) does not match run name found in PSM "
+ "file (`%s`). Continuing with passed spectrum path.",
+ configured_path,
+ run_name,
+ )
+ resolved_path = configured_path
else:
raise MS2RescoreConfigurationError(
- "Could not resolve spectrum file name: Spectrum path is directory "
- "but no run name in PSM file found."
- )
-
- # If passed path is file, use that, but warn if basename doesn't match expected
- elif os.path.isfile(configured_path):
- if run_name and Path(configured_path).stem != Path(run_name).stem:
- logger.warning(
- "Passed spectrum path (`%s`) does not match run name found in PSM "
- "file (`%s`). Continuing with passed spectrum path.",
- configured_path,
- run_name,
+ "Configured `spectrum_path` must be `None` or a path to an existing file "
+ "or directory. If `None` or path to directory, spectrum run information "
+ "should be present in the PSM file."
)
- resolved_path = configured_path
- else:
- raise MS2RescoreConfigurationError(
- "Configured `spectrum_path` must be `None` or a path to an existing file "
- "or directory. If `None` or path to directory, spectrum run information "
- "should be present in the PSM file."
- )
# Match with file extension if not in resolved_path yet
- if not re.match(".mgf$|.mzml$", resolved_path, flags=re.IGNORECASE):
+ if not _is_minitdf(resolved_path) and not re.search(
+ r"\.mgf$|\.mzml$|\.d$", resolved_path, flags=re.IGNORECASE
+ ):
for filename in glob(resolved_path + "*"):
- if re.match(r".*(\.mgf$|\.mzml$)", filename, flags=re.IGNORECASE):
+ if re.match(r".*(\.mgf$|\.mzml$|\.d$)", filename, flags=re.IGNORECASE):
resolved_path = filename
break
else:
raise MS2RescoreConfigurationError(
- "Resolved spectrum filename does not contain a supported file "
- "extension (mgf or mzml) and could not find any matching existing "
+ f"Resolved spectrum filename ('{resolved_path}') does not contain a supported "
+ "file extension (mzML, MGF, or .d) and could not find any matching existing "
"files."
)
return Path(resolved_path)
+
+
+def _is_minitdf(spectrum_file: str) -> bool:
+ """
+ Check if the spectrum file is a Bruker miniTDF folder.
+
+ A Bruker miniTDF folder has no fixed name, but contains files matching the patterns
+ ``*ms2spectrum.bin`` and ``*ms2spectrum.parquet``.
+ """
+ bin_files = list(Path(spectrum_file).glob("*ms2spectrum.bin"))
+ parquet_files = list(Path(spectrum_file).glob("*ms2spectrum.parquet"))
+ return bool(bin_files) and bool(parquet_files)
diff --git a/pyproject.toml b/pyproject.toml
index 18bbb8be..d25968bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,25 +32,24 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.8"
dependencies = [
- "ms2rescore_rs",
- "numpy>=1.16.0; python_version != '3.11'",
- "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
- "pandas>=1.0",
- "rich>=12",
- "pyteomics>=4.1.0",
- "lxml>=4.5",
- "ms2pip>=4.0.0-dev4",
- "click>=7",
"cascade-config>=0.4.0",
+ "click>=7",
+ "customtkinter>=5,<6",
"deeplc>=2.2",
"deeplcretrainer>=0.2",
- "tomli>=2; python_version < '3.11'",
- "psm_utils>=0.4",
- "customtkinter>=5,<6",
- "mokapot>=0.9",
- "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
+ "im2deep>=0.1.3",
"jinja2>=3",
+ "lxml>=4.5",
+ "mokapot>=0.9",
+ "ms2pip>=4.0.0-dev10",
+ "ms2rescore_rs",
+ "numpy>=1.16.0",
+ "pandas>=1.0",
"plotly>=5",
+ "psm_utils>=0.9",
+ "pyteomics>=4.7.2",
+ "rich>=12",
+ "tomli>=2; python_version < '3.11'",
]
[project.optional-dependencies]
@@ -79,6 +78,7 @@ CompOmics = "https://www.compomics.com"
ms2rescore = "ms2rescore.__main__:main"
ms2rescore-gui = "ms2rescore.gui.__main__:main"
ms2rescore-report = "ms2rescore.report.__main__:main"
+tims2rescore = "ms2rescore.__main__:main_tims"
[build-system]
requires = ["flit_core >=3.2,<4"]
@@ -94,3 +94,6 @@ target-version = ['py38']
[tool.ruff]
line-length = 99
target-version = 'py38'
+
+[tool.ruff.lint]
+extend-select = ["T201", "T203"]