diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c03b1667..1137ef58 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" @@ -29,7 +29,7 @@ jobs: - name: Test built package run: | - pip install dist/ms2rescore-*.whl + pip install --only-binary :all: dist/ms2rescore-*.whl # pytest ms2rescore --help @@ -47,14 +47,14 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install package and dependencies run: | python -m pip install --upgrade pip - pip install . pyinstaller + pip install --only-binary :all: . pyinstaller - name: Install Inno Setup uses: crazy-max/ghaction-chocolatey@v3 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dbe1019b..8830c6e0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,18 +24,14 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 + pip install ruff - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Run Ruff + run: ruff check --output-format=github . 
- name: Build and install ms2rescore package run: | - pip install .[dev] + pip install --only-binary :all: .[dev] - name: Test with pytest run: | diff --git a/Dockerfile b/Dockerfile index 474f6f5d..7ac34e15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,10 @@ -FROM ubuntu:focal +FROM python:3.11 + +# ARG DEBIAN_FRONTEND=noninteractive LABEL name="ms2rescore" -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore +# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore ADD pyproject.toml /ms2rescore/pyproject.toml ADD LICENSE /ms2rescore/LICENSE @@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in ADD ms2rescore /ms2rescore/ms2rescore RUN apt-get update \ - && apt-get install --no-install-recommends -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \ - && rm -rf /var/lib/apt/lists/* \ - && pip3 install ms2rescore/ + && apt install -y procps \ + && pip install /ms2rescore --only-binary :all: ENTRYPOINT [""] diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index 2b523aa5..209bc009 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -10,6 +10,7 @@ - **`deeplc`**: Refer to *[#/definitions/deeplc](#definitions/deeplc)*. - **`maxquant`**: Refer to *[#/definitions/maxquant](#definitions/maxquant)*. - **`ionmob`**: Refer to *[#/definitions/ionmob](#definitions/ionmob)*. + - **`im2deep`**: Refer to *[#/definitions/im2deep](#definitions/im2deep)*. - **`rescoring_engine`** *(object)*: Rescoring engine to use and its configuration. Leave empty to skip rescoring and write features to file. Default: `{"mokapot": {}}`. - **`.*`**: Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. - **`percolator`**: Refer to *[#/definitions/percolator](#definitions/percolator)*. @@ -47,7 +48,17 @@ - **One of** - *string* - *null* + - **`psm_id_rt_pattern`**: Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group. Default: `null`. 
+ - **One of** + - *string* + - *null* + - **`psm_id_im_pattern`**: Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group. Default: `null`. + - **One of** + - *string* + - *null* - **`lower_score_is_better`** *(boolean)*: Bool indicating if lower score is better. Default: `false`. + - **`max_psm_rank_input`** *(number)*: Maximum rank of PSMs to use as input for rescoring. Minimum: `1`. Default: `10`. + - **`max_psm_rank_output`** *(number)*: Maximum rank of PSMs to return after rescoring, before final FDR calculation. Minimum: `1`. Default: `1`. - **`modification_mapping`** *(object)*: Mapping of modification labels to each replacement label. Default: `{}`. - **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Can contain additional properties. Default: `{}`. - **`processes`** *(number)*: Number of parallel processes to use; -1 for all available. Minimum: `-1`. Default: `-1`. @@ -57,6 +68,7 @@ - *string* - *null* - **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`. + - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`. ## Definitions - **`feature_generator`** *(object)*: Feature generator configuration. Can contain additional properties. @@ -75,7 +87,10 @@ - **`ionmob_model`** *(string)*: Path to Ionmob model directory. Default: `"GRUPredictor"`. - **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`. - **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`. +- **`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*. + - **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`. 
- **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. + - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`. - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`. - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`. - **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`. diff --git a/docs/source/userguide/configuration.rst b/docs/source/userguide/configuration.rst index 8b68ea45..e22b3ec1 100644 --- a/docs/source/userguide/configuration.rst +++ b/docs/source/userguide/configuration.rst @@ -240,6 +240,65 @@ expression pattern that extracts the decoy status from the protein name: decoy_pattern = "DECOY_" +Multi-rank rescoring +==================== + +Some search engines, such as MaxQuant, report multiple candidate PSMs for the same spectrum. +MS²Rescore can rescore multiple candidate PSMs per spectrum. This allows for lower-ranking +candidate PSMs to become the top-ranked PSM after rescoring. This behavior can be controlled with +the ``max_psm_rank_input`` option. + +To ensure a correct FDR control after rescoring, MS²Rescore filters out lower-ranking PSMs before +final FDR calculation and writing the output files. To allow for lower-ranking PSMs to be included +in the final output - for instance, to consider chimeric spectra - the ``max_psm_rank_output`` +option can be used. + +For example, to rescore the top 5 PSMs per spectrum and output the best PSM after rescoring, +the following configuration can be used: + +.. tab:: JSON + + .. code-block:: json + + "max_psm_rank_input": 5 + "max_psm_rank_output": 1 + +.. tab:: TOML + + .. 
code-block:: toml + + max_psm_rank_input = 5 + max_psm_rank_output = 1 + + +Configuring rescoring engines +============================= + +MS²Rescore supports multiple rescoring engines, such as Mokapot and Percolator. The rescoring +engine can be selected and configured with the ``rescoring_engine`` option. For example, to use +Mokapot with a custom train_fdr of 0.1%, the following configuration can be used: + +.. tab:: JSON + + .. code-block:: json + + "rescoring_engine": { + "mokapot": { + "train_fdr": 0.001 + } + +.. tab:: TOML + + .. code-block:: toml + + [ms2rescore.rescoring_engine.mokapot] + train_fdr = 0.001 + + +All options for the rescoring engines can be found in the :ref:`ms2rescore.rescoring_engines` +section. + + All configuration options ========================= diff --git a/docs/source/userguide/input-files.rst b/docs/source/userguide/input-files.rst index 08129b7f..b556b520 100644 --- a/docs/source/userguide/input-files.rst +++ b/docs/source/userguide/input-files.rst @@ -5,16 +5,13 @@ Input files PSM file(s) =========== -The peptide-spectrum match (PSM) file is generally the output from a proteomics search engine. -This file serves as the main input to MS²Rescore. One or multiple PSM files can be provided at -once. Note that merging PSMs from different MS runs could have an impact on the correctness of -the FDR control. +The **peptide-spectrum match (PSM) file** is generally the output from a proteomics search engine. +This file serves as the main input to MS²Rescore. -Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option. -Check the list of :py:mod:`psm_utils` tags in the -:external+psm_utils:ref:`supported file formats ` section. Depending on the -file extension, the file type can also be inferred from the file name. In that case, -``psm_file_type`` option can be set to ``infer``. 
+The PSM file should contain **all putative identifications** made by the search engine, including +both target and decoy PSMs. Ensure that the search engine was configured to include decoy entries +in the search database and was operated with **target-decoy competition** enabled (i.e., +considering both target and decoy sequences simultaneously during the search). .. attention:: As a general rule, MS²Rescore always needs access to **all target and decoy PSMs, without any @@ -22,6 +19,17 @@ file extension, the file type can also be inferred from the file name. In that c set to 100%. +One or multiple PSM files can be provided at once. Note that merging PSMs from different MS runs +could have an impact on the correctness of the FDR control. Combining multiple PSM files should +generally only be done for LC-fractionated mass spectrometry runs. + +Various PSM file types are supported. The type can be specified with the ``psm_file_type`` option. +Check the list of :py:mod:`psm_utils` tags in the +:external+psm_utils:ref:`supported file formats ` section. Depending on the +file extension, the file type can also be inferred from the file name. In that case, +``psm_file_type`` option can be set to ``infer``. 
+ + Spectrum file(s) ================ diff --git a/docs/source/userguide/output-files.rst b/docs/source/userguide/output-files.rst index ee1a78f8..89c0363b 100644 --- a/docs/source/userguide/output-files.rst +++ b/docs/source/userguide/output-files.rst @@ -52,8 +52,8 @@ Rescoring engine files: | ``..weights.txt`` | Feature weights, showing feature usage in the rescoring run | +-------------------------------------------------------------+-------------------------------------------------------------+ -If no rescoring engine is selected (or if Percolator was selected), the following files will also -be written: +If no rescoring engine is selected, if Percolator was selected, or in DEBUG mode, the following +files will also be written: +-------------------------------------------------------------+-----------------------------------------------------------+ | File | Description | diff --git a/docs/source/userguide/tims2Rescore.rst b/docs/source/userguide/tims2Rescore.rst new file mode 100644 index 00000000..f9b9096f --- /dev/null +++ b/docs/source/userguide/tims2Rescore.rst @@ -0,0 +1,61 @@ +.. _timsrescore: + +TIMS²Rescore User Guide +======================= + +Introduction +------------ + +The `TIMS²Rescore` tool is a DDA-PASEF adapted version of `ms2rescore` that allows users to perform rescoring of peptide-spectrum matches (PSMs) acquired on Bruker instruments. This guide provides an overview of how to use `timsrescore` in `ms2rescore` effectively. + +Installation +------------ + +Before using `timsrescore`, ensure that you have `ms2rescore` installed on your system. You can install `ms2rescore` using the following command: + +.. code-block:: bash + + pip install ms2rescore + +Usage +----- + +To use `timsrescore`, follow these steps: + +1. 
Prepare your input files: + - Ensure that you have the necessary input files, including the PSM file and spectrum files + - Make sure that the PSM file format is from a supported search engine or a standard format like .mzid(:external+psm_utils:ref:`supported file formats `). + - Spectrum files can directly be given as .d or minitdf files from Bruker instruments or first converted to .mzML format. + +2. Run `timsrescore`: + - Open a terminal or command prompt. + - Navigate to the directory where your input files are located. + - Execute the following command: + + .. code-block:: bash + + timsrescore -p <psm_file> -s <spectrum_file> -o <output_prefix> + + Replace `<psm_file>`, `<spectrum_file>`, and `<output_prefix>` with the actual paths to your input and output files. + _NOTE_ By default timsTOF specific models will be used for predictions. Optionally you can further configure settings through a configuration file. For more information on configuring `timsrescore`, refer to the :doc:`configuration` tab in the user guide. + +3. Review the results: + - Once the `timsrescore` process completes, you will find the rescoring results in the specified output file or, if not specified, in the same directory as the input files + - If you want a detailed overview of the performance, you can either set `write_report` to `True` in the configuration file, use the `--write-report` option on the command line, or run the following command: + + .. code-block:: bash + + ms2rescore-report <output_prefix> + + Replace `<output_prefix>` with the actual output prefix of the result files. + +Additional Options +------------------ + +`ms2rescore` provides additional options to customize the `timsrescore` process. You can explore these options by running the following command: + +..
code-block:: bash + + timsrescore --help + + diff --git a/ms2rescore/__init__.py b/ms2rescore/__init__.py index 16f7dfd4..606a1c8a 100644 --- a/ms2rescore/__init__.py +++ b/ms2rescore/__init__.py @@ -1,6 +1,6 @@ """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs.""" -__version__ = "3.0.3" +__version__ = "3.1.0-dev9" from warnings import filterwarnings diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py index 4cac9122..70e384c2 100644 --- a/ms2rescore/__main__.py +++ b/ms2rescore/__main__.py @@ -1,6 +1,9 @@ """MS²Rescore: Sensitive PSM rescoring with predicted MS² peak intensities and RTs.""" import argparse +import cProfile +import importlib.resources +import json import logging import sys from pathlib import Path @@ -10,7 +13,7 @@ from rich.logging import RichHandler from rich.text import Text -from ms2rescore import __version__ +from ms2rescore import __version__, package_data from ms2rescore.config_parser import parse_configurations from ms2rescore.core import rescore from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -33,19 +36,26 @@ CONSOLE = Console(record=True) -def _print_credits(): +def _print_credits(tims=False): """Print software credits to terminal.""" text = Text() text.append("\n") - text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore") + if tims: + text.append("TIMS²Rescore", style="bold link https://github.com/compomics/ms2rescore") + else: + text.append("MS²Rescore", style="bold link https://github.com/compomics/ms2rescore") text.append(f" (v{__version__})\n", style="bold") + if tims: + text.append("MS²Rescore tuned for Bruker timsTOF instruments.\n", style="italic") text.append("Developed at CompOmics, VIB / Ghent University, Belgium.\n") text.append("Please cite: ") text.append( - "Declercq et al. MCP (2022)", style="link https://doi.org/10.1016/j.mcpro.2022.100266" + "Buur & Declercq et al. 
JPR (2024)", + style="link https://doi.org/10.1021/acs.jproteome.3c00785", ) text.append("\n") - text.stylize("cyan") + if tims: + text.stylize("#006cb5") CONSOLE.print(text) @@ -130,6 +140,21 @@ def _argument_parser() -> argparse.ArgumentParser: dest="fasta_file", help="path to FASTA file", ) + parser.add_argument( + "--write-report", + # metavar="BOOL", + action="store_true", + dest="write_report", + help="boolean to enable profiling with cProfile", + ) + parser.add_argument( + "--profile", + # metavar="BOOL", + action="store_true", + # type=bool, + # dest="profile", + help="boolean to enable profiling with cProfile", + ) return parser @@ -152,18 +177,42 @@ def _setup_logging(passed_level: str, log_file: Union[str, Path]): ) -def main(): +def profile(fnc, filepath): + """A decorator that uses cProfile to profile a function""" + + def inner(*args, **kwargs): + with cProfile.Profile() as profiler: + return_value = fnc(*args, **kwargs) + profiler.dump_stats(filepath + ".profile.prof") + return return_value + + return inner + + +def main_tims(): + """Run MS²Rescore command-line interface in TIMS²Rescore mode.""" + main(tims=True) + + +def main(tims=False): """Run MS²Rescore command-line interface.""" - _print_credits() + _print_credits(tims) # Parse CLI arguments and configuration file parser = _argument_parser() cli_args = parser.parse_args() + + configurations = [] + if cli_args.config_file: + configurations.append(cli_args.config_file) + if tims: + configurations.append( + json.load(importlib.resources.open_text(package_data, "config_default_tims.json")) + ) + configurations.append(cli_args) + try: - if cli_args.config_file: - config = parse_configurations([cli_args.config_file, cli_args]) - else: - config = parse_configurations(cli_args) + config = parse_configurations(configurations) except MS2RescoreConfigurationError as e: LOGGER.critical(e) sys.exit(1) @@ -175,7 +224,11 @@ def main(): # Run MS²Rescore try: - rescore(configuration=config) + if 
cli_args.profile: + profiled_rescore = profile(rescore, config["ms2rescore"]["output_path"]) + profiled_rescore(configuration=config) + else: + rescore(configuration=config) except Exception as e: LOGGER.exception(e) sys.exit(1) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index d1dfe405..170f1038 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -3,15 +3,18 @@ from multiprocessing import cpu_count from typing import Dict, Optional +import numpy as np import psm_utils.io +from mokapot.dataset import LinearPsmDataset from psm_utils import PSMList +from ms2rescore import exceptions from ms2rescore.feature_generators import FEATURE_GENERATORS from ms2rescore.parse_psms import parse_psms from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore import exceptions +from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence logger = logging.getLogger(__name__) @@ -44,7 +47,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: psm_list = parse_psms(config, psm_list) # Log #PSMs identified before rescoring - id_psms_before = _log_id_psms_before(psm_list) + id_psms_before = _log_id_psms_before(psm_list, max_rank=config["max_psm_rank_output"]) # Define feature names; get existing feature names from PSM file feature_names = dict() @@ -58,12 +61,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # TODO: avoid hard coding feature generators in some way - rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"] - im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] - if rt_required or im_required: - logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - 
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required) + # Add missing precursor info from spectrum file if needed + psm_list = _fill_missing_precursor_info(psm_list, config) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -107,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logging.debug(f"Creating USIs for {len(psm_list)} PSMs") psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list] - # If no rescoring engine is specified, write PSMs and features to PIN file - if not config["rescoring_engine"]: + # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file + if not config["rescoring_engine"] or config["log_level"] == "debug": logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin") psm_utils.io.write_file( psm_list, @@ -116,35 +115,49 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: filetype="percolator", feature_names=all_feature_names, ) + + if not config["rescoring_engine"]: + logger.info("No rescoring engine specified. 
Skipping rescoring.") return None # Rescore PSMs - if "percolator" in config["rescoring_engine"]: - percolator.rescore( - psm_list, - output_file_root=output_file_root, - log_level=config["log_level"], - processes=config["processes"], - percolator_kwargs=config["rescoring_engine"]["percolator"], - ) - elif "mokapot" in config["rescoring_engine"]: - if "fasta_file" not in config["rescoring_engine"]["mokapot"]: - config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] - if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: - protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") - else: - protein_kwargs = dict() - - mokapot.rescore( - psm_list, - output_file_root=output_file_root, - protein_kwargs=protein_kwargs, - **config["rescoring_engine"]["mokapot"], - ) - else: - logger.info("No known rescoring engine specified. Skipping rescoring.") + try: + if "percolator" in config["rescoring_engine"]: + percolator.rescore( + psm_list, + output_file_root=output_file_root, + log_level=config["log_level"], + processes=config["processes"], + percolator_kwargs=config["rescoring_engine"]["percolator"], + ) + elif "mokapot" in config["rescoring_engine"]: + if "fasta_file" not in config["rescoring_engine"]["mokapot"]: + config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] + if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: + protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") + else: + protein_kwargs = dict() + + mokapot.rescore( + psm_list, + output_file_root=output_file_root, + protein_kwargs=protein_kwargs, + **config["rescoring_engine"]["mokapot"], + ) + except exceptions.RescoringError as e: + # Write output + logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...") + psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") + + # Reraise exception + raise e - _log_id_psms_after(psm_list, id_psms_before) + # Post-rescoring 
processing + if all(psm_list["pep"] == 1.0): + psm_list = _fix_constant_pep(psm_list) + psm_list = _filter_by_rank(psm_list, config["max_psm_rank_output"], False) + psm_list = _calculate_confidence(psm_list) + _ = _log_id_psms_after(psm_list, id_psms_before, max_rank=config["max_psm_rank_output"]) # Write output logger.info(f"Writing output to {output_file_root}.psms.tsv...") @@ -160,6 +173,59 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.exception(e) +def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList: + """Fill missing precursor info from spectrum file if needed.""" + # Check if required + # TODO: avoid hard coding feature generators in some way + rt_required = ("deeplc" in config["feature_generators"]) and any( + v is None or v == 0 or np.isnan(v) for v in psm_list["retention_time"] + ) + im_required = ( + "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] + ) and any(v is None or v == 0 or np.isnan(v) for v in psm_list["ion_mobility"]) + logger.debug(f"RT required: {rt_required}, IM required: {im_required}") + + # Add missing values + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required) + + # Check if values are now present + for value_name, required in [("retention_time", rt_required), ("ion_mobility", im_required)]: + if required and ( + 0.0 in psm_list[value_name] + or None in psm_list[value_name] + or np.isnan(psm_list[value_name]).any() + ): + if all(v is None or v == 0.0 or np.isnan(v) for v in psm_list[value_name]): + raise exceptions.MissingValuesError( + f"Could not find any '{value_name}' values in PSM or spectrum files. Disable " + f"feature generators that require '{value_name}' or ensure that the values are " + "present in the input files." 
+ ) + else: + missing_value_psms = psm_list[ + [v is None or np.isnan(v) for v in psm_list[value_name]] + ] + logger.warning( + f"Found {len(missing_value_psms)} PSMs with missing '{value_name}' values. " + "These PSMs will be removed." + ) + psm_list = psm_list[ + [v is not None and not np.isnan(v) for v in psm_list[value_name]] + ] + + return psm_list + + +def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList: + """Filter PSMs by rank.""" + psm_list.set_ranks(lower_score_better=lower_score_better) + rank_filter = psm_list["rank"] <= max_rank + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {max_rank}.") + return psm_list[rank_filter] + + def _write_feature_names(feature_names, output_file_root): """Write feature names to file.""" with open(output_file_root + ".feature_names.tsv", "w") as f: @@ -169,25 +235,79 @@ def _write_feature_names(feature_names, output_file_root): f.write(f"{fgen}\t{feature}\n") -def _log_id_psms_before(psm_list): +def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) ).sum() - logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before) + logger.info( + f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " + "rescoring." 
+ ) return id_psms_before -def _log_id_psms_after(psm_list, id_psms_before): +def _log_id_psms_after( + psm_list: PSMList, id_psms_before: int, fdr: float = 0.01, max_rank: int = 1 +) -> int: """Log #PSMs identified after rescoring.""" id_psms_after = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) ).sum() diff = id_psms_after - id_psms_before diff_perc = diff / id_psms_before if id_psms_before > 0 else None diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff) diff_word = "more" if diff > 0 else "less" - logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.") + logger.info( + f"Identified {diff_numbers} {diff_word} PSMs with rank <= {max_rank} at {fdr} FDR after " + "rescoring." + ) return id_psms_after + + +def _fix_constant_pep(psm_list: PSMList) -> PSMList: + """Workaround for broken PEP calculation if best PSM is decoy.""" + logger.warning( + "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the " + "best target PSM." + ) + max_target_score = psm_list["score"][~psm_list["is_decoy"]].max() + higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score) + + if not higher_scoring_decoys.any(): + logger.warning("No decoys scoring higher than the best target found. Skipping fix.") + else: + psm_list = psm_list[~higher_scoring_decoys] + logger.warning(f"Removed {higher_scoring_decoys.sum()} decoy PSMs.") + + return psm_list + + +def _calculate_confidence(psm_list: PSMList) -> PSMList: + """ + Calculate scores, q-values, and PEPs for PSMs and peptides and add them to PSMList. 
+ """ + # Minimal conversion to LinearPsmDataset + psm_df = psm_list.to_dataframe() + psm_df = psm_df.reset_index(drop=True).reset_index() + psm_df["peptide"] = ( + psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) + ) + psm_df["is_target"] = ~psm_df["is_decoy"] + lin_psm_data = LinearPsmDataset( + psms=psm_df[["index", "peptide", "is_target"]], + target_column="is_target", + spectrum_columns="index", # Use artificial index to allow multi-rank rescoring + peptide_column="peptide", + ) + + # Recalculate confidence + new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"]) + + # Add new confidence estimations to PSMList + add_psm_confidence(psm_list, new_confidence) + add_peptide_confidence(psm_list, new_confidence) + + return psm_list diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index f2b06323..ba5492a2 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -25,7 +25,19 @@ class ModificationParsingError(IDFileParsingError): pass +class MissingValuesError(MS2RescoreError): + """Missing values in PSMs and/or spectra.""" + + pass + + class ReportGenerationError(MS2RescoreError): """Error while generating report.""" pass + + +class RescoringError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 9424448a..52440a5f 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -7,6 +7,7 @@ from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator +from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -14,4 +15,5 @@ "deeplc": DeepLCFeatureGenerator, "maxquant": MaxQuantFeatureGenerator, "ionmob": 
IonMobFeatureGenerator, + "im2deep": IM2DeepFeatureGenerator, } diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 50b577ff..30bea716 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -21,12 +21,10 @@ from collections import defaultdict from inspect import getfullargspec from itertools import chain -from typing import List, Optional, Union +from typing import List, Union import numpy as np -import pandas as pd from psm_utils import PSMList -from psm_utils.io import peptide_record from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -41,8 +39,7 @@ def __init__( self, *args, lower_score_is_better: bool = False, - calibration_set_size: Union[int, float] = 0.15, - spectrum_path: Optional[str] = None, + calibration_set_size: Union[int, float, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -59,9 +56,6 @@ def __init__( calibration_set_size: int or float Amount of best PSMs to use for DeepLC calibration. If this value is lower than the number of available PSMs, all PSMs will be used. (default: 0.15) - spectrum_path - Path to spectrum file or directory with spectrum files. If None, inferred from `run` - field in PSMs. Defaults to None. processes: {int, None} Number of processes to use in DeepLC. Defaults to 1. 
kwargs: dict @@ -77,7 +71,6 @@ def __init__( self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size - self.spectrum_path = spectrum_path self.processes = processes self.deeplc_kwargs = kwargs or {} @@ -151,17 +144,15 @@ def add_features(self, psm_list: PSMList) -> None: # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - logger.debug("Calibrating DeepLC...") psm_list_calibration = self._get_calibration_psms(psm_list_run) + logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, verbose=self._verbose, path_model=self.selected_model or self.user_model, **self.deeplc_kwargs, ) - self.deeplc_predictor.calibrate_preds( - seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration) - ) + self.deeplc_predictor.calibrate_preds(psm_list_calibration) # Still calibrate for each run, but do not try out all model options. 
# Just use model that was selected based on first run if not self.selected_model: @@ -174,11 +165,7 @@ def add_features(self, psm_list: PSMList) -> None: ) logger.debug("Predicting retention times...") - predictions = np.array( - self.deeplc_predictor.make_preds( - seq_df=self._psm_list_to_deeplc_peprec(psm_list_run) - ) - ) + predictions = np.array(self.deeplc_predictor.make_preds(psm_list_run)) observations = psm_list_run["retention_time"] rt_diffs_run = np.abs(predictions - observations) @@ -204,25 +191,25 @@ def add_features(self, psm_list: PSMList) -> None: ) current_run += 1 - # TODO: Remove when DeepLC supports PSMList directly - @staticmethod - def _psm_list_to_deeplc_peprec(psm_list: PSMList) -> pd.DataFrame: - peprec = peptide_record.to_dataframe(psm_list) - peprec = peprec.rename( - columns={ - "observed_retention_time": "tr", - "peptide": "seq", - } - )[["tr", "seq", "modifications"]] - return peprec - def _get_calibration_psms(self, psm_list: PSMList): """Get N best scoring target PSMs for calibration.""" psm_list_targets = psm_list[~psm_list["is_decoy"]] - n_psms = self._get_number_of_calibration_psms(psm_list_targets) - indices = np.argsort(psm_list_targets["score"]) - indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:] - return psm_list_targets[indices] + if self.calibration_set_size: + n_psms = self._get_number_of_calibration_psms(psm_list_targets) + indices = np.argsort(psm_list_targets["score"]) + indices = indices[:n_psms] if self.lower_psm_score_better else indices[-n_psms:] + return psm_list_targets[indices] + else: + identified_psms = psm_list_targets[psm_list_targets["qvalue"] <= 0.01] + if len(identified_psms) == 0: + raise ValueError( + "No target PSMs with q-value <= 0.01 found. Please set calibration set size for calibrating deeplc." + ) + elif (len(identified_psms) < 500) & (self.deeplc_kwargs["deeplc_retrain"]): + logger.warning( + " Less than 500 target PSMs with q-value <= 0.01 found for retraining. 
Consider turning of deeplc_retrain, as this is likely not enough data for retraining." + ) + return identified_psms def _get_number_of_calibration_psms(self, psm_list): """Get number of calibration PSMs given `calibration_set_size` and total number of PSMs.""" diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py new file mode 100644 index 00000000..552b7280 --- /dev/null +++ b/ms2rescore/feature_generators/im2deep.py @@ -0,0 +1,169 @@ +""" +IM2Deep ion mobility-based feature generator. + +IM2Deep is a fully modification-aware peptide ion mobility predictor. It uses a deep convolutional +neural network to predict retention times based on the atomic composition of the (modified) amino +acid residues in the peptide. See +`github.com/compomics/IM2Deep `_ for more information. + +""" + +import contextlib +import logging +import os +from inspect import getfullargspec +from itertools import chain +from typing import List + +import numpy as np +import pandas as pd +from im2deep.calibrate import im2ccs +from im2deep.im2deep import predict_ccs +from psm_utils import PSMList + +from ms2rescore.feature_generators.base import FeatureGeneratorBase + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +logger = logging.getLogger(__name__) + + +class IM2DeepFeatureGenerator(FeatureGeneratorBase): + """IM2Deep collision cross section feature generator.""" + + def __init__( + self, + *args, + processes: int = 1, + **kwargs, + ): + """ + Initialize the IM2DeepFeatureGenerator. + + Parameters + ---------- + processes : int, optional + Number of parallel processes to use for IM2Deep predictions. Default is 1. + **kwargs : dict, optional + Additional keyword arguments to `im2deep.predict_ccs`. 
+ + """ + super().__init__(*args, **kwargs) + + self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + + # Remove any kwargs that are not IM2Deep arguments + self.im2deep_kwargs = kwargs or {} + self.im2deep_kwargs = { + k: v for k, v in self.im2deep_kwargs.items() if k in getfullargspec(predict_ccs).args + } + self.im2deep_kwargs["n_jobs"] = processes + + @property + def feature_names(self) -> List[str]: + return [ + "ccs_observed_im2deep", + "ccs_predicted_im2deep", + "ccs_error_im2deep", + "abs_ccs_error_im2deep", + "perc_ccs_error_im2deep", + ] + + def add_features(self, psm_list: PSMList) -> None: + """Add IM2Deep-derived features to PSMs""" + logger.info("Adding IM2Deep-derived features to PSMs") + + # Get easy-access nested version of PSMlist + psm_dict = psm_list.get_psm_dict() + + # Run IM2Deep for each spectrum file + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + + for runs in psm_dict.values(): + # Reset IM2Deep predictor for each collection of runs + for run, psms in runs.items(): + logger.info( + f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..." 
+ ) + + # Disable wild logging to stdout by TensorFlow, unless in debug mode + with ( + contextlib.redirect_stdout(open(os.devnull, "w")) + if not self._verbose + else contextlib.nullcontext() + ): + # Make new PSM list for this run (chain PSMs per spectrum to flat list) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + + logger.debug("Calibrating IM2Deep...") + + # Convert ion mobility to CCS and calibrate CCS values + psm_list_run_df = psm_list_run.to_dataframe() + psm_list_run_df["charge"] = [ + pep.precursor_charge for pep in psm_list_run_df["peptidoform"] + ] + psm_list_run_df["ccs_observed"] = im2ccs( + psm_list_run_df["ion_mobility"], + psm_list_run_df["precursor_mz"], + psm_list_run_df["charge"], + ) + + # Create dataframe with high confidence hits for calibration + cal_psm_df = self.make_calibration_df(psm_list_run_df) + + # Make predictions with IM2Deep + logger.debug("Predicting CCS values...") + predictions = predict_ccs( + psm_list_run, cal_psm_df, write_output=False, **self.im2deep_kwargs + ) + + # Add features to PSMs + logger.debug("Adding features to PSMs...") + observations = psm_list_run_df["ccs_observed"] + ccs_diffs_run = np.abs(predictions - observations) + for i, psm in enumerate(psm_list_run): + psm["rescoring_features"].update( + { + "ccs_observed_im2deep": observations[i], + "ccs_predicted_im2deep": predictions[i], + "ccs_error_im2deep": ccs_diffs_run[i], + "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]), + "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i]) + / observations[i] + * 100, + } + ) + + current_run += 1 + + @staticmethod + def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> pd.DataFrame: + """ + Make dataframe for calibration of IM2Deep predictions. + + Parameters + ---------- + psm_list_df + DataFrame with PSMs. + threshold + Percentage of highest scoring identified target PSMs to use for calibration, + default 0.95. 
+ + Returns + ------- + pd.DataFrame + DataFrame with high confidence hits for calibration. + + """ + identified_psms = psm_list_df[ + (psm_list_df["qvalue"] < 0.01) + & (~psm_list_df["is_decoy"]) + & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep + ] + calibration_psms = identified_psms[ + identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) + ] + logger.debug( + f"Number of high confidence hits for calculating shift: {len(calibration_psms)}" + ) + return calibration_psms diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 63ed3b87..7fa0c0a1 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -165,6 +165,7 @@ def add_features(self, psm_list: PSMList) -> None: ) ] + # TODO: Use observed m/z? psm_list_run_df["mz"] = psm_list_run_df.apply( lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1 ) # use precursor m/z from PSMs? @@ -175,9 +176,8 @@ def add_features(self, psm_list: PSMList) -> None: ) # calibrate CCS values shift_factor = self.calculate_ccs_shift(psm_list_run_df) - psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( - lambda x: x["ccs_observed"] + shift_factor, axis=1 - ) + psm_list_run_df["ccs_observed"] + shift_factor + # predict CCS values tf_ds = to_tf_dataset_inference( psm_list_run_df["mz"], diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 243bd794..3882b3fc 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -193,7 +193,7 @@ def add_features(self, psm_list: PSMList) -> None: try: ms2pip_results = correlate( psms=psm_list_run, - spectrum_file=spectrum_filename, + spectrum_file=str(spectrum_filename), spectrum_id_pattern=self.spectrum_id_pattern, model=self.model, ms2_tolerance=self.ms2_tolerance, diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py index c39f2922..a077a458 
100644 --- a/ms2rescore/gui/app.py +++ b/ms2rescore/gui/app.py @@ -360,15 +360,20 @@ def __init__(self, *args, **kwargs): self.deeplc_config = DeepLCConfiguration(self) self.deeplc_config.grid(row=2, column=0, pady=(0, 20), sticky="nsew") + self.im2deep_config = Im2DeepConfiguration(self) + self.im2deep_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew") + self.ionmob_config = IonmobConfiguration(self) - self.ionmob_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew") + self.ionmob_config.grid(row=4, column=0, pady=(0, 20), sticky="nsew") def get(self) -> Dict: """Return the configuration as a dictionary.""" basic_enabled, basic_config = self.basic_config.get() ms2pip_enabled, ms2pip_config = self.ms2pip_config.get() deeplc_enabled, deeplc_config = self.deeplc_config.get() + im2deep_enabled, im2deep_config = self.im2deep_config.get() ionmob_enabled, ionmob_config = self.ionmob_config.get() + config = {} if basic_enabled: config["basic"] = basic_config @@ -523,6 +528,27 @@ def get(self) -> Dict: return enabled, config +class Im2DeepConfiguration(ctk.CTkFrame): + def __init__(self, *args, **kwargs): + """IM2Deep configuration frame.""" + super().__init__(*args, **kwargs) + + self.configure(fg_color="transparent") + self.grid_columnconfigure(0, weight=1) + + self.title = widgets.Heading(self, text="im2deep") + self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew") + + self.enabled = widgets.LabeledSwitch(self, label="Enable im2deep", default=False) + self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew") + + def get(self) -> Dict: + """Return the configuration as a dictionary.""" + enabled = self.enabled.get() + config = {} + return enabled, config + + class RescoringEngineConfig(ctk.CTkFrame): def __init__(self, *args, **kwargs): """Rescoring engine configuration frame.""" diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 71f704b5..733ea707 100644 --- 
a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -14,6 +14,7 @@ }, "rescoring_engine": { "mokapot": { + "train_fdr": 0.01, "write_weights": true, "write_txt": true, "write_flashlfq": true @@ -29,12 +30,16 @@ "id_decoy_pattern": null, "psm_id_pattern": null, "spectrum_id_pattern": null, + "psm_id_rt_pattern": null, + "psm_id_im_pattern": null, "lower_score_is_better": false, + "max_psm_rank_input": 10, + "max_psm_rank_output": 1, "modification_mapping": {}, "fixed_modifications": {}, "processes": -1, "rename_to_usi": false, "fasta_file": null, - "write_report": true + "write_report": false } } diff --git a/ms2rescore/package_data/config_default_tims.json b/ms2rescore/package_data/config_default_tims.json new file mode 100644 index 00000000..2a77adf1 --- /dev/null +++ b/ms2rescore/package_data/config_default_tims.json @@ -0,0 +1,25 @@ +{ + "$schema": "./config_schema.json", + "ms2rescore": { + "feature_generators": { + "basic": {}, + "ms2pip": { + "model": "timsTOF", + "ms2_tolerance": 0.02 + }, + "deeplc": { + "deeplc_retrain": false + }, + "im2deep": {}, + "maxquant": {} + }, + "rescoring_engine": { + "mokapot": { + "write_weights": true, + "write_txt": true, + "write_flashlfq": true + } + }, + "psm_file": null + } +} diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index a97c9a59..ab77f3b7 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -29,6 +29,9 @@ }, "ionmob": { "$ref": "#/definitions/ionmob" + }, + "im2deep": { + "$ref": "#/definitions/im2deep" } }, "default": { @@ -65,7 +68,11 @@ }, "psm_file": { "description": "Path to file with peptide-spectrum matches.", - "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }] + "oneOf": [ + { "type": "string" }, + { "type": "null" }, + { "type": "array", "items": { "type": "string" } } + ] }, "psm_file_type": { 
"description": "PSM file type. By default inferred from file extension.", @@ -107,11 +114,35 @@ "default": "(.*)", "format": "regex" }, + "psm_id_rt_pattern": { + "description": "Regex pattern to extract retention time from PSM identifier. Requires at least one capturing group.", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": null, + "format": "regex" + }, + "psm_id_im_pattern": { + "description": "Regex pattern to extract ion mobility from PSM identifier. Requires at least one capturing group.", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": null, + "format": "regex" + }, "lower_score_is_better": { "description": "Bool indicating if lower score is better", "type": "boolean", "default": false }, + "max_psm_rank_input": { + "description": "Maximum rank of PSMs to use as input for rescoring", + "type": "number", + "default": 10, + "minimum": 1 + }, + "max_psm_rank_output": { + "description": "Maximum rank of PSMs to return after rescoring, before final FDR calculation", + "type": "number", + "default": 1, + "minimum": 1 + }, "modification_mapping": { "description": "Mapping of modification labels to each replacement label.", "type": "object", @@ -142,6 +173,11 @@ "description": "Write an HTML report with various QC metrics and charts", "type": "boolean", "default": false + }, + "profile": { + "description": "Write a txt report using cProfile for profiling", + "type": "boolean", + "default": false } } } @@ -224,12 +260,32 @@ } } }, + "im2deep": { + "$ref": "#/definitions/feature_generator", + "description": "Ion mobility feature generator configuration using IM2Deep", + "type": "object", + "additionalProperties": true, + "properties": { + "reference_dataset": { + "description": "Path to IM2Deep reference dataset file", + "type": "string", + "default": "Meier_unimod.parquet" + } + } + }, "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. 
Additional properties are passed to the Mokapot brew function.", "type": "object", "additionalProperties": true, "properties": { + "train_fdr": { + "description": "FDR threshold for training Mokapot", + "type": "number", + "minimum": 0, + "maximum": 1, + "default": 0.01 + }, "write_weights": { "description": "Write Mokapot weights to a text file", "type": "boolean", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index a9855fda..12edd0e9 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,7 +1,8 @@ import logging import re -from typing import Dict, Union +from typing import Dict, Optional, Union +import numpy as np import psm_utils.io from psm_utils import PSMList @@ -23,10 +24,30 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``. """ - # Read PSMs, find decoys, calculate q-values - psm_list = _read_psms(config, psm_list) - _find_decoys(config, psm_list) - _calculate_qvalues(config, psm_list) + # Read PSMs + try: + psm_list = _read_psms(config, psm_list) + except psm_utils.io.PSMUtilsIOException: + raise MS2RescoreConfigurationError( + "Error occurred while reading PSMs. Please check the 'psm_file' and " + "'psm_file_type' settings. See " + "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" + " for more information." 
+ ) + + # Filter by PSM rank + psm_list.set_ranks(config["lower_score_is_better"]) + rank_filter = psm_list["rank"] <= config["max_psm_rank_input"] + psm_list = psm_list[rank_filter] + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.") + + # Remove invalid AAs, find decoys, calculate q-values + psm_list = _remove_invalid_aa(psm_list) + _find_decoys(psm_list, config["id_decoy_pattern"]) + _calculate_qvalues(psm_list, config["lower_score_is_better"]) + if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]: + logger.debug("Parsing retention time and/or ion mobility from PSM identifier...") + _parse_values_from_spectrum_id(config, psm_list) # Store scoring values for comparison later for psm in psm_list: @@ -51,7 +72,8 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: non_mapped_modifications = modifications_found - set(config["modification_mapping"].keys()) if non_mapped_modifications: logger.warning( - f"Non-mapped modifications found: {non_mapped_modifications}\nThis can be ignored if Unimod modification label" + f"Non-mapped modifications found: {non_mapped_modifications}\n" + "This can be ignored if they are Unimod modification labels." 
) psm_list.rename_modifications(config["modification_mapping"]) psm_list.add_fixed_modifications(config["fixed_modifications"]) @@ -66,10 +88,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids - # TODO: Temporary fix until implemented in psm_utils - # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str) - psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]] - return psm_list @@ -77,49 +95,30 @@ def _read_psms(config, psm_list): if isinstance(psm_list, PSMList): return psm_list else: - logger.info("Reading PSMs from file...") - current_file = 1 total_files = len(config["psm_file"]) - valid_psms_list = [] - total_psms = 0 - valid_psms = 0 - for psm_file in config["psm_file"]: + psm_list = [] + for current_file, psm_file in enumerate(config["psm_file"]): logger.info( - f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..." + f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..." ) - try: - id_file_psm_list = psm_utils.io.read_file( + psm_list.extend( + psm_utils.io.read_file( psm_file, filetype=config["psm_file_type"], show_progressbar=True, **config["psm_reader_kwargs"], ) - except psm_utils.io.PSMUtilsIOException: - raise MS2RescoreConfigurationError( - "Error occurred while reading PSMs. Please check the 'psm_file' and " - "'psm_file_type' settings. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" - " for more information." - ) - - total_psms += len(id_file_psm_list.psm_list) - for psm in id_file_psm_list.psm_list: - if not _has_invalid_aminoacids(psm): - valid_psms_list.append(psm) - valid_psms += 1 - current_file += 1 - if total_psms - valid_psms > 0: - logger.warning( - f"{total_psms - valid_psms} PSMs with invalid amino acids were removed." 
) - return PSMList(psm_list=valid_psms_list) + logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.") + return PSMList(psm_list=psm_list) -def _find_decoys(config, psm_list): + +def _find_decoys(psm_list: PSMList, id_decoy_pattern: Optional[str] = None): """Find decoys in PSMs, log amount, and raise error if none found.""" logger.debug("Finding decoys...") - if config["id_decoy_pattern"]: - psm_list.find_decoys(config["id_decoy_pattern"]) + if id_decoy_pattern: + psm_list.find_decoys(id_decoy_pattern) n_psms = len(psm_list) percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100 @@ -134,12 +133,12 @@ def _find_decoys(config, psm_list): ) -def _calculate_qvalues(config, psm_list): +def _calculate_qvalues(psm_list: PSMList, lower_score_is_better: bool): """Calculate q-values for PSMs if not present.""" # Calculate q-values if not present if None in psm_list["qvalue"]: logger.debug("Recalculating q-values...") - psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"]) + psm_list.calculate_qvalues(reverse=not lower_score_is_better) def _match_psm_ids(old_id, regex_pattern): @@ -154,7 +153,45 @@ def _match_psm_ids(old_id, regex_pattern): ) -def _has_invalid_aminoacids(psm): - """Check if a PSM contains invalid amino acids.""" +def _parse_values_from_spectrum_id( + psm_list: PSMList, + psm_id_rt_pattern: Optional[str] = None, + psm_id_im_pattern: Optional[str] = None, +): + """Parse retention time and or ion mobility values from the spectrum_id.""" + for pattern, label, key in zip( + [psm_id_rt_pattern, psm_id_im_pattern], + ["retention time", "ion mobility"], + ["retention_time", "ion_mobility"], + ): + if pattern: + logger.debug( + f"Parsing {label} from spectrum_id with regex pattern " f"{psm_id_rt_pattern}" + ) + try: + pattern = re.compile(pattern) + psm_list[key] = [ + float(pattern.search(psm.spectrum_id).group(1)) for psm in psm_list + ] + except AttributeError: + raise MS2RescoreConfigurationError( + f"Could not parse {label} from 
spectrum_id with the " + f"{pattern} regex pattern. " + f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n. " + f"Please make sure the {label} key is present in the spectrum_id " + "and the value is in a capturing group or disable the relevant feature generator." + ) + + +def _remove_invalid_aa(psm_list: PSMList) -> PSMList: + """Remove PSMs with invalid amino acids.""" + invalid_psms = np.array( + [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list] + ) - return any(aa not in "ACDEFGHIKLMNPQRSTVWY" for aa in psm.peptidoform.sequence) + if any(invalid_psms): + logger.warning(f"Removed {sum(invalid_psms)} PSMs with invalid amino acids.") + return psm_list[~invalid_psms] + else: + logger.debug("No PSMs with invalid amino acids found.") + return psm_list diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 17b35d60..2b2c1b5f 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -6,7 +6,6 @@ from ms2rescore_rs import get_precursor_info from psm_utils import PSMList -from rich.progress import track from ms2rescore.exceptions import MS2RescoreError from ms2rescore.utils import infer_spectrum_path diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py index 48d543a7..669bcc62 100644 --- a/ms2rescore/report/charts.py +++ b/ms2rescore/report/charts.py @@ -198,6 +198,7 @@ def score_scatter_plot( after: mokapot.LinearConfidence, level: str = "psms", indexer: str = "index", + fdr_threshold: float = 0.01, ) -> go.Figure: """ Plot PSM scores before and after rescoring. 
@@ -241,16 +242,22 @@ def score_scatter_plot( ce_psms = pd.concat([ce_psms_targets, ce_psms_decoys], axis=0) # Get score thresholds - score_threshold_before = ( - ce_psms[ce_psms["mokapot q-value before"] <= 0.01] - .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] - .iloc[0] - ) - score_threshold_after = ( - ce_psms[ce_psms["mokapot q-value after"] <= 0.01] - .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] - .iloc[0] - ) + try: + score_threshold_before = ( + ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold] + .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] + .iloc[0] + ) + except IndexError: # No PSMs below threshold + score_threshold_before = None + try: + score_threshold_after = ( + ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold] + .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] + .iloc[0] + ) + except IndexError: # No PSMs below threshold + score_threshold_after = None # Plot fig = px.scatter( @@ -267,10 +274,12 @@ def score_scatter_plot( }, ) # draw FDR thresholds - fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) - fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) - fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) - fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) + if score_threshold_before: + fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) + fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) + if score_threshold_after: + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) return fig @@ -373,7 +382,7 @@ def identification_overlap( return figure levels = before.levels # ["psms", "peptides", "proteins"] if all available - indexers = ["index", "index", "mokapot protein group"] + indexers = ["index", 
"peptide", "mokapot protein group"] overlap_data = defaultdict(dict) for level, indexer in zip(levels, indexers): @@ -386,7 +395,7 @@ def identification_overlap( set_after = set(df_after[df_after["mokapot q-value"] <= 0.01][indexer]) overlap_data["removed"][level] = -len(set_before - set_after) - overlap_data["retained"][level] = len(set_before | set_after) + overlap_data["retained"][level] = len(set_after.intersection(set_before)) overlap_data["gained"][level] = len(set_after - set_before) colors = ["#953331", "#316395", "#319545"] diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py index cc4b8aa9..91444bfe 100644 --- a/ms2rescore/report/generate.py +++ b/ms2rescore/report/generate.py @@ -82,7 +82,7 @@ def generate_report( # Read config config = json.loads(files["configuration"].read_text()) - logger.info("Recalculating confidence estimates...") + logger.debug("Recalculating confidence estimates...") fasta_file = config["ms2rescore"]["fasta_file"] confidence_before, confidence_after = get_confidence_estimates(psm_list, fasta_file) @@ -139,19 +139,21 @@ def generate_report( def _collect_files(output_path_prefix, use_txt_log=False): """Collect all files generated by MS²Rescore.""" - logger.info("Collecting files...") + logger.debug("Collecting files...") files = { "PSMs": Path(output_path_prefix + ".psms.tsv").resolve(), "configuration": Path(output_path_prefix + ".full-config.json").resolve(), "feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(), "feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(), - "log": Path(output_path_prefix + ".log.txt").resolve() - if use_txt_log - else Path(output_path_prefix + ".log.html").resolve(), + "log": ( + Path(output_path_prefix + ".log.txt").resolve() + if use_txt_log + else Path(output_path_prefix + ".log.html").resolve() + ), } for file, path in files.items(): if Path(path).is_file(): - logger.info("✅ Found %s: '%s'", file, path.as_posix()) + 
logger.debug("✅ Found %s: '%s'", file, path.as_posix()) else: logger.warning("❌ %s: '%s'", file, path.as_posix()) files[file] = None @@ -183,7 +185,7 @@ def _get_stats_context(confidence_before, confidence_after): "item": level_name, "card_color": card_color, "number": after, - "diff": f"{after - before:+}", + "diff": f"({after - before:+})", "percentage": f"{increase:.1f}%", "is_increase": increase > 0, "bar_percentage": before / after * 100 if increase > 0 else after / before * 100, @@ -321,16 +323,12 @@ def _get_features_context( import deeplc.plot scatter_chart = deeplc.plot.scatter( - df=features[ - (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01) - ], # noqa: E712 + df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], predicted_column="predicted_retention_time_best", observed_column="observed_retention_time_best", ) baseline_chart = deeplc.plot.distribution_baseline( - df=features[ - (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01) - ], # noqa: E712 + df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], predicted_column="predicted_retention_time_best", observed_column="observed_retention_time_best", ) @@ -343,6 +341,26 @@ def _get_features_context( } ) + # IM2Deep specific charts + if "im2deep" in feature_names: + import deeplc.plot + + scatter_chart = deeplc.plot.scatter( + df=features[(~psm_list["is_decoy"]) & (psm_list["qvalue"] <= 0.01)], + predicted_column="ccs_predicted_im2deep", + observed_column="ccs_observed_im2deep", + xaxis_label="Observed CCS", + yaxis_label="Predicted CCS", + plot_title="Predicted vs. 
observed CCS", + ) + + context["charts"].append( + { + "title": TEXTS["charts"]["im2deep_performance"]["title"], + "description": TEXTS["charts"]["im2deep_performance"]["description"], + "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS), + } + ) return context diff --git a/ms2rescore/report/templates/texts.toml b/ms2rescore/report/templates/texts.toml index 2d0840cf..52c9a230 100644 --- a/ms2rescore/report/templates/texts.toml +++ b/ms2rescore/report/templates/texts.toml @@ -105,3 +105,9 @@ bottom chart shows the distribution of RMAE values of DeepLC predictions on 460 datasets. The red line indicates the RMAE value for all target PSMs that passed the 1% FDR threshold of the current dataset. A lower RMAE value indicates better performance. """ + +[charts.im2deep_performance] +title = "IM2Deep model performance" +description = """ +IM2Deep model performance can be visualized by plotting the predicted CCS against the observed CCS. +""" diff --git a/ms2rescore/report/utils.py b/ms2rescore/report/utils.py index 272bdb8b..069a641f 100644 --- a/ms2rescore/report/utils.py +++ b/ms2rescore/report/utils.py @@ -51,35 +51,25 @@ def get_confidence_estimates( "was generated by MS²Rescore. Could not generate report." 
) from e + score_after = psm_list["score"] peptide = ( pd.Series(psm_list["peptidoform"]).astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) ) - psms = pd.DataFrame( - { - "peptide": peptide, - "is_target": ~psm_list["is_decoy"], - "before": score_before, - "after": psm_list["score"], - } - ).reset_index() - + psms = pd.DataFrame({"peptide": peptide, "is_target": ~psm_list["is_decoy"]}).reset_index() + lin_psm_dataset = LinearPsmDataset( + psms=psms, + target_column="is_target", + spectrum_columns="index", + peptide_column="peptide", + ) if fasta_file: fasta = read_fasta(fasta_file) + lin_psm_dataset.add_proteins(fasta) confidence = dict() - for when in ["before", "after"]: - lin_psm_dataset = LinearPsmDataset( - psms=psms, - target_column="is_target", - spectrum_columns="index", - feature_columns=[when], - peptide_column="peptide", - ) - if fasta_file: - lin_psm_dataset.add_proteins(fasta) - + for when, scores in [("before", score_before), ("after", score_after)]: try: - confidence[when] = lin_psm_dataset.assign_confidence() + confidence[when] = lin_psm_dataset.assign_confidence(scores=scores) except RuntimeError: confidence[when] = None diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index cc7a336f..f02d877f 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -20,7 +20,8 @@ """ import logging -from typing import Any, List, Optional, Tuple, Dict +import re +from typing import Any, Dict, List, Optional, Tuple import mokapot import numpy as np @@ -28,15 +29,20 @@ import psm_utils from mokapot.brew import brew from mokapot.dataset import LinearPsmDataset +from mokapot.model import PercolatorModel from pyteomics.mass import nist_mass +from ms2rescore.exceptions import RescoringError + logger = logging.getLogger(__name__) +logging.getLogger("numba").setLevel(logging.WARNING) def rescore( psm_list: psm_utils.PSMList, output_file_root: str = "ms2rescore", fasta_file: 
Optional[str] = None, + train_fdr: float = 0.01, write_weights: bool = False, write_txt: bool = False, write_flashlfq: bool = False, @@ -63,6 +69,8 @@ def rescore( fasta_file Path to FASTA file with protein sequences to use for protein inference. Defaults to ``None``. + train_fdr + FDR to use for training the Mokapot model. Defaults to ``0.01``. write_weights Write model weights to a text file. Defaults to ``False``. write_txt @@ -89,27 +97,15 @@ def rescore( # Rescore logger.debug(f"Mokapot brew options: `{kwargs}`") - confidence_results, models = brew(lin_psm_data, **kwargs) - - # Reshape confidence estimates to match PSMList - mokapot_values_targets = ( - confidence_results.confidence_estimates["psms"] - .set_index("index") - .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]] - ) - mokapot_values_decoys = ( - confidence_results.decoy_confidence_estimates["psms"] - .set_index("index") - .sort_index()[["mokapot score", "mokapot q-value", "mokapot PEP"]] - ) - q = np.full((len(psm_list), 3), np.nan) - q[mokapot_values_targets.index] = mokapot_values_targets.values - q[mokapot_values_decoys.index] = mokapot_values_decoys.values + try: + confidence_results, models = brew( + lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs + ) + except RuntimeError as e: + raise RescoringError("Mokapot could not be run. 
Please check the input data.") from e - # Add Mokapot results to PSMList - psm_list["score"] = q[:, 0] - psm_list["qvalue"] = q[:, 1] - psm_list["pep"] = q[:, 2] + add_psm_confidence(psm_list, confidence_results) + add_peptide_confidence(psm_list, confidence_results) # Write results if write_weights: @@ -173,7 +169,7 @@ def convert_psm_list( # Ensure filename for FlashLFQ txt output if not combined_df["run"].notnull().all(): - combined_df["run"] = "ms_run" + combined_df["run"] = "na" feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None @@ -224,6 +220,58 @@ def save_model_weights( ) +def add_psm_confidence( + psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence +) -> None: + """Add PSM-level confidence estimates to PSM list, updating score, qvalue, pep, and rank.""" + # Reshape confidence estimates to match PSMList + keys = ["mokapot score", "mokapot q-value", "mokapot PEP"] + mokapot_values_targets = ( + confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys] + ) + mokapot_values_decoys = ( + confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys] + ) + q = np.full((len(psm_list), 3), np.nan) + q[mokapot_values_targets.index] = mokapot_values_targets.values + q[mokapot_values_decoys.index] = mokapot_values_decoys.values + + # Add Mokapot results to PSMList + psm_list["score"] = q[:, 0] + psm_list["qvalue"] = q[:, 1] + psm_list["pep"] = q[:, 2] + + # Reset ranks to match new scores + psm_list.set_ranks(lower_score_better=False) + + +def add_peptide_confidence( + psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence +) -> None: + """Add Mokapot peptide-level confidence estimates to PSM list.""" + keys = ["mokapot score", "mokapot q-value", "mokapot PEP"] + peptide_info = pd.concat( + [ + confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys], + 
confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys], + ], + axis=0, + ).to_dict(orient="index") + + # Add peptide-level scores to PSM metadata + # run_key = "na" if not all(psm.run for psm in psm_list) else None + no_charge_pattern = re.compile(r"(/\d+$)") + for psm in psm_list: + peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))] + psm.metadata.update( + { + "peptide_score": peptide_scores["mokapot score"], + "peptide_qvalue": peptide_scores["mokapot q-value"], + "peptide_pep": peptide_scores["mokapot PEP"], + } + ) + + def _mz_to_mass(mz: float, charge: int) -> float: """Convert m/z to mass.""" return mz * charge - charge * nist_mass["H"][1][0] diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index 5f7d4e5d..9952aed5 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -20,8 +20,8 @@ import logging import subprocess from typing import Any, Dict, Optional +from copy import deepcopy -import numpy as np import psm_utils from ms2rescore.exceptions import MS2RescoreError @@ -103,8 +103,15 @@ def rescore( # Need to be able to link back to original PSMs, so reindex spectrum IDs, but copy PSM list # to avoid modifying original... # TODO: Better approach for this? 
- psm_list_reindexed = psm_list.copy() - psm_list_reindexed["spectrum_id"] = np.arange(len(psm_list_reindexed)) + + psm_list_reindexed = deepcopy(psm_list) + psm_list_reindexed.set_ranks() + psm_list_reindexed["spectrum_id"] = [ + f"{psm.get_usi(as_url=False)}_{psm.rank}" for psm in psm_list_reindexed + ] + spectrum_id_index = { + spectrum_id: index for index, spectrum_id in enumerate(psm_list_reindexed["spectrum_id"]) + } _write_pin_file(psm_list_reindexed, pin_filepath) @@ -134,10 +141,13 @@ def rescore( psm_list, percolator_kwargs["results-psms"], percolator_kwargs["decoy-results-psms"], + spectrum_id_index, ) -def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str): +def _update_psm_scores( + psm_list: psm_utils.PSMList, target_pout: str, decoy_pout: str, spectrum_id_index: list +): """ Update PSM scores with Percolator results. @@ -150,7 +160,9 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout psm_list_percolator = psm_utils.PSMList(psm_list=target_psms.psm_list + decoy_psms.psm_list) # Sort by reindexed spectrum_id so order matches original PSM list - psm_list_percolator[np.argsort(psm_list_percolator["spectrum_id"])] + psm_list_percolator = sorted( + psm_list_percolator, key=lambda psm: spectrum_id_index[psm["spectrum_id"]] + ) if not len(psm_list) == len(psm_list_percolator): raise MS2RescoreError( @@ -163,6 +175,8 @@ def _update_psm_scores(psm_list: psm_utils.PSMList, target_pout: str, decoy_pout original_psm["qvalue"] = new_psm["qvalue"] original_psm["pep"] = new_psm["pep"] + psm_list.set_ranks(lower_score_better=False) + def _write_pin_file(psm_list: psm_utils.PSMList, filepath: str): """Write PIN file for rescoring.""" diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 70417b56..3515893a 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -35,44 +35,61 @@ def infer_spectrum_path( "and no run name in PSM file found." 
) - # If passed path is directory, join with run name - elif os.path.isdir(configured_path): - if run_name: - resolved_path = os.path.join(configured_path, run_name) + else: + is_bruker_dir = configured_path.endswith(".d") or _is_minitdf(configured_path) + + # If passed path is directory (that is not Bruker raw), join with run name + if os.path.isdir(configured_path) and not is_bruker_dir: + if run_name: + resolved_path = os.path.join(configured_path, run_name) + else: + raise MS2RescoreConfigurationError( + "Could not resolve spectrum file name: Spectrum path is directory " + "but no run name in PSM file found." + ) + + # If passed path is file, use that, but warn if basename doesn't match expected + elif os.path.isfile(configured_path) or (os.path.isdir(configured_path) and is_bruker_dir): + if run_name and Path(configured_path).stem != Path(run_name).stem: + logger.warning( + "Passed spectrum path (`%s`) does not match run name found in PSM " + "file (`%s`). Continuing with passed spectrum path.", + configured_path, + run_name, + ) + resolved_path = configured_path else: raise MS2RescoreConfigurationError( - "Could not resolve spectrum file name: Spectrum path is directory " - "but no run name in PSM file found." - ) - - # If passed path is file, use that, but warn if basename doesn't match expected - elif os.path.isfile(configured_path): - if run_name and Path(configured_path).stem != Path(run_name).stem: - logger.warning( - "Passed spectrum path (`%s`) does not match run name found in PSM " - "file (`%s`). Continuing with passed spectrum path.", - configured_path, - run_name, + "Configured `spectrum_path` must be `None` or a path to an existing file " + "or directory. If `None` or path to directory, spectrum run information " + "should be present in the PSM file." ) - resolved_path = configured_path - else: - raise MS2RescoreConfigurationError( - "Configured `spectrum_path` must be `None` or a path to an existing file " - "or directory. 
If `None` or path to directory, spectrum run information " - "should be present in the PSM file." - ) # Match with file extension if not in resolved_path yet - if not re.match(".mgf$|.mzml$", resolved_path, flags=re.IGNORECASE): + if not _is_minitdf(resolved_path) and not re.match( + r"\.mgf$|\.mzml$|\.d$", resolved_path, flags=re.IGNORECASE + ): for filename in glob(resolved_path + "*"): - if re.match(r".*(\.mgf$|\.mzml$)", filename, flags=re.IGNORECASE): + if re.match(r".*(\.mgf$|\.mzml$|\.d)", filename, flags=re.IGNORECASE): resolved_path = filename break else: raise MS2RescoreConfigurationError( - "Resolved spectrum filename does not contain a supported file " - "extension (mgf or mzml) and could not find any matching existing " + f"Resolved spectrum filename ('{resolved_path}') does not contain a supported " + "file extension (mzML, MGF, or .d) and could not find any matching existing " "files." ) return Path(resolved_path) + + +def _is_minitdf(spectrum_file: str) -> bool: + """ + Check if the spectrum file is a Bruker miniTDF folder. + + A Bruker miniTDF folder has no fixed name, but contains files matching the patterns + ``*ms2spectrum.bin`` and ``*ms2spectrum.parquet``. + """ + files = set(Path(spectrum_file).glob("*ms2spectrum.bin")) + files.update(Path(spectrum_file).glob("*ms2spectrum.parquet")) + return len(files) >= 2 diff --git a/pyproject.toml b/pyproject.toml index 18bbb8be..d25968bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,25 +32,24 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.8" dependencies = [ - "ms2rescore_rs", - "numpy>=1.16.0; python_version != '3.11'", - "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF... 
- "pandas>=1.0", - "rich>=12", - "pyteomics>=4.1.0", - "lxml>=4.5", - "ms2pip>=4.0.0-dev4", - "click>=7", "cascade-config>=0.4.0", + "click>=7", + "customtkinter>=5,<6", "deeplc>=2.2", "deeplcretrainer>=0.2", - "tomli>=2; python_version < '3.11'", - "psm_utils>=0.4", - "customtkinter>=5,<6", - "mokapot>=0.9", - "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils + "im2deep>=0.1.3", "jinja2>=3", + "lxml>=4.5", + "mokapot>=0.9", + "ms2pip>=4.0.0-dev10", + "ms2rescore_rs", + "numpy>=1.16.0", + "pandas>=1.0", "plotly>=5", + "psm_utils>=0.9", + "pyteomics>=4.7.2", + "rich>=12", + "tomli>=2; python_version < '3.11'", ] [project.optional-dependencies] @@ -79,6 +78,7 @@ CompOmics = "https://www.compomics.com" ms2rescore = "ms2rescore.__main__:main" ms2rescore-gui = "ms2rescore.gui.__main__:main" ms2rescore-report = "ms2rescore.report.__main__:main" +tims2rescore = "ms2rescore.__main__:main_tims" [build-system] requires = ["flit_core >=3.2,<4"] @@ -94,3 +94,6 @@ target-version = ['py38'] [tool.ruff] line-length = 99 target-version = 'py38' + +[tool.ruff.lint] +extend-select = ["T201", "T203"]