From 66fecc988bf51a2e84693b9ea25f626762f6da52 Mon Sep 17 00:00:00 2001 From: RalfG Date: Fri, 5 Apr 2024 16:36:19 +0200 Subject: [PATCH 1/4] Use rust for parsing spectrum files --- ms2rescore/core.py | 2 +- ms2rescore/parse_spectra.py | 139 ++++++++---------------------------- pyproject.toml | 5 +- tests/test_data/test.mgf | 13 ++++ tests/test_parse_spectra.py | 23 ++++++ 5 files changed, 68 insertions(+), 114 deletions(-) create mode 100644 tests/test_data/test.mgf create mode 100644 tests/test_parse_spectra.py diff --git a/ms2rescore/core.py b/ms2rescore/core.py index f777d620..725a1d58 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -63,7 +63,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] if rt_required or im_required: logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 9ed199b9..1e197289 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -3,11 +3,9 @@ import logging import re from itertools import chain -from typing import Dict, Tuple +from ms2rescore_rs import get_precursor_info from psm_utils import PSMList -from pyteomics.mgf import MGF -from pyteomics.mzml import MzML from rich.progress import track from ms2rescore.exceptions import MS2RescoreError @@ -16,7 +14,9 @@ logger = logging.getLogger(__name__) -def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): +def get_missing_values( + psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False +): """Get missing RT/IM features from spectrum file.""" logger.debug("Extracting missing RT/IM values from spectrum file(s).") @@ -25,113 +25,30 @@ def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): for run, psms in track(runs.items(), description="Extracting RT/IM values..."): psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_file = infer_spectrum_path(config["spectrum_path"], run) - - if spectrum_file.suffix.lower() == ".mzml": - rt_dict, im_dict = _parse_values_from_mzml( - spectrum_file, config, run, missing_rt, missing_im - ) - elif spectrum_file.suffix.lower() == ".mgf": - rt_dict, im_dict = _parse_values_from_mgf( - spectrum_file, config, run, missing_rt, missing_im - ) - - for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]): - if value_dict: - try: - psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run] - except KeyError: - raise ParsingError( - f"Could not parse {value} values from spectrum file for run {run}." - ) - - -def _parse_values_from_mgf( - spectrum_file, config, run, missing_rt, missing_im -) -> Tuple[Dict, Dict]: - """ - Parse retention time and/or ion mobility from an MGF file. - - Notes - ----- - - Extracting values (e.g., ion mobility) according to the Matrix documentation: - http://www.matrixscience.com/help/data_file_help.html - - """ - rt_dict = {} - im_dict = {} - - spectrum_id_pattern = re.compile( - config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" - ) - - for spectrum in MGF(str(spectrum_file)): - matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group() - if missing_rt: - try: - rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"]) - except KeyError: - raise ParsingError( - "Could not parse retention time (`rtinseconds`) from spectrum file for " - f"run {run}. Please make sure that the retention time key is present in the " - "spectrum file or disable the relevant feature generator." - ) - if missing_im: - try: - im_dict[matched_id] = float(spectrum["params"]["ion_mobility"]) - except KeyError: - raise ParsingError( - "Could not parse ion mobility (`ion_mobility`) from spectrum file " - f"for run {run}. Please make sure that the ion mobility key is present in the " - "spectrum file or disable the relevant feature generator." - ) - - return rt_dict, im_dict - - -def _parse_values_from_mzml( - spectrum_file, config, run, missing_rt, missing_im -) -> Tuple[Dict, Dict]: - """Parse retention time and/or ion mobility from an mzML file.""" - rt_dict = {} - im_dict = {} - - spectrum_id_pattern = re.compile( - config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" - ) - - for spectrum in MzML(str(spectrum_file)): - matched_id = spectrum_id_pattern.match(spectrum["id"]).group() - if missing_rt: - try: - rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"]) - except KeyError: - raise ParsingError( - "Could not parse retention time (`scan start time`) from spectrum file for " - f"run {run}. Please make sure that the retention time key is present in the " - "spectrum file or disable the relevant feature generator." - ) - if missing_im: - try: - im_dict[matched_id] = float( - spectrum["scanList"]["scan"][0]["reverse ion mobility"] - ) - except KeyError: - raise ParsingError( - "Could not parse ion mobility (`reverse ion mobility`) from spectrum file " - f"for run {run}. Please make sure that the ion mobility key is present in the " - "spectrum file or disable the relevant feature generator." - ) - - return rt_dict, im_dict - - -class ParseMGFError(MS2RescoreError): - """Error parsing MGF file.""" - - pass - - -class ParsingError(MS2RescoreError): + precursors = get_precursor_info(str(spectrum_file)) + + if config["spectrum_id_pattern"]: + spectrum_id_pattern = re.compile(config["spectrum_id_pattern"]) + precursors = { + spectrum_id_pattern.search(spectrum_id).group(1): precursor + for spectrum_id, precursor in precursors.items() + } + + for psm in psm_list_run: + try: + if rt_required: + psm.retention_time = precursors[psm.spectrum_id].rt + if im_required: + psm.ion_mobility = precursors[psm.spectrum_id].im + if not psm.precursor_mz: + psm.precursor_mz = precursors[psm.spectrum_id].mz + except KeyError as e: + raise SpectrumParsingError( + f"Could not extract missing RT/IM values from spectrum file for run {run}." + ) from e + + +class SpectrumParsingError(MS2RescoreError): """Error parsing retention time from spectrum file.""" pass diff --git a/pyproject.toml b/pyproject.toml index 8f135149..18bbb8be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,8 +32,9 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.8" dependencies = [ + "ms2rescore_rs", "numpy>=1.16.0; python_version != '3.11'", - "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF... + "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF... "pandas>=1.0", "rich>=12", "pyteomics>=4.1.0", @@ -47,7 +48,7 @@ dependencies = [ "psm_utils>=0.4", "customtkinter>=5,<6", "mokapot>=0.9", - "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils + "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils "jinja2>=3", "plotly>=5", ] diff --git a/tests/test_data/test.mgf b/tests/test_data/test.mgf new file mode 100644 index 00000000..e4899c08 --- /dev/null +++ b/tests/test_data/test.mgf @@ -0,0 +1,13 @@ +BEGIN IONS +TITLE=peptide: peptide1 +CHARGE=2+ +PEPMASS=475.137295 +ION_MOBILITY=42.42 +RTINSECONDS=51.2 +72.04439 100 +148.06043 600 +232.07504 300 +263.08737 400 +347.10198 500 +423.11802 200 +END IONS diff --git a/tests/test_parse_spectra.py b/tests/test_parse_spectra.py new file mode 100644 index 00000000..4dffc9dc --- /dev/null +++ b/tests/test_parse_spectra.py @@ -0,0 +1,23 @@ +import pytest +from psm_utils import PSM, PSMList + +from ms2rescore.parse_spectra import get_missing_values + + +def test_get_missing_values(): + psm_list = PSMList( + psm_list=[ + PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"), + ] + ) + get_missing_values( + psm_list, + config={ + "spectrum_path": "tests/test_data/test.mgf", + "spectrum_id_pattern": "peptide: (.*)", + }, + rt_required=True, + im_required=True, + ) + assert psm_list[0].retention_time == pytest.approx(0.853, 0.001) + assert psm_list[0].ion_mobility == pytest.approx(42.42, 0.01) From c5677a3f013b332b22679d46aac258dc8ec898e6 Mon Sep 17 00:00:00 2001 From: RalfG Date: Sun, 7 Apr 2024 18:56:11 +0200 Subject: [PATCH 2/4] Use single quotes instead of backticks when logging strings --- ms2rescore/parse_psms.py | 15 +++++++-------- ms2rescore/parse_spectra.py | 5 ++--- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index b30539f3..a9855fda 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,6 +1,5 @@ import logging import re -from itertools import chain from typing import Dict, Union import psm_utils.io @@ -60,9 +59,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if config["psm_id_pattern"]: pattern = re.compile(config["psm_id_pattern"]) - logger.debug("Applying `psm_id_pattern`...") + logger.debug("Applying 'psm_id_pattern'...") logger.debug( - f"Parsing `{psm_list['spectrum_id'][0]}` to `{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`" + f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'" ) new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids @@ -86,7 +85,7 @@ def _read_psms(config, psm_list): valid_psms = 0 for psm_file in config["psm_file"]: logger.info( - f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..." + f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..." ) try: id_file_psm_list = psm_utils.io.read_file( @@ -97,8 +96,8 @@ def _read_psms(config, psm_list): ) except psm_utils.io.PSMUtilsIOException: raise MS2RescoreConfigurationError( - "Error occurred while reading PSMs. Please check the `psm_file` and " - "`psm_file_type` settings. See " + "Error occurred while reading PSMs. Please check the 'psm_file' and " + "'psm_file_type' settings. See " "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" " for more information." ) @@ -129,7 +128,7 @@ def _find_decoys(config, psm_list): if not any(psm_list["is_decoy"]): raise MS2RescoreConfigurationError( "No decoy PSMs found. Please check if decoys are present in the PSM file and that " - "the `id_decoy_pattern` option is correct. See " + "the 'id_decoy_pattern' option is correct. See " "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms" " for more information." ) @@ -150,7 +149,7 @@ def _match_psm_ids(old_id, regex_pattern): return match[1] except (TypeError, IndexError): raise MS2RescoreConfigurationError( - f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})." + f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})." " Ensure that the regex contains a capturing group?" ) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index 1e197289..17b35d60 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -18,13 +18,12 @@ def get_missing_values( psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False ): """Get missing RT/IM features from spectrum file.""" - logger.debug("Extracting missing RT/IM values from spectrum file(s).") - psm_dict = psm_list.get_psm_dict() for runs in psm_dict.values(): - for run, psms in track(runs.items(), description="Extracting RT/IM values..."): + for run, psms in runs.items(): psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_file = infer_spectrum_path(config["spectrum_path"], run) + logger.debug("Reading spectrum file: '%s'", spectrum_file) precursors = get_precursor_info(str(spectrum_file)) if config["spectrum_id_pattern"]: From ed6423b29ba81a3cbb6d43f60a2a8859acb64719 Mon Sep 17 00:00:00 2001 From: RalfG Date: Sun, 7 Apr 2024 18:56:47 +0200 Subject: [PATCH 3/4] mokapot: Fix writing of FlashLFQ output when PSMList run field is empty --- ms2rescore/rescoring_engines/mokapot.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index f3927f47..cc7a336f 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -171,6 +171,10 @@ def convert_psm_list( feature_df.columns = [f"feature:{f}" for f in feature_df.columns] combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1) + # Ensure filename for FlashLFQ txt output + if not combined_df["run"].notnull().all(): + combined_df["run"] = "ms_run" + feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None lin_psm_data = LinearPsmDataset( From dc866495e18a33ee0dfabf58e4785ed7d7d1ec25 Mon Sep 17 00:00:00 2001 From: RalfG Date: Sun, 7 Apr 2024 18:57:24 +0200 Subject: [PATCH 4/4] Reactivate tests in CI --- .github/workflows/test.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 339aade3..dbe1019b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,9 +37,10 @@ jobs: run: | pip install .[dev] - # - name: Test with pytest - # run: | - # pytest + - name: Test with pytest + run: | + pytest + - name: Test installation run: | ms2rescore --help