From fdceebac3d306a0630ba992343b3fb2c9f9a0286 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sat, 24 Feb 2024 15:48:00 +0100 Subject: [PATCH 01/23] initial commit --- ms2rescore/feature_generators/ms2.py | 161 +++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 ms2rescore/feature_generators/ms2.py diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py new file mode 100644 index 00000000..8e144574 --- /dev/null +++ b/ms2rescore/feature_generators/ms2.py @@ -0,0 +1,161 @@ +""" +MS2-based feature generator. + +""" + +import logging +import re +from typing import List, Optional, Union +from itertools import chain +from collections import defaultdict + + +import numpy as np +from psm_utils import PSMList +from pyteomics.mass import calculate_mass + +from ms2rescore.feature_generators.base import FeatureGeneratorBase +from ms2rescore.utils import infer_spectrum_path + +logger = logging.getLogger(__name__) + + +class MS2FeatureGenerator(FeatureGeneratorBase): + """DeepLC retention time-based feature generator.""" + + def __init__( + self, + *args, + spectrum_path: Optional[str] = None, + spectrum_id_pattern: str = "(.*)", + **kwargs, + ) -> None: + """ + Generate MS2-based features for rescoring. + + DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to + disable retraining. + + Parameters + ---------- + + Attributes + ---------- + feature_names: list[str] + Names of the features that will be added to the PSMs. + + """ + super().__init__(*args, **kwargs) + self.spectrum_path = spectrum_path + self.spectrum_id_pattern = spectrum_id_pattern + + @property + def feature_names(self) -> List[str]: + return [ + "ln_explained_intensity", + "ln_total_intensity", + "ln_explained_intensity_ratio", + "ln_explained_b_ion_ratio", + "ln_explained_y_ion_ratio", + "longest_b_ion_sequence", + "longest_y_ion_sequence", + "matched_b_ions", + "matched_b_ions_pct", + "matched_y_ions", + "matched_y_ions_pct", + "matched_ions_pct", + ] + + def add_features(self, psm_list: PSMList) -> None: + """Add DeepLC-derived features to PSMs.""" + + logger.info("Adding MS2-derived features to PSMs.") + psm_dict = psm_list.get_psm_dict() + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + + for runs in psm_dict.values(): + for run, psms in runs.items(): + logger.info( + f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..." + ) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + spectrum_filename = infer_spectrum_path(self.spectrum_path, run) + logger.debug(f"Using spectrum file `{spectrum_filename}`") + self._add_features_run(psm_list_run, spectrum_filename) + current_run += 1 + + def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None: + pass + + +def longest_sequence_of_trues(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + +def extract_spectrum_features(annotated_spectrum, peptidoform): + features = defaultdict(list) + b_ions_matched = [False] * (len(peptidoform.sequence)) + y_ions_matched = [False] * (len(peptidoform.sequence)) + + pseudo_count = 0.00001 + + for annotated_peak in annotated_spectrum.spectrum: + features["total_intensity"].append(annotated_peak.intensity) + + if annotated_peak.annotation: + features["matched_intensity"].append(annotated_peak.intensity) + for matched_ion in annotated_peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(annotated_peak.intensity) + y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(annotated_peak.intensity) + b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + + return { + "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), + "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity_ratio": np.log( + np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) + + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched), + "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + +# TODO: keep this here? +def modification_evidence(): + return + + +# TODO: move to basic feature generator +def extract_psm_features(psm): + return { + "peptide_len": len(psm.peptidoform.sequence), + "expmass": (psm.precursor_mz * psm.get_precursor_charge()) + - (psm.get_precursor_charge() * 1.007276), # TODO: replace by constant + "calcmass": calculate_mass(psm.peptidoform.composition), + } From 5374ed83aa56c075bcd2cf2da1004e049d13ee9d Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sun, 25 Feb 2024 16:44:54 +0100 Subject: [PATCH 02/23] finalize ms2 feature generation --- ms2rescore/core.py | 25 +-- ms2rescore/exceptions.py | 6 + ms2rescore/feature_generators/__init__.py | 2 + ms2rescore/feature_generators/ms2.py | 218 +++++++++++++++------- 4 files changed, 171 insertions(+), 80 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index e5ae26f1..f8ff38d7 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -54,18 +54,18 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # TODO: avoid hard coding feature generators in some way - rt_required = ("deeplc" in config["feature_generators"]) and ( - None in psm_list["retention_time"] - ) - im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( - None in psm_list["ion_mobility"] - ) - logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - if rt_required or im_required: - logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # # TODO: avoid hard coding feature generators in some way + # rt_required = ("deeplc" in config["feature_generators"]) and ( + # None in psm_list["retention_time"] + # ) + # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( + # None in psm_list["ion_mobility"] + # ) + # logger.debug(f"RT required: {rt_required}, IM required: {im_required}") + + # if rt_required or im_required: + # logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + # fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -82,6 +82,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) + exit() # Filter out psms that do not have all added features all_feature_names = {f for fgen in feature_names.values() for f in fgen} diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index f8d1fa01..9b429a30 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -23,3 +23,9 @@ class ModificationParsingError(IDFileParsingError): """Identification file parsing error.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error parsing spectrum file.""" + + pass diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 52440a5f..7a10fd4c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -8,6 +8,7 @@ from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator +from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -16,4 +17,5 @@ "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, "im2deep": IM2DeepFeatureGenerator, + "ms2": MS2FeatureGenerator, } diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 8e144574..013bcb44 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -12,13 +12,27 @@ import numpy as np from psm_utils import PSMList -from pyteomics.mass import calculate_mass +from pyteomics import mass, mzml, mgf +from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path +from ms2rescore.exceptions import ParseSpectrumError logger = logging.getLogger(__name__) +FRAGMENTATION_MODELS = { + "cidhcd": FragmentationModel.CidHcd, + "etcid": FragmentationModel.Etcid, + "etd": FragmentationModel.Etd, + "ethcd": FragmentationModel.Ethcd, + "all": FragmentationModel.All, +} +MASS_MODES = { + "average": MassMode.Average, + "monoisotopic": MassMode.Monoisotopic, +} + class MS2FeatureGenerator(FeatureGeneratorBase): """DeepLC retention time-based feature generator.""" @@ -28,14 +42,13 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", + fragmentationmodel: str = "All", + massmode: str = "Monoisotopic", **kwargs, ) -> None: """ Generate MS2-based features for rescoring. - DeepLC retraining is on by default. Add ``deeplc_retrain: False`` as a keyword argument to - disable retraining. - Parameters ---------- @@ -48,6 +61,8 @@ def __init__( super().__init__(*args, **kwargs) self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern + self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()] + self.mass_mode = MASS_MODES[massmode.lower()] @property def feature_names(self) -> List[str]: @@ -67,7 +82,7 @@ def feature_names(self) -> List[str]: ] def add_features(self, psm_list: PSMList) -> None: - """Add DeepLC-derived features to PSMs.""" + """Add MS2-derived features to PSMs.""" logger.info("Adding MS2-derived features to PSMs.") psm_dict = psm_list.get_psm_dict() @@ -82,68 +97,128 @@ def add_features(self, psm_list: PSMList) -> None: psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_filename = infer_spectrum_path(self.spectrum_path, run) logger.debug(f"Using spectrum file `{spectrum_filename}`") - self._add_features_run(psm_list_run, spectrum_filename) - current_run += 1 - - def _add_features_run(self, psm_list: PSMList, spectrum_filename: str) -> None: - pass - - -def longest_sequence_of_trues(lst): - max_sequence = 0 - current_sequence = 0 - - for value in lst: - current_sequence = current_sequence + 1 if value else 0 - max_sequence = max(max_sequence, current_sequence) - - return max_sequence - -def extract_spectrum_features(annotated_spectrum, peptidoform): - features = defaultdict(list) - b_ions_matched = [False] * (len(peptidoform.sequence)) - y_ions_matched = [False] * (len(peptidoform.sequence)) - - pseudo_count = 0.00001 - - for annotated_peak in annotated_spectrum.spectrum: - features["total_intensity"].append(annotated_peak.intensity) - - if annotated_peak.annotation: - features["matched_intensity"].append(annotated_peak.intensity) - for matched_ion in annotated_peak.annotation: - if "y" in matched_ion.ion: - features["y_ion_matched"].append(annotated_peak.intensity) - y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True - elif "b" in matched_ion.ion: - features["b_ion_matched"].append(annotated_peak.intensity) - b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename) + self._calculate_features(psm_list_run, annotated_spectra) + current_run += 1 - return { - "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), - "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), - "ln_explained_intensity_ratio": np.log( - np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) - + pseudo_count - ), - "ln_explained_b_ion_ratio": np.log( - np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count - ), - "ln_explained_y_ion_ratio": np.log( - np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count - ), - "longest_b_ion_sequence": longest_sequence_of_trues(b_ions_matched), - "longest_y_ion_sequence": longest_sequence_of_trues(y_ions_matched), - "matched_b_ions": sum(b_ions_matched), - "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), - "matched_y_ions": sum(y_ions_matched), - "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), - "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) - / (len(b_ions_matched) + len(y_ions_matched)), - } + def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None: + """Calculate features from annotated spectra and add them to the PSMs.""" + for psm, spectrum in zip(psm_list, annotated_spectra): + psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform)) + + def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: + """Retrieve annotated spectra for all psms.""" + + annotated_spectra = [] + spectrum_reader = self._read_spectrum_file(spectrum_file) + + spectrum_id_pattern = re.compile( + self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" + ) + logger.info(spectrum_id_pattern) + logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys()) + + try: + mapper = { + spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id + for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys() + } + except AttributeError: + raise ParseSpectrumError( + "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." + ) + + for psm in psm_list: + pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id]) + annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum) + annotated_spectra.append(annotated_spectrum) + + return annotated_spectra + + @staticmethod + def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: + + if spectrum_filepath.suffix == ".mzML": + return mzml.PreIndexedMzML(str(spectrum_filepath)) + elif spectrum_filepath.suffix == ".mgf": + return mgf.IndexedMGF(str(spectrum_filepath)) + + @staticmethod + def _longest_ion_sequence(lst): + max_sequence = 0 + current_sequence = 0 + + for value in lst: + current_sequence = current_sequence + 1 if value else 0 + max_sequence = max(max_sequence, current_sequence) + + return max_sequence + + def extract_spectrum_features(self, annotated_spectrum, peptidoform): + features = defaultdict(list) + b_ions_matched = [False] * (len(peptidoform.sequence)) + y_ions_matched = [False] * (len(peptidoform.sequence)) + + pseudo_count = 0.00001 + + for annotated_peak in annotated_spectrum.spectrum: + features["total_intensity"].append(annotated_peak.intensity) + + if annotated_peak.annotation: + features["matched_intensity"].append(annotated_peak.intensity) + for matched_ion in annotated_peak.annotation: + if "y" in matched_ion.ion: + features["y_ion_matched"].append(annotated_peak.intensity) + y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + elif "b" in matched_ion.ion: + features["b_ion_matched"].append(annotated_peak.intensity) + b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + + return { + "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), + "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity_ratio": np.log( + np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) + + pseudo_count + ), + "ln_explained_b_ion_ratio": np.log( + np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "ln_explained_y_ion_ratio": np.log( + np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) + + pseudo_count + ), + "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), + "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), + "matched_b_ions": sum(b_ions_matched), + "matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched), + "matched_y_ions": sum(y_ions_matched), + "matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched), + "matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched)) + / (len(b_ions_matched) + len(y_ions_matched)), + } + + def _annotate_spectrum(self, psm, spectrum): + + spectrum = RawSpectrum( + title=psm.spectrum_id, + num_scans=1, + rt=psm.retention_time, + precursor_charge=psm.get_precursor_charge(), + precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition), + mz_array=spectrum["m/z array"], + intensity_array=spectrum["intensity array"], + ) + + annotated_spectrum = spectrum.annotate( + peptide=LinearPeptide(psm.peptidoform.proforma), + model=self.fragmentationmodel, + mode=self.massmode, + ) + + return annotated_spectrum # TODO: keep this here? @@ -153,9 +228,16 @@ def modification_evidence(): # TODO: move to basic feature generator def extract_psm_features(psm): + + expmass = (psm.precursor_mz * psm.get_precursor_charge()) - ( + psm.get_precursor_charge() * 1.007276 + ) # TODO: replace by constant + calcmass = mass.calculate_mass(psm.peptidoform.composition) + delta_mass = expmass - calcmass + return { "peptide_len": len(psm.peptidoform.sequence), - "expmass": (psm.precursor_mz * psm.get_precursor_charge()) - - (psm.get_precursor_charge() * 1.007276), # TODO: replace by constant - "calcmass": calculate_mass(psm.peptidoform.composition), + "expmass": expmass, + "calcmass": calcmass, + "delta_mass": delta_mass, } From 60207a390f08cba34730889543a356fd1cb9b06b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Sun, 25 Feb 2024 16:45:09 +0100 Subject: [PATCH 03/23] add rustyms --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index be5295ba..beb6b440 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dependencies = [ [project.optional-dependencies] ionmob = ["ionmob>=0.2", "tensorflow"] +ms2 = ["rustyms"] dev = ["ruff", "black", "pytest", "pytest-cov", "pre-commit"] docs = [ "sphinx", From ae39844a7b46b9e4622c5f66458aae5146e3776b Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 11:04:37 +0100 Subject: [PATCH 04/23] remove exit statement fixed IM required value --- ms2rescore/core.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index f8ff38d7..6e0e547c 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -54,18 +54,19 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - # # TODO: avoid hard coding feature generators in some way - # rt_required = ("deeplc" in config["feature_generators"]) and ( - # None in psm_list["retention_time"] - # ) - # im_required = ("ionmob" or "im2deep" in config["feature_generators"]) and ( - # None in psm_list["ion_mobility"] - # ) - # logger.debug(f"RT required: {rt_required}, IM required: {im_required}") - - # if rt_required or im_required: - # logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - # fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # TODO: avoid hard coding feature generators in some way + rt_required = ("deeplc" in config["feature_generators"]) and ( + None in psm_list["retention_time"] + ) + im_required = ( + "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] + ) and (None in psm_list["ion_mobility"]) + + logger.info(f"RT required: {rt_required}, IM required: {im_required}") + + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -82,7 +83,6 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: fgen.add_features(psm_list) logger.debug(f"Adding features from {fgen_name}: {set(fgen.feature_names)}") feature_names[fgen_name] = set(fgen.feature_names) - exit() # Filter out psms that do not have all added features all_feature_names = {f for fgen in feature_names.values() for f in fgen} From 9b98c4df3b13d0ab935172a5cfeca8bdbb6159f4 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:05:23 +0100 Subject: [PATCH 05/23] change logger.info to debug --- ms2rescore/core.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 6e0e547c..032b1463 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -11,10 +11,12 @@ from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator +from ms2rescore.utils import profile logger = logging.getLogger(__name__) +@profile def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MS²Rescore workflow with passed configuration. @@ -62,7 +64,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"] ) and (None in psm_list["ion_mobility"]) - logger.info(f"RT required: {rt_required}, IM required: {im_required}") + logger.debug(f"RT required: {rt_required}, IM required: {im_required}") if rt_required or im_required: logger.info("Parsing missing retention time and/or ion mobility values from spectra...") From 5e45756726582fa54e63f2d572c400332bcaa683 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:06:34 +0100 Subject: [PATCH 06/23] added profile decorator to get timings for functions --- ms2rescore/utils.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 70417b56..982664b5 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,6 +4,9 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import cProfile +import pstats +import io from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -76,3 +79,22 @@ def infer_spectrum_path( ) return Path(resolved_path) + + +def profile(fnc): + """A decorator that uses cProfile to profile a function""" + + def inner(*args, **kwargs): + + pr = cProfile.Profile() + pr.enable() + retval = fnc(*args, **kwargs) + pr.disable() + s = io.StringIO() + sortby = "time" + ps = pstats.Stats(pr, stream=s).sort_stats(sortby) + ps.print_stats() + logger.info(s.getvalue()) + return retval + + return inner From 304777cfa171cf5b7abf5cd14640e971f6510167 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:13 +0100 Subject: [PATCH 07/23] removed profile as standard rescore debug statement --- ms2rescore/core.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 032b1463..d0a52a4e 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -11,12 +11,10 @@ from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.utils import profile logger = logging.getLogger(__name__) -@profile def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MS²Rescore workflow with passed configuration. From 95ee4755325cd85990473cf15313c92e8c2f567a Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:21 +0100 Subject: [PATCH 08/23] added new basic features --- ms2rescore/feature_generators/basic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/ms2rescore/feature_generators/basic.py b/ms2rescore/feature_generators/basic.py index bc2222ee..df005167 100644 --- a/ms2rescore/feature_generators/basic.py +++ b/ms2rescore/feature_generators/basic.py @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None: charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list]) precursor_mzs = psm_list["precursor_mz"] scores = psm_list["score"] + peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list]) has_charge = None not in charge_states has_mz = None not in precursor_mzs and has_charge @@ -74,6 +75,14 @@ def add_features(self, psm_list: PSMList) -> None: if has_score: self._feature_names.append("search_engine_score") + if has_mz and has_charge: + experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812) + theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812) + mass_error = experimental_mass - theoretical_mass + self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"]) + + self._feature_names.append("pep_len") + for i, psm in enumerate(psm_list): psm.rescoring_features.update( dict( @@ -81,6 +90,10 @@ def add_features(self, psm_list: PSMList) -> None: **charge_one_hot[i] if has_charge else {}, **{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {}, **{"search_engine_score": scores[i]} if has_score else {}, + **{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {}, + **{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {}, + **{"mass_error": mass_error[i]} if has_mz and has_charge else {}, + **{"pep_len": peptide_lengths[i]}, ) ) From 73f4573331a82ea3984457333d3c701736f4d86d Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Mon, 26 Feb 2024 15:07:47 +0100 Subject: [PATCH 09/23] fixes for ms2 feature generator, removed multiprocessing --- ms2rescore/feature_generators/ms2.py | 149 +++++++++++++++------------ 1 file changed, 81 insertions(+), 68 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 013bcb44..997accb2 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,6 +4,7 @@ """ import logging +import multiprocessing import re from typing import List, Optional, Union from itertools import chain @@ -14,6 +15,7 @@ from psm_utils import PSMList from pyteomics import mass, mzml, mgf from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode +from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path @@ -42,8 +44,9 @@ def __init__( *args, spectrum_path: Optional[str] = None, spectrum_id_pattern: str = "(.*)", - fragmentationmodel: str = "All", - massmode: str = "Monoisotopic", + fragmentation_model: str = "All", + mass_mode: str = "Monoisotopic", + processes: int = 1, **kwargs, ) -> None: """ @@ -61,8 +64,9 @@ def __init__( super().__init__(*args, **kwargs) self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern - self.fragmentation_model = FRAGMENTATION_MODELS[fragmentationmodel.lower()] - self.mass_mode = MASS_MODES[massmode.lower()] + self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] + self.mass_mode = MASS_MODES[mass_mode.lower()] + self.processes = processes @property def feature_names(self) -> List[str]: @@ -98,26 +102,17 @@ def add_features(self, psm_list: PSMList) -> None: spectrum_filename = infer_spectrum_path(self.spectrum_path, run) logger.debug(f"Using spectrum file `{spectrum_filename}`") - annotated_spectra = self._annotate_spectra(psm_list_run, spectrum_filename) - self._calculate_features(psm_list_run, annotated_spectra) + self._calculate_features(psm_list_run, spectrum_filename) current_run += 1 - def _calculate_features(self, psm_list: PSMList, annotated_spectra: List) -> None: - """Calculate features from annotated spectra and add them to the PSMs.""" - for psm, spectrum in zip(psm_list, annotated_spectra): - psm.features.update(self.extract_spectrum_features(spectrum, psm.peptidoform)) - - def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: + def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: """Retrieve annotated spectra for all psms.""" - annotated_spectra = [] spectrum_reader = self._read_spectrum_file(spectrum_file) spectrum_id_pattern = re.compile( self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)" ) - logger.info(spectrum_id_pattern) - logger.info(spectrum_reader._offset_index.mapping["spectrum"].keys()) try: mapper = { @@ -128,20 +123,43 @@ def _annotate_spectra(self, psm_list: PSMList, spectrum_file: str) -> List: raise ParseSpectrumError( "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." ) - - for psm in psm_list: - pyteomics_spectrum = spectrum_reader.get_by_id(mapper[psm.spectrum_id]) - annotated_spectrum = self._annotate_spectrum(psm, pyteomics_spectrum) - annotated_spectra.append(annotated_spectrum) - - return annotated_spectra + annotated_spectra = [ + self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id])) + for psm in psm_list + ] + for psm, annotated_spectrum in zip(psm_list, annotated_spectra): + psm.rescoring_features.update( + self._calculate_spectrum_features(psm, annotated_spectrum) + ) + # with multiprocessing.Pool(self.processes) as pool: + # counts_failed = 0 + # for psm, features in zip( + # psm_list, + # track( + # pool.imap( + # self._calculate_spectrum_features_wrapper, + # zip(psm_list, annotated_spectra), + # chunksize=1000, + # ), + # total=len(psm_list), + # description="Calculating MS2 features...", + # transient=True, + # ), + # ): + # if features: + # psm.rescoring_features.update(features) + + # else: + # counts_failed += 1 + # if counts_failed > 0: + # logger.warning(f"Failed to calculate features for {counts_failed} PSMs") @staticmethod def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: - if spectrum_filepath.suffix == ".mzML": + if spectrum_filepath.suffix.lower() == ".mzml": return mzml.PreIndexedMzML(str(spectrum_filepath)) - elif spectrum_filepath.suffix == ".mgf": + elif spectrum_filepath.suffix.lower() == ".mgf": return mgf.IndexedMGF(str(spectrum_filepath)) @staticmethod @@ -155,40 +173,48 @@ def _longest_ion_sequence(lst): return max_sequence - def extract_spectrum_features(self, annotated_spectrum, peptidoform): + def _calculate_spectrum_features(self, psm, annotated_spectrum): + features = defaultdict(list) - b_ions_matched = [False] * (len(peptidoform.sequence)) - y_ions_matched = [False] * (len(peptidoform.sequence)) + b_ions_matched = [False] * (len(psm.peptidoform.sequence)) + y_ions_matched = [False] * (len(psm.peptidoform.sequence)) pseudo_count = 0.00001 + ion_fragment_regex = re.compile(r"\d+") - for annotated_peak in annotated_spectrum.spectrum: - features["total_intensity"].append(annotated_peak.intensity) + for peak in annotated_spectrum: + features["total_intensity"].append(peak.intensity) - if annotated_peak.annotation: - features["matched_intensity"].append(annotated_peak.intensity) - for matched_ion in annotated_peak.annotation: + if peak.annotation: + features["matched_intensity"].append(peak.intensity) + for matched_ion in peak.annotation: if "y" in matched_ion.ion: - features["y_ion_matched"].append(annotated_peak.intensity) - y_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + features["y_ion_matched"].append(peak.intensity) + y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) elif "b" in matched_ion.ion: - features["b_ion_matched"].append(annotated_peak.intensity) - b_ions_matched[int(re.search(r"\d+", matched_ion.ion).group())] = True + features["b_ion_matched"].append(peak.intensity) + b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = ( + True + ) + + total_intensity_sum = np.sum(features["total_intensity"]) + matched_intensity_sum = np.sum(features["matched_intensity"]) + b_ion_matched_sum = np.sum(features["b_ion_matched"]) + y_ion_matched_sum = np.sum(features["y_ion_matched"]) return { - "ln_explained_intensity": np.log(np.sum(features["matched_intensity"]) + pseudo_count), - "ln_total_intensity": np.log(np.sum(features["total_intensity"]) + pseudo_count), + "ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count), + "ln_total_intensity": np.log(total_intensity_sum + pseudo_count), "ln_explained_intensity_ratio": np.log( - np.sum(features["matched_intensity"]) / np.sum(features["total_intensity"]) - + pseudo_count + matched_intensity_sum / total_intensity_sum + pseudo_count ), "ln_explained_b_ion_ratio": np.log( - np.sum(features["b_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count + b_ion_matched_sum / matched_intensity_sum + pseudo_count ), "ln_explained_y_ion_ratio": np.log( - np.sum(features["y_ion_matched"]) / np.sum(features["matched_intensity"]) - + pseudo_count + y_ion_matched_sum / matched_intensity_sum + pseudo_count ), "longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched), "longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched), @@ -200,44 +226,31 @@ def extract_spectrum_features(self, annotated_spectrum, peptidoform): / (len(b_ions_matched) + len(y_ions_matched)), } - def _annotate_spectrum(self, psm, spectrum): + def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple): + psm, spectrum = psm_spectrum_tuple + return self._calculate_spectrum_features(psm, spectrum) + + def _annotate_spectrum(self, psm, pyteomics_spectrum): spectrum = RawSpectrum( title=psm.spectrum_id, num_scans=1, rt=psm.retention_time, precursor_charge=psm.get_precursor_charge(), - precursor_mass=mass.calculate_mass(psm.peptidoform.compsoition), - mz_array=spectrum["m/z array"], - intensity_array=spectrum["intensity array"], + precursor_mass=mass.calculate_mass(psm.peptidoform.composition), + mz_array=pyteomics_spectrum["m/z array"], + intensity_array=pyteomics_spectrum["intensity array"], ) annotated_spectrum = spectrum.annotate( peptide=LinearPeptide(psm.peptidoform.proforma), - model=self.fragmentationmodel, - mode=self.massmode, + model=self.fragmentation_model, + mode=self.mass_mode, ) - return annotated_spectrum + return annotated_spectrum.spectrum # TODO: keep this here? def modification_evidence(): return - - -# TODO: move to basic feature generator -def extract_psm_features(psm): - - expmass = (psm.precursor_mz * psm.get_precursor_charge()) - ( - psm.get_precursor_charge() * 1.007276 - ) # TODO: replace by constant - calcmass = mass.calculate_mass(psm.peptidoform.composition) - delta_mass = expmass - calcmass - - return { - "peptide_len": len(psm.peptidoform.sequence), - "expmass": expmass, - "calcmass": calcmass, - "delta_mass": delta_mass, - } From 947233ee0039757676ea5b62daaacf2e1059089f Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 28 Feb 2024 11:12:32 +0100 Subject: [PATCH 10/23] return empty list on parsing error with rustyms, removed multiprocessing --- ms2rescore/feature_generators/ms2.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 997accb2..b292d51a 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -4,7 +4,6 @@ """ import logging -import multiprocessing import re from typing import List, Optional, Union from itertools import chain @@ -15,7 +14,6 @@ from psm_utils import PSMList from pyteomics import mass, mzml, mgf from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode -from rich.progress import track from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path @@ -62,6 +60,7 @@ def __init__( """ super().__init__(*args, **kwargs) + self.spectrum_path = spectrum_path self.spectrum_id_pattern = spectrum_id_pattern self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] @@ -100,7 +99,6 @@ def add_features(self, psm_list: PSMList) -> None: ) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - logger.debug(f"Using spectrum file `{spectrum_filename}`") self._calculate_features(psm_list_run, spectrum_filename) current_run += 1 @@ -175,6 +173,9 @@ def _longest_ion_sequence(lst): def _calculate_spectrum_features(self, psm, annotated_spectrum): + if not annotated_spectrum: + return {} + features = defaultdict(list) b_ions_matched = [False] * (len(psm.peptidoform.sequence)) y_ions_matched = [False] * (len(psm.peptidoform.sequence)) @@ -241,12 +242,14 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): mz_array=pyteomics_spectrum["m/z array"], intensity_array=pyteomics_spectrum["intensity array"], ) - - annotated_spectrum = spectrum.annotate( - peptide=LinearPeptide(psm.peptidoform.proforma), - model=self.fragmentation_model, - mode=self.mass_mode, - ) + try: + annotated_spectrum = spectrum.annotate( + peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]), + model=self.fragmentation_model, + mode=self.mass_mode, + ) + except: # noqa E722 + return [] return annotated_spectrum.spectrum From 24ce565542fb42cea534b81b53093435dba2a36c Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 15 Mar 2024 09:47:02 +0100 Subject: [PATCH 11/23] add deeplc_calibration psm set --- ms2rescore/feature_generators/deeplc.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 30bea716..207f8ba1 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -25,6 +25,7 @@ import numpy as np from psm_utils import PSMList +from psm_utils.io import read_file from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -40,6 +41,7 @@ def __init__( *args, lower_score_is_better: bool = False, calibration_set_size: Union[int, float, None] = None, + calibration_set: Union[str, None] = None, processes: int = 1, **kwargs, ) -> None: @@ -71,6 +73,7 @@ def __init__( self.lower_psm_score_better = lower_score_is_better self.calibration_set_size = calibration_set_size + self.calibration_set = calibration_set self.processes = processes self.deeplc_kwargs = kwargs or {} @@ -120,7 +123,9 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - + if self.calibration_set: + logger.info("Loading calibration set...") + self.calibration_set = read_file(self.calibration_set, filetype="tsv") for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -138,13 +143,19 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - psm_list_calibration = self._get_calibration_psms(psm_list_run) + if self.calibration_set: + psm_list_calibration = self.calibration_set[ + self.calibration_set["run"] == run + ] + else: + psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, From 33c38b04e304e6b865714adb26b813c647911628 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 17 Apr 2024 17:10:38 +0200 Subject: [PATCH 12/23] remove unused import --- ms2rescore/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 28249c51..3515893a 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,9 +4,6 @@ from glob import glob from pathlib import Path from typing import Optional, Union -import cProfile -import pstats -import io from ms2rescore.exceptions import MS2RescoreConfigurationError From 11fdc51ad70c225393553f5964f7485a60b7ae19 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Wed, 21 Aug 2024 13:25:10 +0200 Subject: [PATCH 13/23] integrate mumble into ms2branch --- ms2rescore/exceptions.py | 6 +++ ms2rescore/package_data/config_default.json | 1 + ms2rescore/package_data/config_schema.json | 42 +++++++++++++++++++++ ms2rescore/parse_psms.py | 14 +++++++ pyproject.toml | 1 + 5 files changed, 64 insertions(+) diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index ba5492a2..68b1b43d 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError): """Error while rescoring PSMs.""" pass + + +class ParseSpectrumError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 733ea707..126fde8b 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -12,6 +12,7 @@ }, "maxquant": {} }, + "psm_generator": {}, "rescoring_engine": { "mokapot": { "train_fdr": 0.01, diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index ab77f3b7..40471714 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -62,6 +62,18 @@ "mokapot": {} } }, + "psm_generator": { + "description": "PSM generator and their configuration.", + "type": "object", + "minProperties": 0, + "maxProperties": 1, + "patternProperties": { + ".*": { "$ref": "#/definitions/psm_generator" }, + "mumble": { + "$ref": "#/definitions/mumble" + } + } + }, "config_file": { "description": "Path to configuration file", "oneOf": [{ "type": "string" }, { "type": "null" }] @@ -179,6 +191,7 @@ "type": "boolean", "default": false } + } } }, @@ -193,6 +206,11 @@ "type": "object", "additionalProperties": true }, + "psm_generator": { + "description": "Additional PSM feature generator configuration", + "type": "object", + "additionalProperties": true + }, "basic": { "$ref": "#/definitions/feature_generator", "description": "Basic feature generator configuration", @@ -273,6 +291,30 @@ } } }, + "mumble": { + "$ref": "#/definitions/psm_generator", + "description": "Mumble PSM generator configuration using Mubmle", + "type": "object", + "additionalProperties": true, + "properties": { + "aa_combinations": { + "description": "Additional amino acid combinations to consider as mass shift", + "type": "integer", + "default": 0 + }, + "fasta_file": { + "description": "Maximum number of modifications per peptide", + "oneOf": [{ "type": "string" }, { "type": "null" }], + "default": false + }, + "mass_error": { + "description": "mass error in Da", + "type": "number", + "default": 0.2 + } + } + }, + "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 6a3c6065..ae28b519 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -5,6 +5,7 @@ import numpy as np import psm_utils.io from psm_utils import PSMList +from mumble import PSMHandler from ms2rescore.exceptions import MS2RescoreConfigurationError @@ -90,6 +91,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] psm_list["spectrum_id"] = new_ids + # Addition of Modifications for mass shifts in the PSMs with Mumble + if "mumble" in config["psm_generator"]: + logger.debug("Applying modifications for mass shifts using Mumble...") + mumble_config = config["psm_generator"]["mumble"] + psm_handler = PSMHandler( + **mumble_config, # TODO how do we store config for mumble? + ) + psm_list = psm_handler.add_modified_psms( + psm_list, generate_modified_decoys=True, keep_original=True + ) + if mumble_config["output_file"]: + psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"]) + return psm_list diff --git a/pyproject.toml b/pyproject.toml index f57ad800..7c1b0a9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", + "mumble" ] [project.optional-dependencies] From 883169a90cf7c57970dac1085ba54defbf1ffdb9 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 27 Sep 2024 09:58:27 +0200 Subject: [PATCH 14/23] temp removal of sage features before rescoring --- ms2rescore/parse_psms.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index ae28b519..91c94168 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -95,6 +95,15 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if "mumble" in config["psm_generator"]: logger.debug("Applying modifications for mass shifts using Mumble...") mumble_config = config["psm_generator"]["mumble"] + for psm in psm_list: + psm["rescoring_features"].pop("expmass", None) + psm["rescoring_features"].pop("calcmass", None) + psm["rescoring_features"].pop("peptide_len", None) + psm["rescoring_features"].pop("matched_peaks", None) + psm["rescoring_features"].pop("longest_b", None) + psm["rescoring_features"].pop("longest_y", None) + psm["rescoring_features"].pop("longest_y_pct", None) + psm["rescoring_features"].pop("matched_intensity_pct", None) psm_handler = PSMHandler( **mumble_config, # TODO how do we store config for mumble? ) From da39ae81d052b4391ada17fd24bb5a856078e8a8 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 8 Nov 2024 15:41:26 +0100 Subject: [PATCH 15/23] remove psm_file features when rescoring with mumble --- ms2rescore/parse_psms.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 91c94168..309c1c96 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -95,15 +95,14 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: if "mumble" in config["psm_generator"]: logger.debug("Applying modifications for mass shifts using Mumble...") mumble_config = config["psm_generator"]["mumble"] - for psm in psm_list: - psm["rescoring_features"].pop("expmass", None) - psm["rescoring_features"].pop("calcmass", None) - psm["rescoring_features"].pop("peptide_len", None) - psm["rescoring_features"].pop("matched_peaks", None) - psm["rescoring_features"].pop("longest_b", None) - psm["rescoring_features"].pop("longest_y", None) - psm["rescoring_features"].pop("longest_y_pct", None) - psm["rescoring_features"].pop("matched_intensity_pct", None) + + # Check if psm_list[0].rescoring_features is empty or not + if psm_list[0].rescoring_features: + logger.debug("Removing psm_file rescoring features from PSMs...") + # psm_list.remove_rescoring_features() # TODO add this to psm_utils + for psm in psm_list: + psm.rescoring_features = {} + psm_handler = PSMHandler( **mumble_config, # TODO how do we store config for mumble? ) From 37fff28ebc03fcf66f98be5a5c51ca7e3b68e97b Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 19 Nov 2024 16:10:43 +0100 Subject: [PATCH 16/23] linting --- ms2rescore/core.py | 5 ++++- ms2rescore/feature_generators/__init__.py | 4 ++-- ms2rescore/feature_generators/deeplc.py | 8 +++++--- ms2rescore/feature_generators/ionmob.py | 11 +++++++++-- ms2rescore/feature_generators/ms2pip.py | 5 ++++- ms2rescore/gui/__main__.py | 2 +- ms2rescore/parse_psms.py | 2 +- ms2rescore/rescoring_engines/percolator.py | 2 +- 8 files changed, 27 insertions(+), 12 deletions(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index a02ab4f6..48e125bc 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -14,7 +14,10 @@ from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence +from ms2rescore.rescoring_engines.mokapot import ( + add_peptide_confidence, + add_psm_confidence, +) logger = logging.getLogger(__name__) diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 7a10fd4c..9eff1b5c 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -4,11 +4,11 @@ from ms2rescore.feature_generators.basic import BasicFeatureGenerator from ms2rescore.feature_generators.deeplc import DeepLCFeatureGenerator +from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator -from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator -from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator +from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 206d6a21..e3113544 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -143,9 +143,11 @@ def add_features(self, psm_list: PSMList) -> None: ) # Disable wild logging to stdout by Tensorflow, unless in debug mode - with contextlib.redirect_stdout( - open(os.devnull, "w", encoding="utf-8") - ) if not self._verbose else contextlib.nullcontext(): + with ( + contextlib.redirect_stdout(open(os.devnull, "w", encoding="utf-8")) + if not self._verbose + else contextlib.nullcontext() + ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) if self.calibration_set: diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 7fa0c0a1..e9b51026 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -23,12 +23,19 @@ import tensorflow as tf from psm_utils import Peptidoform, PSMList -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import ( + FeatureGeneratorBase, + FeatureGeneratorException, +) try: from ionmob import __file__ as ionmob_file from ionmob.preprocess.data import to_tf_dataset_inference - from ionmob.utilities.chemistry import VARIANT_DICT, calculate_mz, reduced_mobility_to_ccs + from ionmob.utilities.chemistry import ( + VARIANT_DICT, + calculate_mz, + reduced_mobility_to_ccs, + ) from ionmob.utilities.tokenization import tokenizer_from_json from ionmob.utilities.utility import get_ccs_shift except ImportError: diff --git a/ms2rescore/feature_generators/ms2pip.py b/ms2rescore/feature_generators/ms2pip.py index 3882b3fc..34d427e2 100644 --- a/ms2rescore/feature_generators/ms2pip.py +++ b/ms2rescore/feature_generators/ms2pip.py @@ -37,7 +37,10 @@ from psm_utils import PSMList from rich.progress import track -from ms2rescore.feature_generators.base import FeatureGeneratorBase, FeatureGeneratorException +from ms2rescore.feature_generators.base import ( + FeatureGeneratorBase, + FeatureGeneratorException, +) from ms2rescore.utils import infer_spectrum_path logger = logging.getLogger(__name__) diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index 429e117f..57cee3c6 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,8 +1,8 @@ """Entrypoint for MS²Rescore GUI.""" +import contextlib import multiprocessing import os -import contextlib from ms2rescore.gui.app import app diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 309c1c96..cc61926f 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -4,8 +4,8 @@ import numpy as np import psm_utils.io -from psm_utils import PSMList from mumble import PSMHandler +from psm_utils import PSMList from ms2rescore.exceptions import MS2RescoreConfigurationError diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index 192950bd..32d856c4 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -19,8 +19,8 @@ import logging import subprocess -from typing import Any, Dict, Optional from copy import deepcopy +from typing import Any, Dict, Optional import psm_utils From e8b59f3ca2c6f1fbaae4ec5cc6b531f25284fad7 Mon Sep 17 00:00:00 2001 From: SamvPy Date: Tue, 19 Nov 2024 16:10:56 +0100 Subject: [PATCH 17/23] add hyperscore calculation --- ms2rescore/feature_generators/ms2.py | 183 ++++++++++++++++++++++++++- 1 file changed, 176 insertions(+), 7 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index b292d51a..490f4f47 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -5,19 +5,19 @@ import logging import re -from typing import List, Optional, Union -from itertools import chain from collections import defaultdict - +from itertools import chain +from typing import List, Optional, Union import numpy as np -from psm_utils import PSMList -from pyteomics import mass, mzml, mgf -from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode +import pyopenms as oms +from psm_utils import PSM, Peptidoform, PSMList +from pyteomics import mass, mgf, mzml +from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase from ms2rescore.utils import infer_spectrum_path -from ms2rescore.exceptions import ParseSpectrumError logger = logging.getLogger(__name__) @@ -45,6 +45,7 @@ def __init__( fragmentation_model: str = "All", mass_mode: str = "Monoisotopic", processes: int = 1, + calculate_hyperscore: bool = True, # Allow optional ? **kwargs, ) -> None: """ @@ -66,6 +67,7 @@ def __init__( self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()] self.mass_mode = MASS_MODES[mass_mode.lower()] self.processes = processes + self.calculate_hyperscore = calculate_hyperscore @property def feature_names(self) -> List[str]: @@ -82,6 +84,7 @@ def feature_names(self) -> List[str]: "matched_y_ions", "matched_y_ions_pct", "matched_ions_pct", + "hyperscore", ] def add_features(self, psm_list: PSMList) -> None: @@ -129,6 +132,22 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: psm.rescoring_features.update( self._calculate_spectrum_features(psm, annotated_spectrum) ) + + if self.calculate_hyperscore: + # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) + mz_list, intensity_list = _annotated_spectrum_to_mzint( + annotated_spectrum=annotated_spectrum + ) + psm.rescoring_features.update( + { + "hyperscore": calculate_hyperscore( + psm=psm, observed_mz=mz_list, observed_intensity=intensity_list + ) + } + ) + + else: + psm.rescoring_features.update({"hyperscore": np.nan}) # with multiprocessing.Pool(self.processes) as pool: # counts_failed = 0 # for psm, features in zip( @@ -253,7 +272,157 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum + def _calculate_hyperscore(self, psm, spectrum): + pass + + def _calculate_fragmentation_features(self, psm, annotated_spectrum): + pass + # TODO: keep this here? def modification_evidence(): return + + +def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): + + mz_list = [] + intensity_list = [] + + for peak in annotated_spectrum: + + annotations = peak.annotation + for fragment in annotations: + ion = fragment.ion + if ion[0] not in ion_types: + continue + + mz_list.append(peak.experimental_mz) + intensity_list.append(peak.intensity) + break + + return mz_list, intensity_list + + +def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: + """ + Parse a peptidoform object to pyOpenMS compatible format. + + Parameter + --------- + Peptidoform: Peptidoform + Peptide string in Peptidoform format + + Returns + ------- + AASequence (pyOpenMS): + A peptide sequence in pyOpenMS format + int: + charge of the peptide + """ + + n_term = peptidoform.properties["n_term"] + peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + + for aa, mods in peptidoform.parsed_sequence: + peptide_oms_str += aa + if isinstance(mods, list): + peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + + c_term = peptidoform.properties["c_term"] + peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + + peptide_oms = oms.AASequence.fromString(peptide_oms_str) + + return peptide_oms + + +def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: + """ + Create a theoretical spectrum from a peptide sequence. + + Parameter + --------- + peptide: str + Peptide sequence in proforma format + engine: str + The engine to use to create theoretical spectrum. + Can only be 'pyopenms' or 'spectrum-utils' (default) + + Return + ------ + MSSpectrum + Spectrum object in pyOpenMS format + """ + # Reformat peptide sequence in pyOpenMS format + peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) + + # Initialize the required objects to create the spectrum + spectrum = oms.MSSpectrum() + tsg = oms.TheoreticalSpectrumGenerator() + p = oms.Param() + + p.setValue("add_b_ions", "true") + p.setValue("add_metainfo", "true") + tsg.setParameters(param=p) + + # Create the theoretical spectrum + tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2) + return spectrum + + +def calculate_hyperscore( + psm: PSM, + observed_mz: List[float], + observed_intensity: List[float], + fragment_tol_mass=20, + fragment_tol_mode="ppm", +): + """ + Calculate the hyperscore as defined in the X!Tandem search engine. + + It is a metric of how good two spectra match with each other (matching peaks). + + Parameters + ---------- + psm: psm_utils.PSM + The PSM used to extract 'spectrum_id' (for MGF spectrum extraction) + and 'Peptidoform' (the peptide sequence) + observed_mz: List[float] + List of observed mz values with matching order as observed intensity + observed_intensity: List[float] + List of observed intensity values + fragment_tol_mass: int + The allowed tolerance to match peaks + fragment_tol_mode: str + 'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton. + Return + ------ + int + The hyperscore + """ + if fragment_tol_mode == "ppm": + fragment_mass_tolerance_unit_ppm = True + elif fragment_tol_mode == "Da": + fragment_mass_tolerance_unit_ppm = False + else: + raise Exception( + "fragment_tol_mode can only take 'Da' or 'ppm'. {} was provided.".format( + fragment_tol_mode + ) + ) + if len(observed_intensity) == 0: + logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") + return 0.0 + + theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) + observed_spectrum_oms = oms.MSSpectrum() + observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) + hyperscore = oms.HyperScore() + result = hyperscore.compute( + fragment_mass_tolerance=fragment_tol_mass, + fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm, + exp_spectrum=observed_spectrum_oms, + theo_spectrum=theoretical_spectrum, + ) + return result From c51cd346f438b2afc97c2b171ea73bcdc60a64eb Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Thu, 21 Nov 2024 17:21:04 +0100 Subject: [PATCH 18/23] calibration fixes --- ms2rescore/feature_generators/deeplc.py | 15 +++++---------- ms2rescore/feature_generators/im2deep.py | 3 ++- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index e3113544..29a249eb 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -123,9 +123,6 @@ def add_features(self, psm_list: PSMList) -> None: # Run DeepLC for each spectrum file current_run = 1 total_runs = sum(len(runs) for runs in psm_dict.values()) - if self.calibration_set: - logger.info("Loading calibration set...") - self.calibration_set = read_file(self.calibration_set, filetype="tsv") for runs in psm_dict.values(): # Reset DeepLC predictor for each collection of runs self.deeplc_predictor = None @@ -150,12 +147,7 @@ def add_features(self, psm_list: PSMList) -> None: ): # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - if self.calibration_set: - psm_list_calibration = self.calibration_set[ - self.calibration_set["run"] == run - ] - else: - psm_list_calibration = self._get_calibration_psms(psm_list_run) + psm_list_calibration = self._get_calibration_psms(psm_list_run) logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, @@ -204,7 +196,10 @@ def add_features(self, psm_list: PSMList) -> None: def _get_calibration_psms(self, psm_list: PSMList): """Get N best scoring target PSMs for calibration.""" - psm_list_targets = psm_list[~psm_list["is_decoy"]] + psm_list_targets = psm_list[ + ~psm_list["is_decoy"] + & [metadata.get("original_psm", True) for metadata in psm_list["metadata"]] + ] if self.calibration_set_size: n_psms = self._get_number_of_calibration_psms(psm_list_targets) indices = np.argsort(psm_list_targets["score"]) diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py index 772f6855..2afa4fcd 100644 --- a/ms2rescore/feature_generators/im2deep.py +++ b/ms2rescore/feature_generators/im2deep.py @@ -158,7 +158,8 @@ def make_calibration_df(psm_list_df: pd.DataFrame, threshold: float = 0.25) -> p identified_psms = psm_list_df[ (psm_list_df["qvalue"] < 0.01) & (~psm_list_df["is_decoy"]) - & (psm_list_df["charge"] < 5) # predictions do not go higher for IM2Deep + & (psm_list_df["charge"] < 7) # predictions do not go higher for IM2Deep + & ([metadata.get("original_psm", True) for metadata in psm_list_df["metadata"]]) ] calibration_psms = identified_psms[ identified_psms["qvalue"] < identified_psms["qvalue"].quantile(1 - threshold) From 295e37f20d2e344f46bf5fc9d911f6955210f184 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Thu, 21 Nov 2024 17:21:16 +0100 Subject: [PATCH 19/23] changes for mumble implementation --- ms2rescore/feature_generators/ms2.py | 38 ---------------------------- ms2rescore/parse_psms.py | 8 ++---- pyproject.toml | 1 - 3 files changed, 2 insertions(+), 45 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index 490f4f47..a3bd6d77 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -23,7 +23,6 @@ FRAGMENTATION_MODELS = { "cidhcd": FragmentationModel.CidHcd, - "etcid": FragmentationModel.Etcid, "etd": FragmentationModel.Etd, "ethcd": FragmentationModel.Ethcd, "all": FragmentationModel.All, @@ -148,28 +147,6 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: else: psm.rescoring_features.update({"hyperscore": np.nan}) - # with multiprocessing.Pool(self.processes) as pool: - # counts_failed = 0 - # for psm, features in zip( - # psm_list, - # track( - # pool.imap( - # self._calculate_spectrum_features_wrapper, - # zip(psm_list, annotated_spectra), - # chunksize=1000, - # ), - # total=len(psm_list), - # description="Calculating MS2 features...", - # transient=True, - # ), - # ): - # if features: - # psm.rescoring_features.update(features) - - # else: - # counts_failed += 1 - # if counts_failed > 0: - # logger.warning(f"Failed to calculate features for {counts_failed} PSMs") @staticmethod def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: @@ -246,10 +223,6 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum): / (len(b_ions_matched) + len(y_ions_matched)), } - def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple): - psm, spectrum = psm_spectrum_tuple - return self._calculate_spectrum_features(psm, spectrum) - def _annotate_spectrum(self, psm, pyteomics_spectrum): spectrum = RawSpectrum( @@ -272,17 +245,6 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - def _calculate_hyperscore(self, psm, spectrum): - pass - - def _calculate_fragmentation_features(self, psm, annotated_spectrum): - pass - - -# TODO: keep this here? -def modification_evidence(): - return - def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index cc61926f..2f0ec3b0 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -104,13 +104,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: psm.rescoring_features = {} psm_handler = PSMHandler( - **mumble_config, # TODO how do we store config for mumble? + **mumble_config, ) - psm_list = psm_handler.add_modified_psms( - psm_list, generate_modified_decoys=True, keep_original=True - ) - if mumble_config["output_file"]: - psm_handler.write_modified_psm_list(psm_list, mumble_config["output_file"]) + psm_list = psm_handler.add_modified_psms(psm_list) return psm_list diff --git a/pyproject.toml b/pyproject.toml index 4285f1fa..985cc27e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ dependencies = [ "pyteomics>=4.7.2", "rich>=12", "tomli>=2; python_version < '3.11'", - "mumble" ] [project.optional-dependencies] From 909860d9c6f81490f9af3f6de364b606ddd7fe9c Mon Sep 17 00:00:00 2001 From: SamvPy Date: Fri, 22 Nov 2024 11:51:23 +0100 Subject: [PATCH 20/23] change openms peptide formatting --- ms2rescore/feature_generators/ms2.py | 121 ++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 23 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index a3bd6d77..eb049ed8 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -12,8 +12,11 @@ import numpy as np import pyopenms as oms from psm_utils import PSM, Peptidoform, PSMList +from psm_utils.peptidoform import PeptidoformException +from pyteomics.proforma import ProFormaError from pyteomics import mass, mgf, mzml from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum +from pathlib import Path from ms2rescore.exceptions import ParseSpectrumError from ms2rescore.feature_generators.base import FeatureGeneratorBase @@ -68,6 +71,7 @@ def __init__( self.processes = processes self.calculate_hyperscore = calculate_hyperscore + @property def feature_names(self) -> List[str]: return [ @@ -137,19 +141,20 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: mz_list, intensity_list = _annotated_spectrum_to_mzint( annotated_spectrum=annotated_spectrum ) + hyperscore = calculate_hyperscore( + psm=psm, observed_mz=mz_list, observed_intensity=intensity_list + ) + if hyperscore is None: + continue psm.rescoring_features.update( { - "hyperscore": calculate_hyperscore( - psm=psm, observed_mz=mz_list, observed_intensity=intensity_list - ) + "hyperscore": hyperscore } ) - else: - psm.rescoring_features.update({"hyperscore": np.nan}) @staticmethod - def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: + def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: if spectrum_filepath.suffix.lower() == ".mzml": return mzml.PreIndexedMzML(str(spectrum_filepath)) @@ -235,8 +240,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): intensity_array=pyteomics_spectrum["intensity array"], ) try: + linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0]) annotated_spectrum = spectrum.annotate( - peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]), + peptide=linear_peptide, model=self.fragmentation_model, mode=self.mass_mode, ) @@ -246,6 +252,9 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum +############################ +### HYPERSCORE FUNCTIONS ### +############################ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): mz_list = [] @@ -268,35 +277,87 @@ def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: """ - Parse a peptidoform object to pyOpenMS compatible format. + Parse a peptide sequence in proforma format to pyOpenMS compatible format. + + Only supports UNIMOD format. Parameter --------- - Peptidoform: Peptidoform - Peptide string in Peptidoform format + peptide: str + Peptide string in proforma format Returns ------- AASequence (pyOpenMS): A peptide sequence in pyOpenMS format - int: - charge of the peptide """ + peptide = peptidoform.proforma + + # Reformat unimod modifications + pattern_unimod = r"\[UNIMOD:(\d+)\]" + + def replace_unimod(match): + return f"(UniMod:{match.group(1)})" + + peptide_oms_str = re.sub( + pattern=pattern_unimod, repl=replace_unimod, string=peptide + ) + + # Parse N-terminal modifications + if ")-" in peptide_oms_str: + peptide_oms_list = peptide_oms_str.split(")-") + nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] + nterm_modification += ")" + peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." + elif "]-" in peptide_oms_str: + peptide_oms_list = peptide_oms_str.split("]-") + nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] + nterm_modification += "]" + peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - n_term = peptidoform.properties["n_term"] - peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + # Split the charge from the peptide string + if "/" in peptide_oms_str: + peptide_oms_str, _ = peptide_oms_str.split("/") - for aa, mods in peptidoform.parsed_sequence: - peptide_oms_str += aa - if isinstance(mods, list): - peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + try: + peptide_oms = oms.AASequence.fromString(peptide_oms_str) + return peptide_oms - c_term = peptidoform.properties["c_term"] - peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + except: + return - peptide_oms = oms.AASequence.fromString(peptide_oms_str) - return peptide_oms +# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: +# """ +# Parse a peptidoform object to pyOpenMS compatible format. + +# Parameter +# --------- +# Peptidoform: Peptidoform +# Peptide string in Peptidoform format + +# Returns +# ------- +# AASequence (pyOpenMS): +# A peptide sequence in pyOpenMS format +# int: +# charge of the peptide +# """ + +# n_term = peptidoform.properties["n_term"] +# peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" + +# for aa, mods in peptidoform.parsed_sequence: +# peptide_oms_str += aa +# if isinstance(mods, list): +# peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" + +# c_term = peptidoform.properties["c_term"] +# peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" + +# peptide_oms = oms.AASequence.fromString(peptide_oms_str) + +# return peptide_oms def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: @@ -318,6 +379,8 @@ def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: """ # Reformat peptide sequence in pyOpenMS format peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) + if peptide_oms is None: + return # Initialize the required objects to create the spectrum spectrum = oms.MSSpectrum() @@ -375,9 +438,16 @@ def calculate_hyperscore( ) if len(observed_intensity) == 0: logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") - return 0.0 + return theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) + # This is mainly the cause of the modification not being allowed according to pyopenms + # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids + # according to the unimod database + if theoretical_spectrum is None: + logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}') + return + observed_spectrum_oms = oms.MSSpectrum() observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) hyperscore = oms.HyperScore() @@ -388,3 +458,8 @@ def calculate_hyperscore( theo_spectrum=theoretical_spectrum, ) return result + + +########################### +### FRAG SITE FUNCTIONS ### +########################### From c5902c21c0c5c72e71bb64cafc17c15c28b50191 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 22 Nov 2024 13:36:41 +0100 Subject: [PATCH 21/23] add mumble psm filtering functionality --- ms2rescore/core.py | 10 +++++++++- ms2rescore/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 48e125bc..f314bef6 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -18,6 +18,7 @@ add_peptide_confidence, add_psm_confidence, ) +from ms2rescore.utils import filter_mumble_psms logger = logging.getLogger(__name__) @@ -102,6 +103,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) psm_list = psm_list[psms_with_features] + if "mumble" in config["psm_generator"]: + # Remove PSMS that have a less matched ions than the original hit + psm_list = filter_mumble_psms(psm_list) + # Write feature names to file _write_feature_names(feature_names, output_file_root) @@ -251,7 +256,10 @@ def _write_feature_names(feature_names, output_file_root): def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) + (psm_list["qvalue"] <= 0.01) + & (psm_list["rank"] <= max_rank) + & (~psm_list["is_decoy"]) + & ([metadata.get("original_psm", True) for metadata in psm_list["metadata"]]) ).sum() logger.info( f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 3515893a..33b1f0a1 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -4,8 +4,10 @@ from glob import glob from pathlib import Path from typing import Optional, Union +import numpy as np from ms2rescore.exceptions import MS2RescoreConfigurationError +from psm_utils import PSMList logger = logging.getLogger(__name__) @@ -93,3 +95,32 @@ def _is_minitdf(spectrum_file: str) -> bool: files = set(Path(spectrum_file).glob("*ms2spectrum.bin")) files.update(Path(spectrum_file).glob("*ms2spectrum.parquet")) return len(files) >= 2 + + +def filter_mumble_psms(psm_list: PSMList) -> PSMList: + """ + Filter out PSMs with mumble in the peptide sequence. + """ + keep = [None] * len(psm_list) + original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) + spectrum_indices = np.array([psm.spectrum for psm in psm_list]) + runs = np.array([psm.run for psm in psm_list]) + if "matched_ions_pct" in psm_list[0].rescoring_features: + matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] + else: + return psm_list + for i, psm in enumerate(psm_list): + if isinstance(keep[i], bool): + continue + elif original_hit[i]: + keep[i] = True + else: + original_matched_ions_pct = np.logical_and.reduce( + [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run] + ) + if original_matched_ions_pct > matched_ions[i]: + keep[i] = False + else: + keep[i] = True + logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.") + return psm_list[keep] From 5ce55f501d4a52fe6714a904d6402b90d8450613 Mon Sep 17 00:00:00 2001 From: SamvPy Date: Fri, 22 Nov 2024 15:05:03 +0100 Subject: [PATCH 22/23] remove pyopenms dependency for hyperscore calculation --- ms2rescore/feature_generators/ms2.py | 296 +++++++-------------------- 1 file changed, 75 insertions(+), 221 deletions(-) diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py index eb049ed8..df12ca95 100644 --- a/ms2rescore/feature_generators/ms2.py +++ b/ms2rescore/feature_generators/ms2.py @@ -138,20 +138,14 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List: if self.calculate_hyperscore: # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?) - mz_list, intensity_list = _annotated_spectrum_to_mzint( - annotated_spectrum=annotated_spectrum + b, y = get_by_fragments(annotated_spectrum) + hs = calculate_hyperscore( + n_y=len(y), + n_b=len(b), + y_ion_intensities=y, + b_ion_intensities=b ) - hyperscore = calculate_hyperscore( - psm=psm, observed_mz=mz_list, observed_intensity=intensity_list - ) - if hyperscore is None: - continue - psm.rescoring_features.update( - { - "hyperscore": hyperscore - } - ) - + psm.rescoring_features.update({'hyperscore': hs}) @staticmethod def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]: @@ -251,215 +245,75 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum): return annotated_spectrum.spectrum - -############################ -### HYPERSCORE FUNCTIONS ### -############################ -def _annotated_spectrum_to_mzint(annotated_spectrum, ion_types=["b", "y"]): - - mz_list = [] - intensity_list = [] - - for peak in annotated_spectrum: - - annotations = peak.annotation - for fragment in annotations: - ion = fragment.ion - if ion[0] not in ion_types: - continue - - mz_list.append(peak.experimental_mz) - intensity_list.append(peak.intensity) - break - - return mz_list, intensity_list - - -def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: - """ - Parse a peptide sequence in proforma format to pyOpenMS compatible format. - - Only supports UNIMOD format. - - Parameter - --------- - peptide: str - Peptide string in proforma format - - Returns - ------- - AASequence (pyOpenMS): - A peptide sequence in pyOpenMS format - """ - peptide = peptidoform.proforma - - # Reformat unimod modifications - pattern_unimod = r"\[UNIMOD:(\d+)\]" - - def replace_unimod(match): - return f"(UniMod:{match.group(1)})" - - peptide_oms_str = re.sub( - pattern=pattern_unimod, repl=replace_unimod, string=peptide - ) - - # Parse N-terminal modifications - if ")-" in peptide_oms_str: - peptide_oms_list = peptide_oms_str.split(")-") - nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] - nterm_modification += ")" - peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - elif "]-" in peptide_oms_str: - peptide_oms_list = peptide_oms_str.split("]-") - nterm_modification, peptide_oms_str = peptide_oms_list[-2], peptide_oms_list[-1] - nterm_modification += "]" - peptide_oms_str = "." + nterm_modification + peptide_oms_str + "." - - # Split the charge from the peptide string - if "/" in peptide_oms_str: - peptide_oms_str, _ = peptide_oms_str.split("/") - - try: - peptide_oms = oms.AASequence.fromString(peptide_oms_str) - return peptide_oms - - except: - return - - -# def _peptidoform_to_oms(peptidoform: Peptidoform) -> tuple[oms.AASequence, Optional[int]]: -# """ -# Parse a peptidoform object to pyOpenMS compatible format. - -# Parameter -# --------- -# Peptidoform: Peptidoform -# Peptide string in Peptidoform format - -# Returns -# ------- -# AASequence (pyOpenMS): -# A peptide sequence in pyOpenMS format -# int: -# charge of the peptide -# """ - -# n_term = peptidoform.properties["n_term"] -# peptide_oms_str = f"[{sum([mod.mass for mod in n_term])}]" if n_term else "" - -# for aa, mods in peptidoform.parsed_sequence: -# peptide_oms_str += aa -# if isinstance(mods, list): -# peptide_oms_str += f"[{sum([mod.mass for mod in mods])}]" - -# c_term = peptidoform.properties["c_term"] -# peptide_oms_str += f"[{sum([mod.mass for mod in c_term])}]" if c_term else "" - -# peptide_oms = oms.AASequence.fromString(peptide_oms_str) - -# return peptide_oms - - -def _peptidoform_to_theoretical_spectrum(peptidoform: str) -> oms.MSSpectrum: + +def factorial(n): """ - Create a theoretical spectrum from a peptide sequence. - - Parameter - --------- - peptide: str - Peptide sequence in proforma format - engine: str - The engine to use to create theoretical spectrum. - Can only be 'pyopenms' or 'spectrum-utils' (default) - - Return - ------ - MSSpectrum - Spectrum object in pyOpenMS format + Compute factorial of n using a loop. + Parameters: + n (int): Non-negative integer. + Returns: + int: Factorial of n. """ - # Reformat peptide sequence in pyOpenMS format - peptide_oms = _peptidoform_to_oms(peptidoform=peptidoform) - if peptide_oms is None: - return - - # Initialize the required objects to create the spectrum - spectrum = oms.MSSpectrum() - tsg = oms.TheoreticalSpectrumGenerator() - p = oms.Param() - - p.setValue("add_b_ions", "true") - p.setValue("add_metainfo", "true") - tsg.setParameters(param=p) - - # Create the theoretical spectrum - tsg.getSpectrum(spec=spectrum, peptide=peptide_oms, min_charge=1, max_charge=2) - return spectrum - - -def calculate_hyperscore( - psm: PSM, - observed_mz: List[float], - observed_intensity: List[float], - fragment_tol_mass=20, - fragment_tol_mode="ppm", -): + result = 1 + for i in range(1, n + 1): + result *= i + return result + +def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities): """ - Calculate the hyperscore as defined in the X!Tandem search engine. - - It is a metric of how good two spectra match with each other (matching peaks). - - Parameters - ---------- - psm: psm_utils.PSM - The PSM used to extract 'spectrum_id' (for MGF spectrum extraction) - and 'Peptidoform' (the peptide sequence) - observed_mz: List[float] - List of observed mz values with matching order as observed intensity - observed_intensity: List[float] - List of observed intensity values - fragment_tol_mass: int - The allowed tolerance to match peaks - fragment_tol_mode: str - 'ppm' for parts-per-million mode. 'Da' for fragment_tol_mass in Dalton. - Return - ------ - int - The hyperscore + Calculate the hyperscore for a peptide-spectrum match. + Parameters: + n_y (int): Number of matched y-ions. + n_b (int): Number of matched b-ions. + y_ion_intensities (list): Intensities of matched y-ions. + b_ion_intensities (list): Intensities of matched b-ions. + Returns: + float: Calculated hyperscore. """ - if fragment_tol_mode == "ppm": - fragment_mass_tolerance_unit_ppm = True - elif fragment_tol_mode == "Da": - fragment_mass_tolerance_unit_ppm = False - else: - raise Exception( - "fragment_tol_mode can only take 'Da' or 'ppm'. {} was provided.".format( - fragment_tol_mode - ) - ) - if len(observed_intensity) == 0: - logging.warning(f"PSM ({psm.spectrum_id}) has no annotated peaks.") - return - - theoretical_spectrum = _peptidoform_to_theoretical_spectrum(peptidoform=psm.peptidoform) - # This is mainly the cause of the modification not being allowed according to pyopenms - # pyOpenMS sets stringent criteria on modification being placed on annotated amino acids - # according to the unimod database - if theoretical_spectrum is None: - logging.warning(f'Peptidoform has either unsupported modifications or is being placed on non-allowed residue: {psm.peptidoform.proforma}') - return - - observed_spectrum_oms = oms.MSSpectrum() - observed_spectrum_oms.set_peaks([observed_mz, observed_intensity]) - hyperscore = oms.HyperScore() - result = hyperscore.compute( - fragment_mass_tolerance=fragment_tol_mass, - fragment_mass_tolerance_unit_ppm=fragment_mass_tolerance_unit_ppm, - exp_spectrum=observed_spectrum_oms, - theo_spectrum=theoretical_spectrum, - ) - return result - - -########################### -### FRAG SITE FUNCTIONS ### -########################### + # Calculate the product of y-ion and b-ion intensities + product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1 + product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1 + + # Calculate factorial using custom function + factorial_y = factorial(n_y) + factorial_b = factorial(n_b) + + # Compute hyperscore + hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b)) + return hyperscore + +def infer_fragment_identity(frag, allow_ion_types=['b', 'y']): + ion = frag.ion + + is_allowed = False + for allowed_ion_type in allow_ion_types: + if allowed_ion_type in ion: + is_allowed=True + break + + if not is_allowed: + return False + # Radicals + if "·" in ion: + return False + if frag.neutral_loss is not None: + return False + if frag.charge > 2: + return False + + return ion[0] + +def get_by_fragments(annotated_spectrum): + b_intensities = [] + y_intensities = [] + for peak in annotated_spectrum: + + for fragment in peak.annotation: + + ion_type = infer_fragment_identity(fragment) + + if ion_type == 'b': + b_intensities.append(peak.intensity) + if ion_type == 'y': + y_intensities.append(peak.intensity) + return b_intensities, y_intensities \ No newline at end of file From 986c5f612bbd46d4f5f8f25e5d95425cde470e31 Mon Sep 17 00:00:00 2001 From: ArthurDeclercq Date: Fri, 22 Nov 2024 16:00:42 +0100 Subject: [PATCH 23/23] fix spectrum_id accession --- ms2rescore/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 33b1f0a1..7897b924 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -103,7 +103,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList: """ keep = [None] * len(psm_list) original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]]) - spectrum_indices = np.array([psm.spectrum for psm in psm_list]) + spectrum_indices = np.array([psm.spectrum_id for psm in psm_list]) runs = np.array([psm.run for psm in psm_list]) if "matched_ions_pct" in psm_list[0].rescoring_features: matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list] @@ -116,7 +116,7 @@ def filter_mumble_psms(psm_list: PSMList) -> PSMList: keep[i] = True else: original_matched_ions_pct = np.logical_and.reduce( - [original_hit, spectrum_indices == psm.spectrum_index, runs == psm.run] + [original_hit, spectrum_indices == psm.spectrum_id, runs == psm.run] ) if original_matched_ions_pct > matched_ions[i]: keep[i] = False