From e16a3b102a9c920c1d0d3db54e33c185c8ec7f87 Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 12 Oct 2023 15:37:17 +0200 Subject: [PATCH] PR review --- README.md | 6 +- ms2rescore/config_parser.py | 2 +- ms2rescore/core.py | 28 ++--- ms2rescore/feature_generators/deeplc.py | 108 +++++++++--------- ms2rescore/gui/__main__.py | 1 + ms2rescore/gui/app.py | 54 ++++----- ms2rescore/gui/function2ctk.py | 9 +- ms2rescore/parse_psms.py | 15 ++- ms2rescore/parse_spectra.py | 146 +++++++++++------------- ms2rescore/rescoring_engines/mokapot.py | 5 +- ms2rescore/utils.py | 2 +- 11 files changed, 177 insertions(+), 199 deletions(-) diff --git a/README.md b/README.md index 2adfdb17..991bb436 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti files: - [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv` +- [Sage](https://github.com/lazear/sage) `.sage.tsv` - [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid` - [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid` +- [Mascot](https://www.matrixscience.com/) `.mzid` - [MaxQuant](https://www.maxquant.org/) `msms.txt` - [X!Tandem](https://www.thegpm.org/tandem/) `.xml` - [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid` @@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to > **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.** > Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels. -> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) > +> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) **Original publication describing the concept of rescoring with predicted spectra:** > **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.** > Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve. -> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) > +> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) To replicate the experiments described in this article, check out the [publication branch][publication-branch] of the repository. diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index 0d254fd3..ced80c28 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -69,7 +69,7 @@ def _validate_filenames(config: Dict) -> Dict: config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0] ) - # Parse config_file as posix path #TODO: Is this necessary? + # Parse config_file as posix path to avoid combination of forward and backward slashes if config["ms2rescore"]["config_file"]: config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix() diff --git a/ms2rescore/core.py b/ms2rescore/core.py index b27d7f50..aee29c36 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -8,9 +8,9 @@ from ms2rescore.feature_generators import FEATURE_GENERATORS from ms2rescore.parse_psms import parse_psms +from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator -from ms2rescore.parse_spectra import get_missing_values logger = logging.getLogger(__name__) @@ -27,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``. """ - config = configuration["ms2rescore"] # TODO: Remove top-level key? + config = configuration["ms2rescore"] output_file_root = config["output_path"] # Write full configuration including defaults to file @@ -54,23 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - if ("deeplc" in config["feature_generators"] and None in psm_list["retention_time"]) or ( - "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] - ): - logger.warning( - "One or more PSMs are missing retention time and/or ion mobility values. These will be " - "parsed from the spectrum file." - ) - get_missing_values( - config, - psm_list, - missing_rt_values=( - "deeplc" in config["feature_generators"] and None in psm_list["retention_time"] - ), - missing_im_values=( - "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] - ), - ) + # TODO: avoid hard coding feature generators in some way + rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"] + im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): # TODO: Handle this somewhere else, more generally? diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index cb6f6a40..50b577ff 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -28,9 +28,7 @@ from psm_utils import PSMList from psm_utils.io import peptide_record -from ms2rescore.exceptions import MS2RescoreError from ms2rescore.feature_generators.base import FeatureGeneratorBase -from ms2rescore.utils import infer_spectrum_path os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) @@ -146,59 +144,65 @@ def add_features(self, psm_list: PSMList) -> None: f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." ) - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - psm_list_calibration = self._get_calibration_psms(psm_list_run) - - logger.debug("Calibrating DeepLC") - self.deeplc_predictor = self.DeepLC( - n_jobs=self.processes, - verbose=self._verbose, - path_model=self.selected_model or self.user_model, - **self.deeplc_kwargs, - ) - self.deeplc_predictor.calibrate_preds( - seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration) - ) - # Still calibrate for each run, but do not try out all model options. - # Just use model that was selected based on first run - if not self.selected_model: - self.selected_model = list(self.deeplc_predictor.model.keys()) - self.deeplc_kwargs["deeplc_retrain"] = False - logger.debug( - f"Selected DeepLC model {self.selected_model} based on " - "calibration of first run. Using this model (after new " - "calibrations) for the remaining runs." + # Disable wild logging to stdout by Tensorflow, unless in debug mode + with contextlib.redirect_stdout( + open(os.devnull, "w") + ) if not self._verbose else contextlib.nullcontext(): + # Make new PSM list for this run (chain PSMs per spectrum to flat list) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + + logger.debug("Calibrating DeepLC...") + psm_list_calibration = self._get_calibration_psms(psm_list_run) + self.deeplc_predictor = self.DeepLC( + n_jobs=self.processes, + verbose=self._verbose, + path_model=self.selected_model or self.user_model, + **self.deeplc_kwargs, ) - - predictions = np.array( - self.deeplc_predictor.make_preds( - seq_df=self._psm_list_to_deeplc_peprec(psm_list_run) - ) - ) - observations = psm_list_run["retention_time"] - rt_diffs_run = np.abs(predictions - observations) - - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "observed_retention_time": observations[i], - "predicted_retention_time": predictions[i], - "rt_diff": rt_diffs_run[i], - } + self.deeplc_predictor.calibrate_preds( + seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration) ) - peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge - if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]: - peptide_rt_diff_dict[peptide] = { - "observed_retention_time_best": observations[i], - "predicted_retention_time_best": predictions[i], - "rt_diff_best": rt_diffs_run[i], - } - for psm in psm_list_run: - psm["rescoring_features"].update( - peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] + # Still calibrate for each run, but do not try out all model options. + # Just use model that was selected based on first run + if not self.selected_model: + self.selected_model = list(self.deeplc_predictor.model.keys()) + self.deeplc_kwargs["deeplc_retrain"] = False + logger.debug( + f"Selected DeepLC model {self.selected_model} based on " + "calibration of first run. Using this model (after new " + "calibrations) for the remaining runs." + ) + + logger.debug("Predicting retention times...") + predictions = np.array( + self.deeplc_predictor.make_preds( + seq_df=self._psm_list_to_deeplc_peprec(psm_list_run) + ) ) - current_run += 1 + observations = psm_list_run["retention_time"] + rt_diffs_run = np.abs(predictions - observations) + + logger.debug("Adding features to PSMs...") + for i, psm in enumerate(psm_list_run): + psm["rescoring_features"].update( + { + "observed_retention_time": observations[i], + "predicted_retention_time": predictions[i], + "rt_diff": rt_diffs_run[i], + } + ) + peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge + if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]: + peptide_rt_diff_dict[peptide] = { + "observed_retention_time_best": observations[i], + "predicted_retention_time_best": predictions[i], + "rt_diff_best": rt_diffs_run[i], + } + for psm in psm_list_run: + psm["rescoring_features"].update( + peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] + ) + current_run += 1 # TODO: Remove when DeepLC supports PSMList directly @staticmethod diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index 03ccd0e8..429e117f 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -10,6 +10,7 @@ def main(): """Entrypoint for MS²Rescore GUI.""" multiprocessing.freeze_support() + # Redirect stdout when running GUI (packaged app might not have console attached) with contextlib.redirect_stdout(open(os.devnull, "w")): app() diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py index c322a77e..3196659e 100644 --- a/ms2rescore/gui/app.py +++ b/ms2rescore/gui/app.py @@ -8,9 +8,9 @@ import webbrowser from pathlib import Path from typing import Dict, List, Tuple -from joblib import parallel_backend import customtkinter as ctk +from joblib import parallel_backend from ms2pip.constants import MODELS as ms2pip_models from PIL import Image from psm_utils.io import FILETYPES @@ -41,6 +41,8 @@ pass ctk.set_default_color_theme(_THEME_FILE) + +# TODO Does this disable multiprocessing everywhere? parallel_backend("threading") @@ -163,18 +165,11 @@ def get(self): main_config = self.main_config.get() advanced_config = self.advanced_config.get() - # TODO Move to rescoring engine config - percolator_config = {"init-weights": advanced_config.pop("weightsfile")} - config = {"ms2rescore": main_config} config["ms2rescore"].update(advanced_config) config["ms2rescore"]["feature_generators"] = self.fgen_config.get() config["ms2rescore"]["rescoring_engine"] = self.rescoring_engine_config.get() - # TODO See above - if "percolator" in config["ms2rescore"]["rescoring_engine"]: - config["ms2rescore"]["rescoring_engine"]["percolator"] = percolator_config - args = (config,) # Comma required to wrap in tuple kwargs = {} @@ -284,11 +279,13 @@ def __init__(self, *args, **kwargs): def get(self) -> Dict: """Get the configured values as a dictionary.""" try: + # there cannot be spaces in the file path + # TODO: Fix this in widgets.LabeledFileSelect psm_files = self.psm_file.get().split(" ") except AttributeError: raise MS2RescoreConfigurationError("No PSM file provided. Please select a file.") return { - "psm_file": psm_files, # there cannot be spaces in the file path + "psm_file": psm_files, "psm_file_type": self.psm_file_type.get(), } @@ -321,11 +318,6 @@ def __init__(self, *args, **kwargs): self.spectrum_id_pattern = widgets.LabeledEntry(self, label="Spectrum ID regex pattern") self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew") - self.weightsfile = widgets.LabeledFileSelect( - self, label="Pretrained Percolator weights", file_option="openfile" - ) - self.weightsfile.grid(row=6, column=0, columnspan=2, sticky="nsew") - self.file_prefix = widgets.LabeledFileSelect( self, label="Filename for output files", file_option="savefile" ) @@ -344,7 +336,6 @@ def get(self) -> Dict: "id_decoy_pattern": self.id_decoy_pattern.get(), "psm_id_pattern": self.psm_id_pattern.get(), "spectrum_id_pattern": self.spectrum_id_pattern.get(), - "weightsfile": self.weightsfile.get(), "output_path": self.file_prefix.get(), "config_file": self.config_file.get(), "write_report": self.generate_report.get(), @@ -466,7 +457,7 @@ def __init__(self, *args, **kwargs): self.num_epochs = widgets.LabeledFloatSpinbox( self, - label="Number of epochs", + label="Number of transfer learning epochs", step_size=5, initial_value=20, ) # way to remove float in spinbox label? @@ -569,25 +560,29 @@ def __init__(self, *args, **kwargs): self.configure(fg_color="transparent") self.grid_columnconfigure(0, weight=1) - self.title = widgets.Heading(self, text="Mokapot cofiguration") + self.title = widgets.Heading(self, text="Mokapot coffeeguration") self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew") - self.write_weights = widgets.LabeledSwitch(self, label="Write weightsfile", default=True) + self.write_weights = widgets.LabeledSwitch( + self, label="Write model weights to file", default=True + ) self.write_weights.grid(row=1, column=0, pady=(0, 10), sticky="nsew") - self.write_txt = widgets.LabeledSwitch(self, label="Write txt output file", default=True) + self.write_txt = widgets.LabeledSwitch(self, label="Write TXT output files", default=True) self.write_txt.grid(row=2, column=0, pady=(0, 10), sticky="nsew") - self.write_flashlfq = widgets.LabeledSwitch(self, label="Write flashlfq", default=False) + self.write_flashlfq = widgets.LabeledSwitch( + self, label="Write file for FlashLFQ", default=False + ) self.write_flashlfq.grid(row=3, column=0, pady=(0, 10), sticky="nsew") self.protein_kwargs = widgets.TableInput( self, - label="mokapot protein kwargs", + label="`mokapot.read_fasta` options (see Mokapot documentation)", columns=2, - header_labels=["keyword", "value"], + header_labels=["Parameter", "Value"], ) - self.protein_kwargs.grid(row=4, column=0, sticky="new") # leave this in? + self.protein_kwargs.grid(row=4, column=0, sticky="nsew") def get(self) -> Dict: """Return the configuration as a dictionary.""" @@ -617,12 +612,17 @@ def __init__(self, *args, **kwargs): self.configure(fg_color="transparent") self.grid_columnconfigure(0, weight=1) - self.title = widgets.Heading(self, text="Percolator cofiguration") + self.title = widgets.Heading(self, text="Percolator configuration") self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew") + self.weights_file = widgets.LabeledFileSelect( + self, label="Pretrained Percolator model weights", file_option="openfile" + ) + self.weights_file.grid(row=1, column=0, columnspan=2, sticky="nsew") + def get(self) -> Dict: """Return the configuration as a dictionary.""" - config = {} + config = {"init-weights": self.weights_file.get()} return config @@ -641,8 +641,8 @@ def app(): function=function, ) root.protocol("WM_DELETE_WINDOW", sys.exit) - root.geometry(f"{1250}x{700}") - root.minsize(1000, 700) + dpi = root.winfo_fpixels("1i") + root.geometry(f"{int(15*dpi)}x{int(10*dpi)}") root.title("MS²Rescore") root.wm_iconbitmap(os.path.join(str(_IMG_DIR), "program_icon.ico")) diff --git a/ms2rescore/gui/function2ctk.py b/ms2rescore/gui/function2ctk.py index 9ffe38bd..60bad120 100644 --- a/ms2rescore/gui/function2ctk.py +++ b/ms2rescore/gui/function2ctk.py @@ -10,7 +10,6 @@ import customtkinter as ctk - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -51,13 +50,9 @@ def __init__( self.function = function - # # App config - self.geometry(f"{1250}x{700}") - self.minsize(1000, 700) - # 2x3 grid, only logging column expands with window - self.grid_columnconfigure(0, weight=0, minsize=500) # Left: Sidebar - self.grid_columnconfigure(1, weight=0, minsize=1000) # Middle: Configuration + self.grid_columnconfigure(0, weight=0) # Left: Sidebar + self.grid_columnconfigure(1, weight=2) # Middle: Configuration self.grid_columnconfigure(2, weight=1) # Right: Logging self.grid_rowconfigure(0, weight=1) diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 8ee47a7e..8cf1b19a 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,16 +1,17 @@ import logging import re from typing import Dict, Union +from itertools import chain import psm_utils.io from psm_utils import PSMList -from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError +from ms2rescore.exceptions import MS2RescoreConfigurationError logger = logging.getLogger(__name__) -def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList: +def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: """ Parse PSMs and prepare for rescoring. @@ -21,8 +22,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s top-level key). psm_list PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``. - output_file_root - Path to output file root (without file extension). #TODO doesn't get used? """ # Read PSMs, find decoys, calculate q-values @@ -61,12 +60,12 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s def _read_psms(config, psm_list): if isinstance(psm_list, PSMList): - logger.info("Reading PSMs...") return psm_list else: + logger.info("Reading PSMs from file...") current_file = 1 total_files = len(config["psm_file"]) - all_psms = [] + psm_list_list = [] for psm_file in config["psm_file"]: logger.info( f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..." @@ -86,10 +85,10 @@ def _read_psms(config, psm_list): " for more information." ) - all_psms = all_psms + id_file_psm_list.psm_list + psm_list_list.append(id_file_psm_list) current_file += 1 - return PSMList(psm_list=all_psms) + return PSMList(psm_list=chain.from_iterable(p.psm_list for p in psm_list_list)) def _find_decoys(config, psm_list): diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index d10a436f..130d0d5c 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -2,53 +2,37 @@ import logging import re -from typing import Union, Tuple, Dict +from itertools import chain +from typing import Dict, Tuple -from rich.progress import track +from psm_utils import PSMList from pyteomics.mgf import MGF from pyteomics.mzml import MzML -from itertools import chain -from pathlib import Path +from rich.progress import track from ms2rescore.exceptions import MS2RescoreError from ms2rescore.utils import infer_spectrum_path -from psm_utils import PSMList logger = logging.getLogger(__name__) -class ParseMGFError(MS2RescoreError): - """Error parsing MGF file.""" - - pass - - -class ParsingError(MS2RescoreError): - """Error parsing retention time from spectrum file.""" - - pass - - -def get_missing_values(config, psm_list, missing_rt_values=False, missing_im_values=False): - """Get missing features from spectrum file.""" - logger.info("Parsing missing values from spectrum file.") +def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): + """Get missing RT/IM features from spectrum file.""" + logger.debug("Extracting missing RT/IM values from spectrum file(s).") psm_dict = psm_list.get_psm_dict() - logger.debug(f"Extracting missing values from spectrum files.") for runs in psm_dict.values(): - for run, psms in track(runs.items(), description="Parsing missing values"): + for run, psms in track(runs.items(), description="Extracting RT/IM values..."): psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_file = infer_spectrum_path(config["spectrum_path"], run) - if isinstance(spectrum_file, str): - spectrum_file = Path(spectrum_file) if spectrum_file.suffix.lower() == ".mzml": rt_dict, im_dict = _parse_values_from_mzml( - spectrum_file, config, run, missing_rt_values, missing_im_values + spectrum_file, config, run, missing_rt, missing_im ) elif spectrum_file.suffix.lower() == ".mgf": rt_dict, im_dict = _parse_values_from_mgf( - spectrum_file, config, run, missing_rt_values, missing_im_values + spectrum_file, config, run, missing_rt, missing_im ) for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]): @@ -62,90 +46,92 @@ def get_missing_values(config, psm_list, missing_rt_values=False, missing_im_val def _parse_values_from_mgf( - spectrum_file, config, run, missing_rt_values, missing_im_values + spectrum_file, config, run, missing_rt, missing_im ) -> Tuple[Dict, Dict]: - """Parse retention time and/or ion mobility from MGF file.""" + """ + Parse retention time and/or ion mobility from an MGF file. + + Notes + ----- + - Extracting values (e.g., ion mobility) according to the Matrix documentation: + http://www.matrixscience.com/help/data_file_help.html + + """ rt_dict = {} im_dict = {} + spectrum_id_pattern = re.compile( + config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" + ) + for spectrum in MGF(str(spectrum_file)): - if missing_rt_values: + matched_id = spectrum_id_pattern.match(spectrum["id"]).group() + if missing_rt: try: - rt_dict[ - re.match( - config["spectrum_id_pattern"] - if config["spectrum_id_pattern"] - else r"(.*)", - spectrum["params"]["title"], - ).group() - ] = float(spectrum["params"]["rtinseconds"]) + rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"]) except KeyError: raise ParsingError( - f"Could not parse retention time key `rtinsecondes` from spectrum file for run {run}." - "Please make sure that the retention time key is present in the spectrum file." + "Could not parse retention time (`rtinseconds`) from spectrum file for " + f"run {run}. Please make sure that the retention time key is present in the " + "spectrum file or disable the relevant feature generator." ) - if missing_im_values: + if missing_im: try: - im_dict[ - re.match( - config["spectrum_id_pattern"] - if config["spectrum_id_pattern"] - else r"(.*)", - spectrum["params"]["title"], - ).group() - ] = float( - spectrum["params"]["ion_mobility"] - ) # http://www.matrixscience.com/help/data_file_help.html + im_dict[matched_id] = float(spectrum["params"]["ion_mobility"]) except KeyError: raise ParsingError( - f"Could not parse ion mobility key `ionmobility` from spectrum file for run {run}." - "Please make sure that the ion mobility key is present in the spectrum file." + "Could not parse ion mobility (`ion_mobility`) from spectrum file " + f"for run {run}. Please make sure that the ion mobility key is present in the " + "spectrum file or disable the relevant feature generator." ) return rt_dict, im_dict def _parse_values_from_mzml( - spectrum_file, config, run, missing_rt_values, missing_im_values + spectrum_file, config, run, missing_rt, missing_im ) -> Tuple[Dict, Dict]: - """Parse retention time and/or ion mobility from MGF file.""" + """Parse retention time and/or ion mobility from an mzML file.""" rt_dict = {} im_dict = {} + spectrum_id_pattern = re.compile( + config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" + ) + for spectrum in MzML(str(spectrum_file)): - if missing_rt_values: + matched_id = spectrum_id_pattern.match(spectrum["id"]).group() + if missing_rt: try: - rt_dict[ - re.match( - config["spectrum_id_pattern"] - if config["spectrum_id_pattern"] - else r"(.*)", - spectrum["id"], - ).group() - ] = float( - spectrum["scanList"]["scan"][0][ - "scan start time" - ] # is rt in minutes by default? - ) + rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"]) except KeyError: raise ParsingError( - f"Could not parse retention time key `scan start time` from spectrum file for run {run}." - "Please make sure that the retention time key is present in the spectrum file." + "Could not parse retention time (`scan start time`) from spectrum file for " + f"run {run}. Please make sure that the retention time key is present in the " + "spectrum file or disable the relevant feature generator." ) - if missing_im_values: + if missing_im: try: - im_dict[ - re.match( - config["spectrum_id_pattern"] - if config["spectrum_id_pattern"] - else r"(.*)", - spectrum["id"], - ).group() - ] = float(spectrum["scanList"]["scan"][0]["reverse ion mobility"]) + im_dict[matched_id] = float( + spectrum["scanList"]["scan"][0]["reverse ion mobility"] + ) except KeyError: raise ParsingError( - f"Could not parse ion mobility key `reverse ion mobility` from spectrum file for run {run}." - "Please make sure that the ion mobility key is present in the spectrum file." + "Could not parse ion mobility (`reverse ion mobility`) from spectrum file " + f"for run {run}. Please make sure that the ion mobility key is present in the " + "spectrum file or disable the relevant feature generator." ) return rt_dict, im_dict + + +class ParseMGFError(MS2RescoreError): + """Error parsing MGF file.""" + + pass + + +class ParsingError(MS2RescoreError): + """Error parsing retention time from spectrum file.""" + + pass diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index 58cea069..f3927f47 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -84,11 +84,11 @@ def rescore( # Add proteins if fasta_file: - logger.debug(f"Mokapot read fasta keyword arguments : {protein_kwargs}") + logger.debug(f"Adding protein info from {fasta_file} with options: `{protein_kwargs}`") lin_psm_data.add_proteins(fasta_file, **protein_kwargs) # Rescore - logger.debug(f"Mokapot brew keyword arguments : {kwargs}") + logger.debug(f"Mokapot brew options: `{kwargs}`") confidence_results, models = brew(lin_psm_data, **kwargs) # Reshape confidence estimates to match PSMList @@ -124,6 +124,7 @@ def rescore( if write_txt: confidence_results.to_txt(file_root=output_file_root, decoys=True) if write_flashlfq: + # TODO: How do we validate that the RTs are in minutes? confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60 confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt") diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 26c88a7e..70417b56 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -75,4 +75,4 @@ def infer_spectrum_path( "files." ) - return resolved_path + return Path(resolved_path)