diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index c03b1667..a6a65b9e 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: "3.11" diff --git a/Dockerfile b/Dockerfile index 59fc4000..2a51f7f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,10 @@ -FROM ubuntu:focal +FROM python:3.10 + +# ARG DEBIAN_FRONTEND=noninteractive LABEL name="ms2rescore" -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore +# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore ADD pyproject.toml /ms2rescore/pyproject.toml ADD LICENSE /ms2rescore/LICENSE @@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in ADD ms2rescore /ms2rescore/ms2rescore RUN apt-get update \ - && apt-get install -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \ - && rm -rf /var/lib/apt/lists/* \ - && pip3 install ms2rescore/ + && apt install -y procps git-lfs \ + && pip install /ms2rescore ENTRYPOINT [""] diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index 2b523aa5..1953cb78 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -10,6 +10,7 @@ - **`deeplc`**: Refer to *[#/definitions/deeplc](#definitions/deeplc)*. - **`maxquant`**: Refer to *[#/definitions/maxquant](#definitions/maxquant)*. - **`ionmob`**: Refer to *[#/definitions/ionmob](#definitions/ionmob)*. + - **`im2deep`**: Refer to *[#/definitions/im2deep](#definitions/im2deep)*. - **`rescoring_engine`** *(object)*: Rescoring engine to use and its configuration. Leave empty to skip rescoring and write features to file. Default: `{"mokapot": {}}`. 
- **`.*`**: Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. - **`percolator`**: Refer to *[#/definitions/percolator](#definitions/percolator)*. @@ -43,6 +44,14 @@ - **One of** - *string* - *null* + - **`psm_id_rt_pattern`**: Regex pattern to extract retention time from psm identifier. Requires at least one capturing group. Default: `null`. + - **One of** + - *string* + - *null* + - **`psm_id_im_pattern`**: Regex pattern to extract ion mobility from psm identifier. Requires at least one capturing group. Default: `null`. + - **One of** + - *string* + - *null* - **`psm_id_pattern`**: Regex pattern to extract index or scan number from PSM file. Requires at least one capturing group. Default: `"(.*)"`. - **One of** - *string* @@ -75,6 +84,8 @@ - **`ionmob_model`** *(string)*: Path to Ionmob model directory. Default: `"GRUPredictor"`. - **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`. - **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`. +- **`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*. + - **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`. - **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`. - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`. 
diff --git a/ms2rescore/core.py b/ms2rescore/core.py index fffb1902..e5ae26f1 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -8,7 +8,7 @@ from ms2rescore.feature_generators import FEATURE_GENERATORS from ms2rescore.parse_psms import parse_psms -from ms2rescore.parse_spectra import get_missing_values +from ms2rescore.parse_spectra import fill_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator @@ -55,11 +55,17 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) # TODO: avoid hard coding feature generators in some way - rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"] - im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] + rt_required = ("deeplc" in config["feature_generators"]) and ( + None in psm_list["retention_time"] + ) + im_required = any(fg in config["feature_generators"] for fg in ("ionmob", "im2deep")) and ( + None in psm_list["ion_mobility"] + ) + logger.debug(f"RT required: {rt_required}, IM required: {im_required}") + if rt_required or im_required: logger.info("Parsing missing retention time and/or ion mobility values from spectra...") - get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py index 9424448a..52440a5f 100644 --- a/ms2rescore/feature_generators/__init__.py +++ b/ms2rescore/feature_generators/__init__.py @@ -7,6 +7,7 @@ from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator +from 
ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator FEATURE_GENERATORS = { "basic": BasicFeatureGenerator, @@ -14,4 +15,5 @@ "deeplc": DeepLCFeatureGenerator, "maxquant": MaxQuantFeatureGenerator, "ionmob": IonMobFeatureGenerator, + "im2deep": IM2DeepFeatureGenerator, } diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py new file mode 100644 index 00000000..508ffbf2 --- /dev/null +++ b/ms2rescore/feature_generators/im2deep.py @@ -0,0 +1,228 @@ +""" +IM2Deep ion mobility-based feature generator. + +IM2Deep is a fully modification-aware peptide ion mobility predictor. It uses a deep convolutional +neural network to predict CCS values based on the atomic composition of the (modified) amino +acid residues in the peptide. See +`github.com/compomics/IM2Deep <https://github.com/compomics/IM2Deep>`_ for more information. + +""" + +import contextlib +import logging +import os +from inspect import getfullargspec +from itertools import chain +from typing import List, Optional + +import numpy as np +from im2deep.im2deep import predict_ccs +from psm_utils import PSMList + +from ms2rescore.feature_generators.base import FeatureGeneratorBase + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" +logger = logging.getLogger(__name__) + + +class IM2DeepFeatureGenerator(FeatureGeneratorBase): + """IM2Deep collision cross section feature generator.""" + + def __init__( + self, + *args, + lower_score_is_better: bool = False, + spectrum_path: Optional[str] = None, + processes: int = 1, + calibrate_per_charge: bool = True, + **kwargs, + ): + """ + Initialize the IM2DeepFeatureGenerator. + + Parameters + ---------- + lower_score_is_better : bool, optional + A boolean indicating whether lower scores are better for the generated features. + spectrum_path : str or None, optional + Optional path to the spectrum file used for IM2Deep predictions. + processes : int, optional + Number of parallel processes to use for IM2Deep predictions. 
+ calibrate_per_charge : bool, optional + A boolean indicating whether to calibrate CCS values per charge state. + **kwargs : dict, optional + Additional keyword arguments. + + Returns + ------- + None + """ + super().__init__(*args, **kwargs) + self.lower_score_is_better = lower_score_is_better + self.spectrum_path = spectrum_path + self.processes = processes + self.deeplc_kwargs = kwargs or {} + + self._verbose = logger.getEffectiveLevel() <= logging.DEBUG + + # Lazy-load DeepLC + from deeplc import DeepLC + + self.im2deep = DeepLC + + # Remove any kwargs that are not DeepLC arguments + self.im2deep_kwargs = { + k: v for k, v in self.deeplc_kwargs.items() if k in getfullargspec(DeepLC).args + } + self.im2deep_kwargs.update({"config_file": None}) + + # TODO: Implement im2deep_retrain? + + self.im2deep_predictor = None + self.calibrate_per_charge = calibrate_per_charge + + @property + def feature_names(self) -> List[str]: + return [ + "ccs_observed_im2deep", + "ccs_predicted_im2deep", + "ccs_error_im2deep", + "abs_ccs_error_im2deep", + "perc_ccs_error_im2deep", + ] + + def add_features(self, psm_list: PSMList) -> None: + """Add IM2Deep-derived features to PSMs""" + + logger.info("Adding IM2Deep-derived features to PSMs") + + # Get easy-access nested version of PSMlist + psm_dict = psm_list.get_psm_dict() + + # Run IM2Deep for each spectrum file + current_run = 1 + total_runs = sum(len(runs) for runs in psm_dict.values()) + + for runs in psm_dict.values(): + # Reset IM2Deep predictor for each collection of runs + self.im2deep_predictor = None + self.selected_model = None + for run, psms in runs.items(): + logger.info( + f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..." 
+ ) + + # Disable wild logging to stdout by TensorFlow, unless in debug mode + with ( + contextlib.redirect_stdout(open(os.devnull, "w")) + if not self._verbose + else contextlib.nullcontext() + ): + # Make new PSM list for this run (chain PSMs per spectrum to flat list) + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + + logger.debug("Calibrating IM2Deep...") + + # Convert ion mobility to CCS and calibrate CCS values + psm_list_run_df = psm_list_run.to_dataframe() + psm_list_run_df["charge"] = [ + peptidoform.precursor_charge + for peptidoform in psm_list_run_df["peptidoform"] + ] + psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( + lambda x: self.im2ccs( + x["ion_mobility"], + x["precursor_mz"], # TODO: Why does ionmob use calculated mz? + x["charge"], + ), + axis=1, + ) + + # Create dataframe with high confidence hits for calibration + cal_psm_df = self.make_cal_df(psm_list_run_df) + + # Make predictions with IM2Deep + logger.debug("Predicting CCS values...") + calibrated_predictions = predict_ccs( + psm_list_run, cal_psm_df, write_output=False + ) + + # Add features to PSMs + logger.debug("Adding features to PSMs...") + predictions = calibrated_predictions + observations = psm_list_run_df["ccs_observed"] + ccs_diffs_run = predictions - observations  # signed; abs() is applied below + for i, psm in enumerate(psm_list_run): + psm["rescoring_features"].update( + { + "ccs_observed_im2deep": observations[i], + "ccs_predicted_im2deep": predictions[i], + "ccs_error_im2deep": ccs_diffs_run[i], + "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]), + "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i]) + / observations[i] + * 100, + } + ) + + current_run += 1 + + def im2ccs(self, reverse_im, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15): + """ + Convert ion mobility to CCS. + + Parameters + ---------- + reverse_im : float + Reduced ion mobility. + mz : float + Precursor m/z. + charge : int + Precursor charge. 
+ mass_gas : float, optional + Mass of gas, by default 28.013 + temp : float, optional + Temperature in Celsius, by default 31.85 + t_diff : float, optional + Factor to convert Celsius to Kelvin, by default 273.15 + + Notes + ----- + Adapted from theGreatHerrLebert/ionmob (https://doi.org/10.1093/bioinformatics/btad486) + + """ + + SUMMARY_CONSTANT = 18509.8632163405 + reduced_mass = (mz * charge * mass_gas) / (mz * charge + mass_gas) + return (SUMMARY_CONSTANT * charge) / ( + np.sqrt(reduced_mass * (temp + t_diff)) * 1 / reverse_im + ) + + # TODO: replace threshold by identified psms? + def make_cal_df(self, psm_list_df, threshold=0.95): + """Make dataframe for calibration of IM2Deep predictions. + + Parameters + ---------- + psm_list_df : pd.DataFrame + DataFrame with PSMs. + threshold : float, optional + Threshold for high confidence hits, by default 0.95. + + Returns + ------- + pd.DataFrame + DataFrame with high confidence hits for calibration.""" + + psm_list_df = psm_list_df[ + psm_list_df["charge"] < 5 + ] # predictions do not go higher for IM2Deep + high_conf_hits = list( + psm_list_df["spectrum_id"][psm_list_df["score"].rank(pct=True) > threshold] + ) + logger.debug( + f"Number of high confidence hits for calculating shift: {len(high_conf_hits)}" + ) + # Filter df for high_conf_hits + cal_psm_df = psm_list_df[psm_list_df["spectrum_id"].isin(high_conf_hits)] + return cal_psm_df diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py index c436a902..4d6ca98f 100644 --- a/ms2rescore/gui/app.py +++ b/ms2rescore/gui/app.py @@ -359,15 +359,20 @@ def __init__(self, *args, **kwargs): self.deeplc_config = DeepLCConfiguration(self) self.deeplc_config.grid(row=2, column=0, pady=(0, 20), sticky="nsew") + self.im2deep_config = Im2DeepConfiguration(self) + self.im2deep_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew") + self.ionmob_config = IonmobConfiguration(self) - self.ionmob_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew") + 
self.ionmob_config.grid(row=4, column=0, pady=(0, 20), sticky="nsew") def get(self) -> Dict: """Return the configuration as a dictionary.""" basic_enabled, basic_config = self.basic_config.get() ms2pip_enabled, ms2pip_config = self.ms2pip_config.get() deeplc_enabled, deeplc_config = self.deeplc_config.get() + im2deep_enabled, im2deep_config = self.im2deep_config.get() ionmob_enabled, ionmob_config = self.ionmob_config.get() + config = {} if basic_enabled: config["basic"] = basic_config @@ -522,6 +527,27 @@ def get(self) -> Dict: return enabled, config +class Im2DeepConfiguration(ctk.CTkFrame): + def __init__(self, *args, **kwargs): + """IM2Deep configuration frame.""" + super().__init__(*args, **kwargs) + + self.configure(fg_color="transparent") + self.grid_columnconfigure(0, weight=1) + + self.title = widgets.Heading(self, text="im2deep") + self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew") + + self.enabled = widgets.LabeledSwitch(self, label="Enable im2deep", default=False) + self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew") + + def get(self) -> Dict: + """Return the configuration as a dictionary.""" + enabled = self.enabled.get() + config = {} + return enabled, config + + class RescoringEngineConfig(ctk.CTkFrame): def __init__(self, *args, **kwargs): """Rescoring engine configuration frame.""" diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index 459a6cd4..47b1452e 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -29,6 +29,9 @@ }, "ionmob": { "$ref": "#/definitions/ionmob" + }, + "im2deep": { + "$ref": "#/definitions/im2deep" } }, "default": { @@ -236,6 +239,19 @@ } } }, + "im2deep": { + "$ref": "#/definitions/feature_generator", + "description": "Ion mobility feature generator configuration using IM2Deep", + "type": "object", + "additionalProperties": true, + "properties": { + "reference_dataset": { + "description": 
"Path to IM2Deep reference dataset file", + "type": "string", + "default": "Meier_unimod.parquet" + } + } + }, "mokapot": { "$ref": "#/definitions/rescoring_engine", "description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.", diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py index b584c12b..21ae6af8 100644 --- a/ms2rescore/parse_spectra.py +++ b/ms2rescore/parse_spectra.py @@ -6,8 +6,8 @@ from typing import Dict, Tuple from psm_utils import PSMList -from pyteomics.mgf import MGF -from pyteomics.mzml import MzML +from pyteomics.mgf import IndexedMGF +from pyteomics.mzml import PreIndexedMzML from rich.progress import track from ms2rescore.exceptions import MS2RescoreError @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): +def fill_missing_values(config, psm_list, missing_rt=False, missing_im=False): """Get missing RT/IM features from spectrum file.""" logger.debug("Extracting missing RT/IM values from spectrum file(s).") @@ -26,17 +26,14 @@ def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) spectrum_file = infer_spectrum_path(config["spectrum_path"], run) - rt_dict = None - im_dict = None - if missing_im or missing_rt: if spectrum_file.suffix.lower() == ".mzml": - rt_dict, im_dict = _parse_values_from_mzml( - spectrum_file, config, run, missing_rt, missing_im + _parse_values_from_mzml( + psm_list_run, spectrum_file, config, missing_rt, missing_im ) elif spectrum_file.suffix.lower() == ".mgf": - rt_dict, im_dict = _parse_values_from_mgf( - spectrum_file, config, run, missing_rt, missing_im + _parse_values_from_mgf( + psm_list_run, spectrum_file, config, missing_rt, missing_im ) else: raise MS2RescoreError( @@ -44,104 +41,95 @@ def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): 
"Please make sure that the spectrum file is either in mzML or MGF format." ) - for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]): - if value_dict: - try: - psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run] - except KeyError: - raise ParsingError( - f"Could not parse {value} values from spectrum file for run {run}." - ) - def _parse_values_from_mgf( - spectrum_file, config, run, missing_rt, missing_im + psm_list_run, spectrum_file, config, missing_rt, missing_im ) -> Tuple[Dict, Dict]: - """ - Parse retention time and/or ion mobility from an MGF file. - - Notes - ----- - - Extracting values (e.g., ion mobility) according to the Matrix documentation: - http://www.matrixscience.com/help/data_file_help.html - - """ - rt_dict = {} - im_dict = {} + """Parse retention time and/or ion mobility from an mzML file.""" + mgf = IndexedMGF(str(spectrum_file)) spectrum_id_pattern = re.compile( config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" ) - for spectrum in MGF(str(spectrum_file)): - matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group() - if missing_rt: - try: - rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"]) - except KeyError: - raise ParsingError( - "Could not parse retention time (`rtinseconds`) from spectrum file for " - f"run {run}. Please make sure that the retention time key is present in the " - "spectrum file or disable the relevant feature generator." - ) - if missing_im: - try: - im_dict[matched_id] = float(spectrum["params"]["ion_mobility"]) - except KeyError: - raise ParsingError( - "Could not parse ion mobility (`ion_mobility`) from spectrum file " - f"for run {run}. Please make sure that the ion mobility key is present in the " - "spectrum file or disable the relevant feature generator." 
- ) + try: + mapper = { + spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id + for spectrum_id in mgf._offset_index.mapping["spectrum"].keys() + } + except AttributeError: + raise ParseMGFError( + "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern." + ) + + spectra = {spectrum_id: mgf.get_by_id(spectrum_id) for spectrum_id in mapper.values()} - return rt_dict, im_dict + for psm in psm_list_run: + spectrum = spectra.get(mapper[psm.spectrum_id]) + if spectrum is None: + raise ParsingError(f"Could not find spectrum with ID {psm.spectrum_id} in MGF file.") + + if missing_rt and "params" in spectrum and "rtinseconds" in spectrum["params"]: + psm.retention_time = float(spectrum["params"]["rtinseconds"]) + + if missing_im and "params" in spectrum and "ion_mobility" in spectrum["params"]: + psm.ion_mobility = float(spectrum["params"]["ion_mobility"]) def _parse_values_from_mzml( - spectrum_file, config, run, missing_rt, missing_im + psm_list_run, spectrum_file, config, missing_rt, missing_im ) -> Tuple[Dict, Dict]: """Parse retention time and/or ion mobility from an mzML file.""" - rt_dict = {} - im_dict = {} + mzml = PreIndexedMzML(str(spectrum_file)) spectrum_id_pattern = re.compile( config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" ) - for spectrum in MzML(str(spectrum_file)): - if spectrum["ms level"] != 2: - continue - matched_id = spectrum_id_pattern.match(spectrum["id"]).group() - if missing_rt: - try: - rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"]) - except KeyError: - raise ParsingError( - "Could not parse retention time (`scan start time`) from spectrum file for " - f"run {run}. Please make sure that the retention time key is present in the " - "spectrum file or disable the relevant feature generator." 
- ) + try: + mapper = { + spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id + for spectrum_id in mzml._offset_index.mapping["spectrum"].keys() + } + except (AttributeError, IndexError) as e: + raise ParseMGFError( + "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing group in the pattern." + ) from e + + spectra = {spectrum_id: mzml.get_by_id(spectrum_id) for spectrum_id in mapper.values()} + + for psm in psm_list_run: + spectrum = spectra.get(mapper.get(psm.spectrum_id))  # None if PSM ID is unmapped + if spectrum is None: + raise ParsingError(f"Could not find spectrum with ID {psm.spectrum_id} in mzML file.") + + if ( + missing_rt + and "scanList" in spectrum + and "scan" in spectrum["scanList"] + and spectrum["scanList"]["scan"] + ): + psm.retention_time = float(spectrum["scanList"]["scan"][0].get("scan start time", 0)) + if missing_im: - try: - # TODO Can this be in the scanList or only in the precursorList? - im_dict[matched_id] = float( - spectrum["scanList"]["scan"][0]["reverse ion mobility"] + if ( + "precursorList" in spectrum + and "precursor" in spectrum["precursorList"] + and spectrum["precursorList"]["precursor"] + ): + psm.ion_mobility = float( + spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][ + 0 + ].get("inverse reduced ion mobility", 0) + ) + elif ( + "scanList" in spectrum + and "scan" in spectrum["scanList"] + and spectrum["scanList"]["scan"] + ): + psm.ion_mobility = float( + spectrum["scanList"]["scan"][0].get("reverse ion mobility", 0) ) - except KeyError: - try: - im_dict[matched_id] = float( - spectrum["precursorList"]["precursor"][0]["selectedIonList"][ - "selectedIon" - ][0]["inverse reduced ion mobility"] - ) - except KeyError: - raise ParsingError( - "Could not parse ion mobility (`reverse ion mobility`) from spectrum file " - f"for run {run}. Please make sure that the ion mobility key is present in the " - "spectrum file or disable the relevant feature generator." 
- ) - - return rt_dict, im_dict class ParseMGFError(MS2RescoreError): diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py index 090db873..bca047cc 100644 --- a/ms2rescore/report/generate.py +++ b/ms2rescore/report/generate.py @@ -145,9 +145,11 @@ def _collect_files(output_path_prefix, use_txt_log=False): "configuration": Path(output_path_prefix + ".full-config.json").resolve(), "feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(), "feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(), - "log": Path(output_path_prefix + ".log.txt").resolve() - if use_txt_log - else Path(output_path_prefix + ".log.html").resolve(), + "log": ( + Path(output_path_prefix + ".log.txt").resolve() + if use_txt_log + else Path(output_path_prefix + ".log.html").resolve() + ), } for file, path in files.items(): if Path(path).is_file(): @@ -338,6 +340,28 @@ def _get_features_context( } ) + # IM2Deep specific charts + if "im2deep" in feature_names: + import deeplc.plot + + scatter_chart = deeplc.plot.scatter( + df=features[ + (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01) + ], # noqa: E712 + predicted_column="ccs_predicted_im2deep", + observed_column="ccs_observed_im2deep", + xaxis_label="Observed CCS", + yaxis_label="Predicted CCS", + plot_title="Predicted vs. observed CCS", + ) + + context["charts"].append( + { + "title": TEXTS["charts"]["im2deep_performance"]["title"], + "description": TEXTS["charts"]["im2deep_performance"]["description"], + "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS), + } + ) return context diff --git a/ms2rescore/report/templates/texts.toml b/ms2rescore/report/templates/texts.toml index 2d0840cf..52c9a230 100644 --- a/ms2rescore/report/templates/texts.toml +++ b/ms2rescore/report/templates/texts.toml @@ -105,3 +105,9 @@ bottom chart shows the distribution of RMAE values of DeepLC predictions on 460 datasets. 
The red line indicates the RMAE value for all target PSMs that passed the 1% FDR threshold of the current dataset. A lower RMAE value indicates better performance. """ + +[charts.im2deep_performance] +title = "IM2Deep model performance" +description = """ +IM2Deep model performance can be visualized by plotting the predicted CCS against the observed CCS. +""" diff --git a/pyproject.toml b/pyproject.toml index 1763cabe..484d4334 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,16 +32,17 @@ classifiers = [ dynamic = ["version"] requires-python = ">=3.8" dependencies = [ - "numpy>=1.16.0", + "numpy>=1.16.0; python_version != '3.11'", + "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF... "pandas>=1.0", "rich>=12", - "pyteomics>=4.1.0", + "pyteomics>=4.1.0, <4.7", "lxml>=4.5", - "ms2pip>=4.0.0-dev4", + "ms2pip>=4.0.0-dev5", "click>=7", "cascade-config>=0.4.0", "deeplc>=2.2", - "deeplcretrainer==0.1.17", # TODO: Release version pin + "deeplcretrainer>=0.1.17", "tomli>=2; python_version < '3.11'", "psm_utils>=0.4", "customtkinter>=5,<6", @@ -49,6 +50,7 @@ dependencies = [ "pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils "jinja2>=3", "plotly>=5", + "im2deep>=0.1.3", ] [project.optional-dependencies]