diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c03b1667..a6a65b9e 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -14,7 +14,7 @@ jobs:
- uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: "3.11"
@@ -47,7 +47,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- - uses: actions/setup-python@v4
+ - uses: actions/setup-python@v5
with:
python-version: "3.11"
diff --git a/Dockerfile b/Dockerfile
index 59fc4000..2a51f7f3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,10 @@
-FROM ubuntu:focal
+FROM python:3.10
+
+# ARG DEBIAN_FRONTEND=noninteractive
LABEL name="ms2rescore"
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore
+# ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/ms2rescore
ADD pyproject.toml /ms2rescore/pyproject.toml
ADD LICENSE /ms2rescore/LICENSE
@@ -11,8 +13,7 @@ ADD MANIFEST.in /ms2rescore/MANIFEST.in
ADD ms2rescore /ms2rescore/ms2rescore
RUN apt-get update \
- && apt-get install -y python3-pip procps libglib2.0-0 libsm6 libxrender1 libxext6 \
- && rm -rf /var/lib/apt/lists/* \
- && pip3 install ms2rescore/
+    # apt-get (not the interactive `apt` front end) is the stable interface for scripted installs
+    && apt-get install -y procps git-lfs \
+    # Remove the package lists in the same layer so that they do not end up in the image
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install --no-cache-dir /ms2rescore
ENTRYPOINT [""]
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
index 2b523aa5..1953cb78 100644
--- a/docs/source/config_schema.md
+++ b/docs/source/config_schema.md
@@ -10,6 +10,7 @@
- **`deeplc`**: Refer to *[#/definitions/deeplc](#definitions/deeplc)*.
- **`maxquant`**: Refer to *[#/definitions/maxquant](#definitions/maxquant)*.
- **`ionmob`**: Refer to *[#/definitions/ionmob](#definitions/ionmob)*.
+ - **`im2deep`**: Refer to *[#/definitions/im2deep](#definitions/im2deep)*.
- **`rescoring_engine`** *(object)*: Rescoring engine to use and its configuration. Leave empty to skip rescoring and write features to file. Default: `{"mokapot": {}}`.
- **`.*`**: Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`percolator`**: Refer to *[#/definitions/percolator](#definitions/percolator)*.
@@ -43,6 +44,14 @@
- **One of**
- *string*
- *null*
+ - **`psm_id_rt_pattern`**: Regex pattern to extract retention time from psm identifier. Requires at least one capturing group. Default: `null`.
+ - **One of**
+ - *string*
+ - *null*
+ - **`psm_id_im_pattern`**: Regex pattern to extract ion mobility from psm identifier. Requires at least one capturing group. Default: `null`.
+ - **One of**
+ - *string*
+ - *null*
- **`psm_id_pattern`**: Regex pattern to extract index or scan number from PSM file. Requires at least one capturing group. Default: `"(.*)"`.
- **One of**
- *string*
@@ -75,6 +84,8 @@
- **`ionmob_model`** *(string)*: Path to Ionmob model directory. Default: `"GRUPredictor"`.
- **`reference_dataset`** *(string)*: Path to Ionmob reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`tokenizer`** *(string)*: Path to tokenizer json file. Default: `"tokenizer.json"`.
+- **`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*.
+ - **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`.
- **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
- **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
- **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index fffb1902..e5ae26f1 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -8,7 +8,7 @@
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
-from ms2rescore.parse_spectra import get_missing_values
+from ms2rescore.parse_spectra import fill_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
@@ -55,11 +55,17 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
)
# TODO: avoid hard coding feature generators in some way
- rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
- im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
+ rt_required = ("deeplc" in config["feature_generators"]) and (
+ None in psm_list["retention_time"]
+ )
+    # Each membership test must be written out: `"ionmob" or "im2deep" in ...` short-circuits to
+    # the always-truthy string "ionmob", which would mark IM as required for every configuration.
+    im_required = (
+        "ionmob" in config["feature_generators"] or "im2deep" in config["feature_generators"]
+    ) and (None in psm_list["ion_mobility"])
+ logger.debug(f"RT required: {rt_required}, IM required: {im_required}")
+
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
- get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+ fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
diff --git a/ms2rescore/feature_generators/__init__.py b/ms2rescore/feature_generators/__init__.py
index 9424448a..52440a5f 100644
--- a/ms2rescore/feature_generators/__init__.py
+++ b/ms2rescore/feature_generators/__init__.py
@@ -7,6 +7,7 @@
from ms2rescore.feature_generators.ionmob import IonMobFeatureGenerator
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
+from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
@@ -14,4 +15,5 @@
"deeplc": DeepLCFeatureGenerator,
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
+ "im2deep": IM2DeepFeatureGenerator,
}
diff --git a/ms2rescore/feature_generators/im2deep.py b/ms2rescore/feature_generators/im2deep.py
new file mode 100644
index 00000000..508ffbf2
--- /dev/null
+++ b/ms2rescore/feature_generators/im2deep.py
@@ -0,0 +1,228 @@
+"""
+IM2Deep ion mobility-based feature generator.
+
+IM2Deep is a fully modification-aware peptide ion mobility predictor. It uses a deep convolutional
+neural network to predict collisional cross section (CCS) values based on the atomic composition
+of the (modified) amino acid residues in the peptide. See
+`github.com/compomics/IM2Deep <https://github.com/compomics/IM2Deep>`_ for more information.
+
+"""
+
+import contextlib
+import logging
+import os
+from inspect import getfullargspec
+from itertools import chain
+from typing import List, Optional
+
+import numpy as np
+from im2deep.im2deep import predict_ccs
+from psm_utils import PSMList
+
+from ms2rescore.feature_generators.base import FeatureGeneratorBase
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+logger = logging.getLogger(__name__)
+
+
+class IM2DeepFeatureGenerator(FeatureGeneratorBase):
+ """IM2Deep collision cross section feature generator."""
+
+    def __init__(
+        self,
+        *args,
+        lower_score_is_better: bool = False,
+        spectrum_path: Optional[str] = None,
+        processes: int = 1,
+        calibrate_per_charge: bool = True,
+        **kwargs,
+    ):
+        """
+        Set up the IM2Deep feature generator.
+
+        Parameters
+        ----------
+        *args
+            Positional arguments, passed on to the parent class initializer.
+        lower_score_is_better : bool, optional
+            Whether a lower PSM score is better; stored on the instance. By default False.
+        spectrum_path : str or None, optional
+            Optional path to the spectrum file; stored on the instance. By default None.
+        processes : int, optional
+            Number of parallel processes; only stored on the instance here. By default 1.
+        calibrate_per_charge : bool, optional
+            Whether to calibrate CCS values per charge state; only stored on the instance here.
+            By default True.
+        **kwargs
+            Additional keyword arguments, passed on to the parent class initializer. Those that
+            match an argument of the ``DeepLC`` constructor are kept in ``im2deep_kwargs``.
+
+        """
+        super().__init__(*args, **kwargs)
+
+        # Plain settings
+        self.lower_score_is_better = lower_score_is_better
+        self.spectrum_path = spectrum_path
+        self.processes = processes
+        self.calibrate_per_charge = calibrate_per_charge
+        self.im2deep_predictor = None
+        self.deeplc_kwargs = kwargs or {}
+
+        # Verbose mode follows the logger: on when DEBUG messages are emitted
+        self._verbose = logger.getEffectiveLevel() <= logging.DEBUG
+
+        # Imported on construction instead of on module import
+        from deeplc import DeepLC
+
+        self.im2deep = DeepLC
+
+        # Keep only the keyword arguments that are part of the DeepLC signature
+        # NOTE(review): filtering against DeepLC (rather than IM2Deep) looks inherited from the
+        # DeepLC feature generator -- confirm that this is intended.
+        accepted_args = getfullargspec(DeepLC).args
+        self.im2deep_kwargs = {
+            name: value for name, value in self.deeplc_kwargs.items() if name in accepted_args
+        }
+        self.im2deep_kwargs["config_file"] = None
+
+        # TODO: Implement im2deep_retrain?
+
+    @property
+    def feature_names(self) -> List[str]:
+        """Names of the rescoring features that this generator adds to each PSM."""
+        names = [
+            "ccs_observed_im2deep",
+            "ccs_predicted_im2deep",
+            "ccs_error_im2deep",
+            "abs_ccs_error_im2deep",
+            "perc_ccs_error_im2deep",
+        ]
+        return names
+
+    def add_features(self, psm_list: PSMList) -> None:
+        """
+        Add IM2Deep-derived CCS features to the ``rescoring_features`` of each PSM.
+
+        PSMs are processed run by run: the observed ion mobility of each PSM is converted to
+        CCS, a calibration set is selected with :py:meth:`make_cal_df`, and the IM2Deep
+        predictions are compared with the observed CCS values.
+
+        Parameters
+        ----------
+        psm_list : PSMList
+            PSMs to add features to; modified in place. The ion mobility, precursor m/z, and
+            precursor charge of each PSM are read.
+
+        """
+        logger.info("Adding IM2Deep-derived features to PSMs")
+
+        # Get easy-access nested version of PSMlist
+        psm_dict = psm_list.get_psm_dict()
+
+        # Run IM2Deep for each spectrum file
+        current_run = 1
+        total_runs = sum(len(runs) for runs in psm_dict.values())
+
+        for runs in psm_dict.values():
+            # Reset IM2Deep predictor for each collection of runs
+            self.im2deep_predictor = None
+            self.selected_model = None
+            for run, psms in runs.items():
+                logger.info(
+                    f"Running IM2Deep for PSMs from run ({current_run}/{total_runs}): `{run}`..."
+                )
+
+                with contextlib.ExitStack() as stack:
+                    # Disable wild logging to stdout by TensorFlow, unless in debug mode. The
+                    # devnull handle is entered on the stack so that it is closed afterwards.
+                    if not self._verbose:
+                        devnull = stack.enter_context(open(os.devnull, "w"))
+                        stack.enter_context(contextlib.redirect_stdout(devnull))
+
+                    # Make new PSM list for this run (chain PSMs per spectrum to flat list)
+                    psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+
+                    logger.debug("Calibrating IM2Deep...")
+
+                    # Convert ion mobility to CCS and calibrate CCS values
+                    psm_list_run_df = psm_list_run.to_dataframe()
+                    psm_list_run_df["charge"] = [
+                        peptidoform.precursor_charge
+                        for peptidoform in psm_list_run_df["peptidoform"]
+                    ]
+                    psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
+                        lambda x: self.im2ccs(
+                            x["ion_mobility"],
+                            x["precursor_mz"],  # TODO: Why does ionmob use calculated mz?
+                            x["charge"],
+                        ),
+                        axis=1,
+                    )
+
+                    # Create dataframe with high confidence hits for calibration
+                    cal_psm_df = self.make_cal_df(psm_list_run_df)
+
+                    # Make predictions with IM2Deep
+                    # NOTE(review): `self.calibrate_per_charge` is not passed on here -- confirm
+                    # whether `predict_ccs` should receive it.
+                    logger.debug("Predicting CCS values...")
+                    calibrated_predictions = predict_ccs(
+                        psm_list_run, cal_psm_df, write_output=False
+                    )
+
+                    # Add features to PSMs
+                    logger.debug("Adding features to PSMs...")
+                    predictions = calibrated_predictions
+                    observations = psm_list_run_df["ccs_observed"]
+                    # Keep the sign of the difference: the absolute value is only taken for the
+                    # `abs_` and `perc_` features, otherwise `ccs_error_im2deep` would be a
+                    # duplicate of `abs_ccs_error_im2deep`.
+                    ccs_diffs_run = predictions - observations
+                    for i, psm in enumerate(psm_list_run):
+                        psm["rescoring_features"].update(
+                            {
+                                "ccs_observed_im2deep": observations[i],
+                                "ccs_predicted_im2deep": predictions[i],
+                                "ccs_error_im2deep": ccs_diffs_run[i],
+                                "abs_ccs_error_im2deep": np.abs(ccs_diffs_run[i]),
+                                "perc_ccs_error_im2deep": np.abs(ccs_diffs_run[i])
+                                / observations[i]
+                                * 100,
+                            }
+                        )
+
+                current_run += 1
+
+    def im2ccs(self, reverse_im, mz, charge, mass_gas=28.013, temp=31.85, t_diff=273.15):
+        """
+        Calculate the collisional cross section (CCS) for an observed ion mobility value.
+
+        Parameters
+        ----------
+        reverse_im : float
+            Observed ion mobility; presumably the inverse reduced ion mobility (1/K0) -- TODO
+            confirm.
+        mz : float
+            Precursor m/z.
+        charge : int
+            Precursor charge.
+        mass_gas : float, optional
+            Mass of gas, by default 28.013
+        temp : float, optional
+            Temperature in Celsius, by default 31.85
+        t_diff : float, optional
+            Offset that is added to ``temp`` to convert Celsius to Kelvin, by default 273.15
+
+        Returns
+        -------
+        float
+            CCS value.
+
+        Notes
+        -----
+        Adapted from theGreatHerrLebert/ionmob (https://doi.org/10.1093/bioinformatics/btad486)
+
+        """
+        SUMMARY_CONSTANT = 18509.8632163405
+
+        # m/z multiplied by charge is used as the ion mass
+        ion_mass = mz * charge
+        reduced_mass = (ion_mass * mass_gas) / (ion_mass + mass_gas)
+        temperature_kelvin = temp + t_diff
+
+        numerator = SUMMARY_CONSTANT * charge
+        denominator = np.sqrt(reduced_mass * temperature_kelvin) / reverse_im
+        return numerator / denominator
+
+ # TODO: replace threshold by identified psms?
+    def make_cal_df(self, psm_list_df, threshold=0.95):
+        """
+        Select the high confidence PSMs that are used to calibrate IM2Deep predictions.
+
+        Parameters
+        ----------
+        psm_list_df : pd.DataFrame
+            DataFrame with PSMs; the ``charge``, ``score``, and ``spectrum_id`` columns are used.
+        threshold : float, optional
+            Percentile rank of the score above which a PSM counts as high confidence hit, by
+            default 0.95.
+
+        Returns
+        -------
+        pd.DataFrame
+            Rows of ``psm_list_df`` with a charge below 5 and a spectrum ID that belongs to a
+            high confidence hit.
+
+        """
+        # predictions do not go higher for IM2Deep
+        eligible_df = psm_list_df[psm_list_df["charge"] < 5]
+
+        is_high_conf = eligible_df["score"].rank(pct=True) > threshold
+        high_conf_hits = list(eligible_df["spectrum_id"][is_high_conf])
+        logger.debug(
+            f"Number of high confidence hits for calculating shift: {len(high_conf_hits)}"
+        )
+
+        # Filter df for high_conf_hits
+        return eligible_df[eligible_df["spectrum_id"].isin(high_conf_hits)]
diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py
index c436a902..4d6ca98f 100644
--- a/ms2rescore/gui/app.py
+++ b/ms2rescore/gui/app.py
@@ -359,15 +359,20 @@ def __init__(self, *args, **kwargs):
self.deeplc_config = DeepLCConfiguration(self)
self.deeplc_config.grid(row=2, column=0, pady=(0, 20), sticky="nsew")
+ self.im2deep_config = Im2DeepConfiguration(self)
+ self.im2deep_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
+
self.ionmob_config = IonmobConfiguration(self)
- self.ionmob_config.grid(row=3, column=0, pady=(0, 20), sticky="nsew")
+ self.ionmob_config.grid(row=4, column=0, pady=(0, 20), sticky="nsew")
def get(self) -> Dict:
"""Return the configuration as a dictionary."""
basic_enabled, basic_config = self.basic_config.get()
ms2pip_enabled, ms2pip_config = self.ms2pip_config.get()
deeplc_enabled, deeplc_config = self.deeplc_config.get()
+ im2deep_enabled, im2deep_config = self.im2deep_config.get()
ionmob_enabled, ionmob_config = self.ionmob_config.get()
+
config = {}
if basic_enabled:
config["basic"] = basic_config
@@ -522,6 +527,27 @@ def get(self) -> Dict:
return enabled, config
+class Im2DeepConfiguration(ctk.CTkFrame):
+    """Configuration frame for the IM2Deep feature generator."""
+
+    def __init__(self, *args, **kwargs):
+        """Build the frame: a heading and a switch to enable IM2Deep (off by default)."""
+        super().__init__(*args, **kwargs)
+
+        self.configure(fg_color="transparent")
+        self.grid_columnconfigure(0, weight=1)
+
+        # Heading
+        self.title = widgets.Heading(self, text="im2deep")
+        self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
+
+        # Switch to enable or disable the feature generator
+        self.enabled = widgets.LabeledSwitch(self, label="Enable im2deep", default=False)
+        self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
+
+    def get(self) -> Dict:
+        """Return the enabled state and the (currently empty) IM2Deep configuration."""
+        return self.enabled.get(), {}
+
+
class RescoringEngineConfig(ctk.CTkFrame):
def __init__(self, *args, **kwargs):
"""Rescoring engine configuration frame."""
diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
index 459a6cd4..47b1452e 100644
--- a/ms2rescore/package_data/config_schema.json
+++ b/ms2rescore/package_data/config_schema.json
@@ -29,6 +29,9 @@
},
"ionmob": {
"$ref": "#/definitions/ionmob"
+ },
+ "im2deep": {
+ "$ref": "#/definitions/im2deep"
}
},
"default": {
@@ -236,6 +239,19 @@
}
}
},
+ "im2deep": {
+ "$ref": "#/definitions/feature_generator",
+ "description": "Ion mobility feature generator configuration using IM2Deep",
+ "type": "object",
+ "additionalProperties": true,
+ "properties": {
+ "reference_dataset": {
+ "description": "Path to IM2Deep reference dataset file",
+ "type": "string",
+ "default": "Meier_unimod.parquet"
+ }
+ }
+ },
"mokapot": {
"$ref": "#/definitions/rescoring_engine",
"description": "Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function.",
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
index b584c12b..21ae6af8 100644
--- a/ms2rescore/parse_spectra.py
+++ b/ms2rescore/parse_spectra.py
@@ -6,8 +6,8 @@
from typing import Dict, Tuple
from psm_utils import PSMList
-from pyteomics.mgf import MGF
-from pyteomics.mzml import MzML
+from pyteomics.mgf import IndexedMGF
+from pyteomics.mzml import PreIndexedMzML
from rich.progress import track
from ms2rescore.exceptions import MS2RescoreError
@@ -16,7 +16,7 @@
logger = logging.getLogger(__name__)
-def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
+def fill_missing_values(config, psm_list, missing_rt=False, missing_im=False):
"""Get missing RT/IM features from spectrum file."""
logger.debug("Extracting missing RT/IM values from spectrum file(s).")
@@ -26,17 +26,14 @@ def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
- rt_dict = None
- im_dict = None
-
if missing_im or missing_rt:
if spectrum_file.suffix.lower() == ".mzml":
- rt_dict, im_dict = _parse_values_from_mzml(
- spectrum_file, config, run, missing_rt, missing_im
+ _parse_values_from_mzml(
+ psm_list_run, spectrum_file, config, missing_rt, missing_im
)
elif spectrum_file.suffix.lower() == ".mgf":
- rt_dict, im_dict = _parse_values_from_mgf(
- spectrum_file, config, run, missing_rt, missing_im
+ _parse_values_from_mgf(
+ psm_list_run, spectrum_file, config, missing_rt, missing_im
)
else:
raise MS2RescoreError(
@@ -44,104 +41,95 @@ def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
"Please make sure that the spectrum file is either in mzML or MGF format."
)
- for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
- if value_dict:
- try:
- psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run]
- except KeyError:
- raise ParsingError(
- f"Could not parse {value} values from spectrum file for run {run}."
- )
-
def _parse_values_from_mgf(
- spectrum_file, config, run, missing_rt, missing_im
+    psm_list_run, spectrum_file, config, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
+) -> None:
- """
- Parse retention time and/or ion mobility from an MGF file.
-
- Notes
- -----
- - Extracting values (e.g., ion mobility) according to the Matrix documentation:
- http://www.matrixscience.com/help/data_file_help.html
-
- """
- rt_dict = {}
- im_dict = {}
+    """
+    Fill in retention time and/or ion mobility of PSMs with values from an MGF file.
+
+    Parameters
+    ----------
+    psm_list_run
+        PSMs of a single run; ``retention_time`` and ``ion_mobility`` are set in place.
+    spectrum_file
+        Path to the MGF file.
+    config
+        Configuration; ``spectrum_id_pattern`` is used to match PSM spectrum IDs with the
+        spectrum IDs in the MGF file (``(.*)`` if not set).
+    missing_rt : bool
+        Fill in retention time from ``rtinseconds``, if present in the spectrum.
+    missing_im : bool
+        Fill in ion mobility from ``ion_mobility``, if present in the spectrum.
+
+    Raises
+    ------
+    ParseMGFError
+        If ``spectrum_id_pattern`` does not match a spectrum ID or has no capturing group.
+    ParsingError
+        If the spectrum of a PSM cannot be found in the MGF file.
+
+    Notes
+    -----
+    - Extracting values (e.g., ion mobility) according to the Matrix documentation:
+      http://www.matrixscience.com/help/data_file_help.html
+
+    """
spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)
- for spectrum in MGF(str(spectrum_file)):
- matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
- if missing_rt:
- try:
- rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
- except KeyError:
- raise ParsingError(
- "Could not parse retention time (`rtinseconds`) from spectrum file for "
- f"run {run}. Please make sure that the retention time key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
- if missing_im:
- try:
- im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
- except KeyError:
- raise ParsingError(
- "Could not parse ion mobility (`ion_mobility`) from spectrum file "
- f"for run {run}. Please make sure that the ion mobility key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
- return rt_dict, im_dict
+
+    # Context manager: the file handle of the indexed reader is closed when done
+    with IndexedMGF(str(spectrum_file)) as mgf:
+        # Map the ID part captured by `spectrum_id_pattern` to the full ID in the MGF index
+        # NOTE(review): relies on the private `_offset_index` attribute of the pyteomics reader
+        try:
+            mapper = {
+                spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
+                for spectrum_id in mgf._offset_index.mapping["spectrum"].keys()
+            }
+        except (AttributeError, IndexError) as e:
+            # AttributeError: the pattern did not match an ID (`search` returned None);
+            # IndexError: the pattern has no capturing group
+            raise ParseMGFError(
+                "Could not parse spectrum IDs using `spectrum_id_pattern`. Please make sure "
+                "that the pattern matches all spectrum IDs and contains a capturing group."
+            ) from e
+
+        for psm in psm_list_run:
+            # `.get` instead of indexing, so that an unknown ID results in the ParsingError
+            # below and not in a bare KeyError. Spectra are read on demand; only those that
+            # are referenced by a PSM are loaded.
+            mgf_id = mapper.get(psm.spectrum_id)
+            spectrum = mgf.get_by_id(mgf_id) if mgf_id is not None else None
+            if spectrum is None:
+                raise ParsingError(
+                    f"Could not find spectrum with ID {psm.spectrum_id} in MGF file."
+                )
+
+            params = spectrum.get("params", {})
+            if missing_rt and "rtinseconds" in params:
+                psm.retention_time = float(params["rtinseconds"])
+
+            if missing_im and "ion_mobility" in params:
+                psm.ion_mobility = float(params["ion_mobility"])
def _parse_values_from_mzml(
- spectrum_file, config, run, missing_rt, missing_im
+    psm_list_run, spectrum_file, config, missing_rt, missing_im
-) -> Tuple[Dict, Dict]:
+) -> None:
-"""Parse retention time and/or ion mobility from an mzML file."""
- rt_dict = {}
- im_dict = {}
+    """
+    Fill in retention time and/or ion mobility of PSMs with values from an mzML file.
+
+    Parameters
+    ----------
+    psm_list_run
+        PSMs of a single run; ``retention_time`` and ``ion_mobility`` are set in place.
+    spectrum_file
+        Path to the mzML file.
+    config
+        Configuration; ``spectrum_id_pattern`` is used to match PSM spectrum IDs with the
+        spectrum IDs in the mzML file (``(.*)`` if not set).
+    missing_rt : bool
+        Fill in retention time from the ``scan start time`` of the first scan (0 if the scan
+        has no such value).
+    missing_im : bool
+        Fill in ion mobility from the first precursor or, if not present there, the first scan.
+
+    Raises
+    ------
+    ParseMGFError
+        If ``spectrum_id_pattern`` does not match a spectrum ID or has no capturing group.
+    ParsingError
+        If the spectrum of a PSM cannot be found in the mzML file.
+
+    """
spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)
- for spectrum in MzML(str(spectrum_file)):
- if spectrum["ms level"] != 2:
- continue
- matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
- if missing_rt:
- try:
- rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
- except KeyError:
- raise ParsingError(
- "Could not parse retention time (`scan start time`) from spectrum file for "
- f"run {run}. Please make sure that the retention time key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
- if missing_im:
- try:
- # TODO Can this be in the scanList or only in the precursorList?
- im_dict[matched_id] = float(
- spectrum["scanList"]["scan"][0]["reverse ion mobility"]
- )
- except KeyError:
- try:
- im_dict[matched_id] = float(
- spectrum["precursorList"]["precursor"][0]["selectedIonList"][
- "selectedIon"
- ][0]["inverse reduced ion mobility"]
- )
- except KeyError:
- raise ParsingError(
- "Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
- f"for run {run}. Please make sure that the ion mobility key is present in the "
- "spectrum file or disable the relevant feature generator."
- )
-
- return rt_dict, im_dict
+
+    # Context manager: the file handle of the indexed reader is closed when done
+    with PreIndexedMzML(str(spectrum_file)) as mzml:
+        # Map the ID part captured by `spectrum_id_pattern` to the full ID in the mzML index
+        # NOTE(review): relies on the private `_offset_index` attribute of the pyteomics reader
+        try:
+            mapper = {
+                spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
+                for spectrum_id in mzml._offset_index.mapping["spectrum"].keys()
+            }
+        except (AttributeError, IndexError) as e:
+            # AttributeError: the pattern did not match an ID (`search` returned None);
+            # IndexError: the pattern has no capturing group
+            # NOTE(review): ParseMGFError is raised for mzML input as well -- a format-neutral
+            # exception may be more fitting.
+            raise ParseMGFError(
+                "Could not parse spectrum IDs using `spectrum_id_pattern`. Please make sure "
+                "that the pattern matches all spectrum IDs and contains a capturing group."
+            ) from e
+
+        for psm in psm_list_run:
+            # `.get` instead of indexing, so that an unknown ID results in the ParsingError
+            # below and not in a bare KeyError. Spectra are read on demand; only those that
+            # are referenced by a PSM are loaded.
+            mzml_id = mapper.get(psm.spectrum_id)
+            spectrum = mzml.get_by_id(mzml_id) if mzml_id is not None else None
+            if spectrum is None:
+                raise ParsingError(
+                    f"Could not find spectrum with ID {psm.spectrum_id} in mzML file."
+                )
+
+            if missing_rt:
+                scans = spectrum.get("scanList", {}).get("scan")
+                if scans:
+                    psm.retention_time = float(scans[0].get("scan start time", 0))
+
+            if missing_im:
+                ion_mobility = _get_ion_mobility_from_mzml_spectrum(spectrum)
+                if ion_mobility is not None:
+                    psm.ion_mobility = ion_mobility
+
+
+def _get_ion_mobility_from_mzml_spectrum(spectrum):
+    """
+    Get the ion mobility of an mzML spectrum from its precursor or scan metadata.
+
+    The selected ion of the first precursor (``inverse reduced ion mobility``) takes precedence
+    over the first scan (``reverse ion mobility``).
+
+    Returns
+    -------
+    float or None
+        The ion mobility; ``0.0`` if the spectrum has precursor or scan entries, but none of
+        them holds an ion mobility value; ``None`` if it has neither precursor nor scan entries.
+
+    """
+    precursors = spectrum.get("precursorList", {}).get("precursor") or []
+    scans = spectrum.get("scanList", {}).get("scan") or []
+
+    if precursors:
+        # Tolerate precursors without selected ions instead of failing with a KeyError
+        selected_ions = precursors[0].get("selectedIonList", {}).get("selectedIon") or [{}]
+        if "inverse reduced ion mobility" in selected_ions[0]:
+            return float(selected_ions[0]["inverse reduced ion mobility"])
+
+    # Fall back to the scan metadata if the precursor holds no ion mobility value
+    if scans and "reverse ion mobility" in scans[0]:
+        return float(scans[0]["reverse ion mobility"])
+
+    return 0.0 if (precursors or scans) else None
class ParseMGFError(MS2RescoreError):
diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py
index 090db873..bca047cc 100644
--- a/ms2rescore/report/generate.py
+++ b/ms2rescore/report/generate.py
@@ -145,9 +145,11 @@ def _collect_files(output_path_prefix, use_txt_log=False):
"configuration": Path(output_path_prefix + ".full-config.json").resolve(),
"feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(),
"feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(),
- "log": Path(output_path_prefix + ".log.txt").resolve()
- if use_txt_log
- else Path(output_path_prefix + ".log.html").resolve(),
+ "log": (
+ Path(output_path_prefix + ".log.txt").resolve()
+ if use_txt_log
+ else Path(output_path_prefix + ".log.html").resolve()
+ ),
}
for file, path in files.items():
if Path(path).is_file():
@@ -338,6 +340,28 @@ def _get_features_context(
}
)
+ # IM2Deep specific charts
+ if "im2deep" in feature_names:
+ import deeplc.plot
+
+ scatter_chart = deeplc.plot.scatter(
+ df=features[
+ (psm_list["is_decoy"] == False) & (psm_list["qvalue"] <= 0.01)
+ ], # noqa: E712
+ predicted_column="ccs_predicted_im2deep",
+ observed_column="ccs_observed_im2deep",
+ xaxis_label="Observed CCS",
+ yaxis_label="Predicted CCS",
+ plot_title="Predicted vs. observed CCS",
+ )
+
+ context["charts"].append(
+ {
+ "title": TEXTS["charts"]["im2deep_performance"]["title"],
+ "description": TEXTS["charts"]["im2deep_performance"]["description"],
+ "chart": scatter_chart.to_html(**PLOTLY_HTML_KWARGS),
+ }
+ )
return context
diff --git a/ms2rescore/report/templates/texts.toml b/ms2rescore/report/templates/texts.toml
index 2d0840cf..52c9a230 100644
--- a/ms2rescore/report/templates/texts.toml
+++ b/ms2rescore/report/templates/texts.toml
@@ -105,3 +105,9 @@ bottom chart shows the distribution of RMAE values of DeepLC predictions on 460
datasets. The red line indicates the RMAE value for all target PSMs that passed the 1% FDR threshold
of the current dataset. A lower RMAE value indicates better performance.
"""
+
+[charts.im2deep_performance]
+title = "IM2Deep model performance"
+description = """
+IM2Deep model performance can be visualized by plotting the predicted CCS against the observed CCS.
+"""
diff --git a/pyproject.toml b/pyproject.toml
index 1763cabe..484d4334 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,16 +32,17 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.8"
dependencies = [
- "numpy>=1.16.0",
+ "numpy>=1.16.0; python_version != '3.11'",
+ "numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
"pandas>=1.0",
"rich>=12",
- "pyteomics>=4.1.0",
+ "pyteomics>=4.1.0, <4.7",
"lxml>=4.5",
- "ms2pip>=4.0.0-dev4",
+ "ms2pip>=4.0.0-dev5",
"click>=7",
"cascade-config>=0.4.0",
"deeplc>=2.2",
- "deeplcretrainer==0.1.17", # TODO: Release version pin
+ "deeplcretrainer>=0.1.17",
"tomli>=2; python_version < '3.11'",
"psm_utils>=0.4",
"customtkinter>=5,<6",
@@ -49,6 +50,7 @@ dependencies = [
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
"jinja2>=3",
"plotly>=5",
+ "im2deep>=0.1.3",
]
[project.optional-dependencies]