Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Rust for parsing spectrum files #141

Merged
merged 4 commits into from
Apr 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ jobs:
run: |
pip install .[dev]

# - name: Test with pytest
# run: |
# pytest
- name: Test with pytest
run: |
pytest

- name: Test installation
run: |
ms2rescore --help
Expand Down
2 changes: 1 addition & 1 deletion ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
Expand Down
15 changes: 7 additions & 8 deletions ms2rescore/parse_psms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import re
from itertools import chain
from typing import Dict, Union

import psm_utils.io
Expand Down Expand Up @@ -60,9 +59,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:

if config["psm_id_pattern"]:
pattern = re.compile(config["psm_id_pattern"])
logger.debug("Applying `psm_id_pattern`...")
logger.debug("Applying 'psm_id_pattern'...")
logger.debug(
f"Parsing `{psm_list['spectrum_id'][0]}` to `{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`"
f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
)
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids
Expand All @@ -86,7 +85,7 @@ def _read_psms(config, psm_list):
valid_psms = 0
for psm_file in config["psm_file"]:
logger.info(
f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
)
try:
id_file_psm_list = psm_utils.io.read_file(
Expand All @@ -97,8 +96,8 @@ def _read_psms(config, psm_list):
)
except psm_utils.io.PSMUtilsIOException:
raise MS2RescoreConfigurationError(
"Error occurred while reading PSMs. Please check the `psm_file` and "
"`psm_file_type` settings. See "
"Error occurred while reading PSMs. Please check the 'psm_file' and "
"'psm_file_type' settings. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
" for more information."
)
Expand Down Expand Up @@ -129,7 +128,7 @@ def _find_decoys(config, psm_list):
if not any(psm_list["is_decoy"]):
raise MS2RescoreConfigurationError(
"No decoy PSMs found. Please check if decoys are present in the PSM file and that "
"the `id_decoy_pattern` option is correct. See "
"the 'id_decoy_pattern' option is correct. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
" for more information."
)
Expand All @@ -150,7 +149,7 @@ def _match_psm_ids(old_id, regex_pattern):
return match[1]
except (TypeError, IndexError):
raise MS2RescoreConfigurationError(
f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})."
f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})."
" Ensure that the regex contains a capturing group?"
)

Expand Down
144 changes: 30 additions & 114 deletions ms2rescore/parse_spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,9 @@
import logging
import re
from itertools import chain
from typing import Dict, Tuple

from ms2rescore_rs import get_precursor_info
from psm_utils import PSMList
from pyteomics.mgf import MGF
from pyteomics.mzml import MzML
from rich.progress import track

from ms2rescore.exceptions import MS2RescoreError
Expand All @@ -16,122 +14,40 @@
logger = logging.getLogger(__name__)


def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
def get_missing_values(
psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
):
"""Get missing RT/IM features from spectrum file."""
logger.debug("Extracting missing RT/IM values from spectrum file(s).")

psm_dict = psm_list.get_psm_dict()
for runs in psm_dict.values():
for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
for run, psms in runs.items():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_file = infer_spectrum_path(config["spectrum_path"], run)

if spectrum_file.suffix.lower() == ".mzml":
rt_dict, im_dict = _parse_values_from_mzml(
spectrum_file, config, run, missing_rt, missing_im
)
elif spectrum_file.suffix.lower() == ".mgf":
rt_dict, im_dict = _parse_values_from_mgf(
spectrum_file, config, run, missing_rt, missing_im
)

for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
if value_dict:
try:
psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run]
except KeyError:
raise ParsingError(
f"Could not parse {value} values from spectrum file for run {run}."
)


def _parse_values_from_mgf(
spectrum_file, config, run, missing_rt, missing_im
) -> Tuple[Dict, Dict]:
"""
Parse retention time and/or ion mobility from an MGF file.

Notes
-----
- Extracting values (e.g., ion mobility) according to the Matrix documentation:
http://www.matrixscience.com/help/data_file_help.html

"""
rt_dict = {}
im_dict = {}

spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)

for spectrum in MGF(str(spectrum_file)):
matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
if missing_rt:
try:
rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
except KeyError:
raise ParsingError(
"Could not parse retention time (`rtinseconds`) from spectrum file for "
f"run {run}. Please make sure that the retention time key is present in the "
"spectrum file or disable the relevant feature generator."
)
if missing_im:
try:
im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
except KeyError:
raise ParsingError(
"Could not parse ion mobility (`ion_mobility`) from spectrum file "
f"for run {run}. Please make sure that the ion mobility key is present in the "
"spectrum file or disable the relevant feature generator."
)

return rt_dict, im_dict


def _parse_values_from_mzml(
spectrum_file, config, run, missing_rt, missing_im
) -> Tuple[Dict, Dict]:
"""Parse retention time and/or ion mobility from an mzML file."""
rt_dict = {}
im_dict = {}

spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)

for spectrum in MzML(str(spectrum_file)):
matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
if missing_rt:
try:
rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
except KeyError:
raise ParsingError(
"Could not parse retention time (`scan start time`) from spectrum file for "
f"run {run}. Please make sure that the retention time key is present in the "
"spectrum file or disable the relevant feature generator."
)
if missing_im:
try:
im_dict[matched_id] = float(
spectrum["scanList"]["scan"][0]["reverse ion mobility"]
)
except KeyError:
raise ParsingError(
"Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
f"for run {run}. Please make sure that the ion mobility key is present in the "
"spectrum file or disable the relevant feature generator."
)

return rt_dict, im_dict


class ParseMGFError(MS2RescoreError):
"""Error parsing MGF file."""

pass


class ParsingError(MS2RescoreError):
logger.debug("Reading spectrum file: '%s'", spectrum_file)
precursors = get_precursor_info(str(spectrum_file))

if config["spectrum_id_pattern"]:
spectrum_id_pattern = re.compile(config["spectrum_id_pattern"])
precursors = {
spectrum_id_pattern.search(spectrum_id).group(1): precursor
for spectrum_id, precursor in precursors.items()
}

for psm in psm_list_run:
try:
if rt_required:
psm.retention_time = precursors[psm.spectrum_id].rt
if im_required:
psm.ion_mobility = precursors[psm.spectrum_id].im
if not psm.precursor_mz:
psm.precursor_mz = precursors[psm.spectrum_id].mz
except KeyError as e:
raise SpectrumParsingError(
f"Could not extract missing RT/IM values from spectrum file for run {run}."
) from e


class SpectrumParsingError(MS2RescoreError):
"""Error parsing retention time from spectrum file."""

pass
4 changes: 4 additions & 0 deletions ms2rescore/rescoring_engines/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def convert_psm_list(
feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)

# Ensure filename for FlashLFQ txt output
if not combined_df["run"].notnull().all():
combined_df["run"] = "ms_run"

feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None

lin_psm_data = LinearPsmDataset(
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.8"
dependencies = [
"ms2rescore_rs",
"numpy>=1.16.0; python_version != '3.11'",
"numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
"numpy==1.24.3; python_version == '3.11'", # Incompatibility with sklearn, pygam, and TF...
"pandas>=1.0",
"rich>=12",
"pyteomics>=4.1.0",
Expand All @@ -47,7 +48,7 @@ dependencies = [
"psm_utils>=0.4",
"customtkinter>=5,<6",
"mokapot>=0.9",
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
"pydantic>=1.8.2,<2", # Fix compatibility with v2 in psm_utils
"jinja2>=3",
"plotly>=5",
]
Expand Down
13 changes: 13 additions & 0 deletions tests/test_data/test.mgf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
BEGIN IONS
TITLE=peptide: peptide1
CHARGE=2+
PEPMASS=475.137295
ION_MOBILITY=42.42
RTINSECONDS=51.2
72.04439 100
148.06043 600
232.07504 300
263.08737 400
347.10198 500
423.11802 200
END IONS
23 changes: 23 additions & 0 deletions tests/test_parse_spectra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pytest
from psm_utils import PSM, PSMList

from ms2rescore.parse_spectra import get_missing_values


def test_get_missing_values():
psm_list = PSMList(
psm_list=[
PSM(peptidoform="PEPTIDEK/2", spectrum_id="peptide1"),
]
)
get_missing_values(
psm_list,
config={
"spectrum_path": "tests/test_data/test.mgf",
"spectrum_id_pattern": "peptide: (.*)",
},
rt_required=True,
im_required=True,
)
assert psm_list[0].retention_time == pytest.approx(0.853, 0.001)
assert psm_list[0].ion_mobility == pytest.approx(42.42, 0.01)
Loading