Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/compomics/ms2rescore into t…
Browse files Browse the repository at this point in the history
…imsRescore
  • Loading branch information
RalfG committed Apr 8, 2024
2 parents 17a4e12 + 1fe9d9e commit 86ba7b6
Show file tree
Hide file tree
Showing 12 changed files with 124 additions and 144 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ jobs:
run: |
pip install .[dev]
# - name: Test with pytest
# run: |
# pytest
- name: Test with pytest
run: |
pytest
- name: Test installation
run: |
ms2rescore --help
Expand Down
6 changes: 3 additions & 3 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import fill_missing_values
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore import exceptions
Expand Down Expand Up @@ -69,7 +69,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:

if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
fill_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
get_missing_values(psm_list, config, rt_required=rt_required, im_required=im_required)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
Expand Down Expand Up @@ -163,7 +163,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
)
except exceptions.ReportGenerationError as e:
logger.error(e)
logger.exception(e)


def _write_feature_names(feature_names, output_file_root):
Expand Down
5 changes: 5 additions & 0 deletions ms2rescore/feature_generators/ms2pip.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
ms2_tolerance: float = 0.02,
spectrum_path: Optional[str] = None,
spectrum_id_pattern: str = "(.*)",
model_dir: Optional[str] = None,
processes: 1,
**kwargs,
) -> None:
Expand All @@ -71,6 +72,8 @@ def __init__(
spectrum_id_pattern : str, optional
Regular expression pattern to extract spectrum ID from spectrum file. Defaults to
:py:const:`.*`.
model_dir
Directory containing MS²PIP models. Defaults to :py:const:`None` (use MS²PIP default).
processes : int, optional
Number of processes to use. Defaults to 1.
Expand All @@ -85,6 +88,7 @@ def __init__(
self.ms2_tolerance = ms2_tolerance
self.spectrum_path = spectrum_path
self.spectrum_id_pattern = spectrum_id_pattern
self.model_dir = model_dir
self.processes = processes

@property
Expand Down Expand Up @@ -194,6 +198,7 @@ def add_features(self, psm_list: PSMList) -> None:
model=self.model,
ms2_tolerance=self.ms2_tolerance,
compute_correlations=False,
model_dir=self.model_dir,
processes=self.processes,
)
except NoMatchingSpectraFound as e:
Expand Down
16 changes: 7 additions & 9 deletions ms2rescore/parse_psms.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import re
from itertools import chain
from typing import Dict, Union

import psm_utils.io
Expand Down Expand Up @@ -64,10 +63,9 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:

if config["psm_id_pattern"]:
pattern = re.compile(config["psm_id_pattern"])
logger.debug("Applying `psm_id_pattern`...")
logger.debug("Applying 'psm_id_pattern'...")
logger.debug(
f"Parsing `{psm_list['spectrum_id'][0]}` to "
f"`{_match_psm_ids(psm_list['spectrum_id'][0], pattern)}`"
f"Parsing '{psm_list[0].spectrum_id}' to '{_match_psm_ids(psm_list[0].spectrum_id, pattern)}'"
)
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids
Expand All @@ -91,7 +89,7 @@ def _read_psms(config, psm_list):
valid_psms = 0
for psm_file in config["psm_file"]:
logger.info(
f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
f"Reading PSMs from PSM file ({current_file}/{total_files}): '{psm_file}'..."
)
try:
id_file_psm_list = psm_utils.io.read_file(
Expand All @@ -102,8 +100,8 @@ def _read_psms(config, psm_list):
)
except psm_utils.io.PSMUtilsIOException:
raise MS2RescoreConfigurationError(
"Error occurred while reading PSMs. Please check the `psm_file` and "
"`psm_file_type` settings. See "
"Error occurred while reading PSMs. Please check the 'psm_file' and "
"'psm_file_type' settings. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
" for more information."
)
Expand Down Expand Up @@ -134,7 +132,7 @@ def _find_decoys(config, psm_list):
if not any(psm_list["is_decoy"]):
raise MS2RescoreConfigurationError(
"No decoy PSMs found. Please check if decoys are present in the PSM file and that "
"the `id_decoy_pattern` option is correct. See "
"the 'id_decoy_pattern' option is correct. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
" for more information."
)
Expand All @@ -155,7 +153,7 @@ def _match_psm_ids(old_id, regex_pattern):
return match[1]
except (TypeError, IndexError):
raise MS2RescoreConfigurationError(
f"`psm_id_pattern` could not be extracted from PSM spectrum IDs (i.e. {old_id})."
f"'psm_id_pattern' could not be extracted from PSM spectrum IDs (i.e. {old_id})."
" Ensure that the regex contains a capturing group?"
)

Expand Down
152 changes: 30 additions & 122 deletions ms2rescore/parse_spectra.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,142 +3,50 @@
import logging
import re
from itertools import chain
from typing import Dict, Tuple

from ms2rescore_rs import get_precursor_info
from psm_utils import PSMList
from pyteomics.mgf import IndexedMGF
from pyteomics.mzml import PreIndexedMzML
from rich.progress import track

from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.utils import infer_spectrum_path

logger = logging.getLogger(__name__)


def fill_missing_values(config, psm_list, missing_rt=False, missing_im=False):
def get_missing_values(
psm_list: PSMList, config: dict, rt_required: bool = False, im_required: bool = False
):
"""Get missing RT/IM features from spectrum file."""
logger.debug("Extracting missing RT/IM values from spectrum file(s).")

psm_dict = psm_list.get_psm_dict()
for runs in psm_dict.values():
for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
for run, psms in runs.items():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_file = infer_spectrum_path(config["spectrum_path"], run)

if missing_im or missing_rt:
if spectrum_file.suffix.lower() == ".mzml":
_parse_values_from_mzml(
psm_list_run, spectrum_file, config, missing_rt, missing_im
)
elif spectrum_file.suffix.lower() == ".mgf":
_parse_values_from_mgf(
psm_list_run, spectrum_file, config, missing_rt, missing_im
)
else:
raise MS2RescoreError(
f"Could not parse retention time and/or ion mobility from spectrum file for run {run}. "
"Please make sure that the spectrum file is either in mzML or MGF format."
)


def _parse_values_from_mgf(
psm_list_run, spectrum_file, config, missing_rt, missing_im
) -> Tuple[Dict, Dict]:
"""Parse retention time and/or ion mobility from an mzML file."""

mgf = IndexedMGF(str(spectrum_file))
spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)

try:
mapper = {
spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
for spectrum_id in mgf._offset_index.mapping["spectrum"].keys()
}
except AttributeError:
raise ParseMGFError(
"Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
)

spectra = {spectrum_id: mgf.get_by_id(spectrum_id) for spectrum_id in mapper.values()}

for psm in psm_list_run:
spectrum = spectra.get(mapper[psm.spectrum_id])
if spectrum is None:
raise ParsingError(f"Could not find spectrum with ID {psm.spectrum_id} in MGF file.")

if missing_rt and "params" in spectrum and "rtinseconds" in spectrum["params"]:
psm.retention_time = float(spectrum["params"]["rtinseconds"])

if missing_im and "params" in spectrum and "ion_mobility" in spectrum["params"]:
psm.ion_mobility = float(spectrum["params"]["ion_mobility"])


def _parse_values_from_mzml(
psm_list_run, spectrum_file, config, missing_rt, missing_im
) -> Tuple[Dict, Dict]:
"""Parse retention time and/or ion mobility from an mzML file."""

mzml = PreIndexedMzML(str(spectrum_file))
spectrum_id_pattern = re.compile(
config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
)

try:
mapper = {
spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
for spectrum_id in mzml._offset_index.mapping["spectrum"].keys()
}
except AttributeError as e:
raise ParseMGFError(
"Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
) from e

spectra = {spectrum_id: mzml.get_by_id(spectrum_id) for spectrum_id in mapper.values()}

for psm in psm_list_run:
spectrum = spectra.get(mapper[psm.spectrum_id])
if spectrum is None:
raise ParsingError(f"Could not find spectrum with ID {psm.spectrum_id} in mzML file.")

if (
missing_rt
and "scanList" in spectrum
and "scan" in spectrum["scanList"]
and spectrum["scanList"]["scan"]
):
psm.retention_time = float(spectrum["scanList"]["scan"][0].get("scan start time", 0))

if missing_im:
if (
"precursorList" in spectrum
and "precursor" in spectrum["precursorList"]
and spectrum["precursorList"]["precursor"]
):
psm.ion_mobility = float(
spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][
0
].get("inverse reduced ion mobility", 0)
)
elif (
"scanList" in spectrum
and "scan" in spectrum["scanList"]
and spectrum["scanList"]["scan"]
):
psm.ion_mobility = float(
spectrum["scanList"]["scan"][0].get("reverse ion mobility", 0)
)


class ParseMGFError(MS2RescoreError):
"""Error parsing MGF file."""

pass


class ParsingError(MS2RescoreError):
logger.debug("Reading spectrum file: '%s'", spectrum_file)
precursors = get_precursor_info(str(spectrum_file))

if config["spectrum_id_pattern"]:
spectrum_id_pattern = re.compile(config["spectrum_id_pattern"])
precursors = {
spectrum_id_pattern.search(spectrum_id).group(1): precursor
for spectrum_id, precursor in precursors.items()
}

for psm in psm_list_run:
try:
if rt_required:
psm.retention_time = precursors[psm.spectrum_id].rt
if im_required:
psm.ion_mobility = precursors[psm.spectrum_id].im
if not psm.precursor_mz:
psm.precursor_mz = precursors[psm.spectrum_id].mz
except KeyError as e:
raise SpectrumParsingError(
f"Could not extract missing RT/IM values from spectrum file for run {run}."
) from e


class SpectrumParsingError(MS2RescoreError):
"""Error parsing retention time from spectrum file."""

pass
26 changes: 25 additions & 1 deletion ms2rescore/report/charts.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,14 @@ def score_scatter_plot(
Column with index for each PSM, peptide, or protein to use for merging data frames.
"""
if not before or not after:
figure = go.Figure()
figure.add_annotation(
text="No data available for comparison.",
showarrow=False,
)
return figure

# Restructure data
merge_columns = [indexer, "mokapot score", "mokapot q-value", "mokapot PEP"]
ce_psms_targets = pd.merge(
Expand Down Expand Up @@ -288,6 +296,14 @@ def fdr_plot_comparison(
Column with index for each PSM, peptide, or protein to use for merging dataframes.
"""
if not before or not after:
figure = go.Figure()
figure.add_annotation(
text="No data available for comparison.",
showarrow=False,
)
return figure

# Prepare data
ce_psms_targets_melted = (
pd.merge(
Expand Down Expand Up @@ -336,7 +352,7 @@ def fdr_plot_comparison(
def identification_overlap(
before: mokapot.LinearConfidence,
after: mokapot.LinearConfidence,
) -> go.Figure():
) -> go.Figure:
"""
Plot stacked bar charts of removed, retained, and gained PSMs, peptides, and proteins.
Expand All @@ -348,6 +364,14 @@ def identification_overlap(
Mokapot linear confidence results after rescoring.
"""
if not before or not after:
figure = go.Figure()
figure.add_annotation(
text="No data available for comparison.",
showarrow=False,
)
return figure

levels = before.levels # ["psms", "peptides", "proteins"] if all available
indexers = ["index", "index", "mokapot protein group"]

Expand Down
5 changes: 5 additions & 0 deletions ms2rescore/report/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,11 @@ def _get_stats_context(confidence_before, confidence_after):
levels = ["psms", "peptides", "proteins"]
level_names = ["PSMs", "Peptides", "Protein groups"]
card_colors = ["card-bg-blue", "card-bg-green", "card-bg-red"]

# Cannot report stats if confidence estimates are not present
if not confidence_before or not confidence_after:
return stats

for level, level_name, card_color in zip(levels, level_names, card_colors):
try:
before = confidence_before.accepted[level.lower()]
Expand Down
7 changes: 2 additions & 5 deletions ms2rescore/report/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,7 @@ def get_confidence_estimates(

try:
confidence[when] = lin_psm_dataset.assign_confidence()
except RuntimeError as e:
raise ReportGenerationError(
f"Error while assigning confidence estimates to PSMs ({when} rescoring). "
"Could not generate report."
) from e
except RuntimeError:
confidence[when] = None

return confidence["before"], confidence["after"]
4 changes: 4 additions & 0 deletions ms2rescore/rescoring_engines/mokapot.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ def convert_psm_list(
feature_df.columns = [f"feature:{f}" for f in feature_df.columns]
combined_df = pd.concat([psm_df[required_columns], feature_df], axis=1)

# Ensure filename for FlashLFQ txt output
if not combined_df["run"].notnull().all():
combined_df["run"] = "ms_run"

feature_names = [f"feature:{f}" for f in feature_names] if feature_names else None

lin_psm_data = LinearPsmDataset(
Expand Down
Loading

0 comments on commit 86ba7b6

Please sign in to comment.