From bc6c314a076f226c906950df2d3e3fb975e7c0f0 Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 8 Jul 2024 15:19:23 +0200 Subject: [PATCH] Add options `max_psm_rank_input` and `max_psm_rank_output` to control multi-rank rescoring. --- docs/source/config_schema.md | 2 + docs/source/userguide/configuration.rst | 30 +++++ ms2rescore/core.py | 106 +++++++++++------ ms2rescore/package_data/config_default.json | 2 + ms2rescore/package_data/config_schema.json | 12 ++ ms2rescore/parse_psms.py | 125 ++++++++++---------- ms2rescore/report/charts.py | 36 +++--- ms2rescore/rescoring_engines/mokapot.py | 5 +- ms2rescore/rescoring_engines/percolator.py | 2 + 9 files changed, 201 insertions(+), 119 deletions(-) diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index ad6938c..209bc00 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -57,6 +57,8 @@ - *string* - *null* - **`lower_score_is_better`** *(boolean)*: Bool indicating if lower score is better. Default: `false`. + - **`max_psm_rank_input`** *(number)*: Maximum rank of PSMs to use as input for rescoring. Minimum: `1`. Default: `10`. + - **`max_psm_rank_output`** *(number)*: Maximum rank of PSMs to return after rescoring, before final FDR calculation. Minimum: `1`. Default: `1`. - **`modification_mapping`** *(object)*: Mapping of modification labels to each replacement label. Default: `{}`. - **`fixed_modifications`** *(object)*: Mapping of amino acids with fixed modifications to the modification name. Can contain additional properties. Default: `{}`. - **`processes`** *(number)*: Number of parallel processes to use; -1 for all available. Minimum: `-1`. Default: `-1`. diff --git a/docs/source/userguide/configuration.rst b/docs/source/userguide/configuration.rst index 8b68ea4..3936d41 100644 --- a/docs/source/userguide/configuration.rst +++ b/docs/source/userguide/configuration.rst @@ -240,6 +240,36 @@ expression pattern that extracts the decoy status from the protein name: decoy_pattern = "DECOY_" +Multi-rank rescoring +==================== + +Some search engines, such as MaxQuant, report multiple candidate PSMs for the same spectrum. +MSĀ²Rescore can rescore multiple candidate PSMs per spectrum. This allows for lower-ranking +candidate PSMs to become the top-ranked PSM after rescoring. This behavior can be controlled with +the ``max_psm_rank_input`` option. + +To ensure a correct FDR control after rescoring, MSĀ²Rescore filters out lower-ranking PSMs before +final FDR calculation and writing the output files. To allow for lower-ranking PSMs to be included +in the final output - for instance, to consider chimeric spectra - the ``max_psm_rank_output`` +option can be used. + +For example, to rescore the top 5 PSMs per spectrum and output the best PSM after rescoring, +the following configuration can be used: + +.. tab:: JSON + + .. code-block:: json + + "max_psm_rank_input": 5 + "max_psm_rank_output": 1 + +.. tab:: TOML + + .. code-block:: toml + + max_psm_rank_input = 5 + max_psm_rank_output = 1 + All configuration options ========================= diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 176fa57..3f4d9cb 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -47,7 +47,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: psm_list = parse_psms(config, psm_list) # Log #PSMs identified before rescoring - id_psms_before = _log_id_psms_before(psm_list) + id_psms_before = _log_id_psms_before(psm_list, max_rank=config["max_psm_rank_output"]) # Define feature names; get existing feature names from PSM file feature_names = dict() @@ -62,7 +62,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: ) # Add missing precursor info from spectrum file if needed - _fill_missing_precursor_info(psm_list, config) + psm_list = _fill_missing_precursor_info(psm_list, config) # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): @@ -145,22 +145,26 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: **config["rescoring_engine"]["mokapot"], ) except exceptions.RescoringError as e: - logger.exception(e) - rescoring_succeeded = False - else: - rescoring_succeeded = True - _log_id_psms_after(psm_list, id_psms_before) + # Write output + logger.info(f"Writing intermediary output to {output_file_root}.psms.tsv...") + psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") + + # Reraise exception + raise e - # Workaround for broken PEP calculation if best PSM is decoy + # Post-rescoring processing if all(psm_list["pep"] == 1.0): psm_list = _fix_constant_pep(psm_list) + psm_list = _filter_by_rank(psm_list, config["max_psm_rank_output"], False) + psm_list = _calculate_confidence(psm_list) + _ = _log_id_psms_after(psm_list, id_psms_before, max_rank=config["max_psm_rank_output"]) # Write output logger.info(f"Writing output to {output_file_root}.psms.tsv...") psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") # Write report - if config["write_report"] and rescoring_succeeded: + if config["write_report"]: try: generate.generate_report( output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True @@ -169,7 +173,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.exception(e) -def _fill_missing_precursor_info(psm_list, config): +def _fill_missing_precursor_info(psm_list: PSMList, config: Dict) -> PSMList: """Fill missing precursor info from spectrum file if needed.""" # Check if required # TODO: avoid hard coding feature generators in some way @@ -211,6 +215,16 @@ def _fill_missing_precursor_info(psm_list, config): [v is not None and not np.isnan(v) for v in psm_list[value_name]] ] + return psm_list + + +def _filter_by_rank(psm_list: PSMList, max_rank: int, lower_score_better: bool) -> PSMList: + """Filter PSMs by rank.""" + psm_list.set_ranks(lower_score_better=lower_score_better) + rank_filter = psm_list["rank"] <= max_rank + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {max_rank}.") + return psm_list[rank_filter] + def _write_feature_names(feature_names, output_file_root): """Write feature names to file.""" @@ -221,31 +235,39 @@ def _write_feature_names(feature_names, output_file_root): f.write(f"{fgen}\t{feature}\n") -def _log_id_psms_before(psm_list): +def _log_id_psms_before(psm_list: PSMList, fdr: float = 0.01, max_rank: int = 1) -> int: """Log #PSMs identified before rescoring.""" id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) ).sum() - logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before) + logger.info( + f"Found {id_psms_before} identified PSMs with rank <= {max_rank} at {fdr} FDR before " + "rescoring." + ) return id_psms_before -def _log_id_psms_after(psm_list, id_psms_before): +def _log_id_psms_after( + psm_list: PSMList, id_psms_before: int, fdr: float = 0.01, max_rank: int = 1 +) -> int: """Log #PSMs identified after rescoring.""" id_psms_after = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + (psm_list["qvalue"] <= 0.01) & (psm_list["rank"] <= max_rank) & (~psm_list["is_decoy"]) ).sum() diff = id_psms_after - id_psms_before diff_perc = diff / id_psms_before if id_psms_before > 0 else None diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff) diff_word = "more" if diff > 0 else "less" - logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.") + logger.info( + f"Identified {diff_numbers} {diff_word} PSMs with rank <= {max_rank} at {fdr} FDR after " + "rescoring." + ) return id_psms_after -def _fix_constant_pep(psm_list): +def _fix_constant_pep(psm_list: PSMList) -> PSMList: """Workaround for broken PEP calculation if best PSM is decoy.""" logger.warning( "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the " @@ -257,30 +279,36 @@ def _fix_constant_pep(psm_list): if not higher_scoring_decoys.any(): logger.warning("No decoys scoring higher than the best target found. Skipping fix.") else: - logger.warning(f"Removing {higher_scoring_decoys.sum()} decoy PSMs.") - psm_list = psm_list[~higher_scoring_decoys] + logger.warning(f"Removed {higher_scoring_decoys.sum()} decoy PSMs.") - # Minimal conversion to LinearPsmDataset - psm_df = psm_list.to_dataframe() - psm_df = psm_df.reset_index(drop=True).reset_index() - psm_df["peptide"] = ( - psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) - ) - psm_df["is_target"] = ~psm_df["is_decoy"] - lin_psm_data = LinearPsmDataset( - psms=psm_df[["index", "peptide", "score", "is_target"]], - target_column="is_target", - spectrum_columns="index", # Use artificial index to allow multi-rank rescoring - peptide_column="peptide", - feature_columns=["score"], - ) + return psm_list + + +def _calculate_confidence(psm_list: PSMList) -> PSMList: + """ + Calculate scores, q-values, and PEPs for PSMs and peptides and add them to PSMList. + """ + # Minimal conversion to LinearPsmDataset + psm_df = psm_list.to_dataframe() + psm_df = psm_df.reset_index(drop=True).reset_index() + psm_df["peptide"] = ( + psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) + ) + psm_df["is_target"] = ~psm_df["is_decoy"] + lin_psm_data = LinearPsmDataset( + psms=psm_df[["index", "peptide", "score", "is_target"]], + target_column="is_target", + spectrum_columns="index", # Use artificial index to allow multi-rank rescoring + peptide_column="peptide", + feature_columns=["score"], + ) - # Recalculate confidence - new_confidence = lin_psm_data.assign_confidence() + # Recalculate confidence + new_confidence = lin_psm_data.assign_confidence() - # Add new confidence estimations to PSMList - add_psm_confidence(psm_list, new_confidence) - add_peptide_confidence(psm_list, new_confidence) + # Add new confidence estimations to PSMList + add_psm_confidence(psm_list, new_confidence) + add_peptide_confidence(psm_list, new_confidence) - return psm_list + return psm_list diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 765c1a0..7dd8894 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -33,6 +33,8 @@ "psm_id_rt_pattern": null, "psm_id_im_pattern": null, "lower_score_is_better": false, + "max_psm_rank_input": 10, + "max_psm_rank_output": 1, "modification_mapping": {}, "fixed_modifications": {}, "processes": -1, diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index dfed7ca..ab77f3b 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -131,6 +131,18 @@ "type": "boolean", "default": false }, + "max_psm_rank_input": { + "description": "Maximum rank of PSMs to use as input for rescoring", + "type": "number", + "default": 10, + "minimum": 1 + }, + "max_psm_rank_output": { + "description": "Maximum rank of PSMs to return after rescoring, before final FDR calculation", + "type": "number", + "default": 1, + "minimum": 1 + }, "modification_mapping": { "description": "Mapping of modification labels to each replacement label.", "type": "object", diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index cc8440b..12edd0e 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,6 +1,6 @@ import logging import re -from typing import Dict, Union +from typing import Dict, Optional, Union import numpy as np import psm_utils.io @@ -24,14 +24,30 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``. """ - # Read PSMs, find decoys, calculate q-values - psm_list = _read_psms(config, psm_list) + # Read PSMs + try: + psm_list = _read_psms(config, psm_list) + except psm_utils.io.PSMUtilsIOException: + raise MS2RescoreConfigurationError( + "Error occurred while reading PSMs. Please check the 'psm_file' and " + "'psm_file_type' settings. See " + "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" + " for more information." + ) + + # Filter by PSM rank + psm_list.set_ranks(config["lower_score_is_better"]) + rank_filter = psm_list["rank"] <= config["max_psm_rank_input"] + psm_list = psm_list[rank_filter] + logger.info(f"Removed {sum(~rank_filter)} PSMs with rank >= {config['max_psm_rank_input']}.") + + # Remove invalid AAs, find decoys, calculate q-values psm_list = _remove_invalid_aa(psm_list) - _find_decoys(config, psm_list) - _calculate_qvalues(config, psm_list) + _find_decoys(psm_list, config["id_decoy_pattern"]) + _calculate_qvalues(psm_list, config["lower_score_is_better"]) if config["psm_id_rt_pattern"] or config["psm_id_im_pattern"]: logger.debug("Parsing retention time and/or ion mobility from PSM identifier...") - _parse_values_spectrum_id(config, psm_list) + _parse_values_from_spectrum_id(config, psm_list) # Store scoring values for comparison later for psm in psm_list: @@ -79,39 +95,30 @@ def _read_psms(config, psm_list): if isinstance(psm_list, PSMList): return psm_list else: - logger.info("Reading PSMs from file...") total_files = len(config["psm_file"]) psm_list = [] for current_file, psm_file in enumerate(config["psm_file"]): logger.info( f"Reading PSMs from PSM file ({current_file+1}/{total_files}): '{psm_file}'..." ) - try: - psm_list.extend( - psm_utils.io.read_file( - psm_file, - filetype=config["psm_file_type"], - show_progressbar=True, - **config["psm_reader_kwargs"], - ) - ) - except psm_utils.io.PSMUtilsIOException: - raise MS2RescoreConfigurationError( - "Error occurred while reading PSMs. Please check the 'psm_file' and " - "'psm_file_type' settings. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" - " for more information." + psm_list.extend( + psm_utils.io.read_file( + psm_file, + filetype=config["psm_file_type"], + show_progressbar=True, + **config["psm_reader_kwargs"], ) + ) logger.debug(f"Read {len(psm_list)} PSMs from '{psm_file}'.") return PSMList(psm_list=psm_list) -def _find_decoys(config, psm_list): +def _find_decoys(psm_list: PSMList, id_decoy_pattern: Optional[str] = None): """Find decoys in PSMs, log amount, and raise error if none found.""" logger.debug("Finding decoys...") - if config["id_decoy_pattern"]: - psm_list.find_decoys(config["id_decoy_pattern"]) + if id_decoy_pattern: + psm_list.find_decoys(id_decoy_pattern) n_psms = len(psm_list) percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100 @@ -126,12 +133,12 @@ def _find_decoys(config, psm_list): ) -def _calculate_qvalues(config, psm_list): +def _calculate_qvalues(psm_list: PSMList, lower_score_is_better: bool): """Calculate q-values for PSMs if not present.""" # Calculate q-values if not present if None in psm_list["qvalue"]: logger.debug("Recalculating q-values...") - psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"]) + psm_list.calculate_qvalues(reverse=not lower_score_is_better) def _match_psm_ids(old_id, regex_pattern): @@ -146,50 +153,38 @@ def _match_psm_ids(old_id, regex_pattern): ) -def _parse_values_spectrum_id(config, psm_list): +def _parse_values_from_spectrum_id( + psm_list: PSMList, + psm_id_rt_pattern: Optional[str] = None, + psm_id_im_pattern: Optional[str] = None, +): """Parse retention time and or ion mobility values from the spectrum_id.""" - - if config["psm_id_rt_pattern"]: - logger.debug( - "Parsing retention time from spectrum_id with regex pattern " - f"{config['psm_id_rt_pattern']}" - ) - try: - rt_pattern = re.compile(config["psm_id_rt_pattern"]) - psm_list["retention_time"] = [ - float(rt_pattern.search(psm.spectrum_id).group(1)) for psm in psm_list - ] - except AttributeError: - raise MS2RescoreConfigurationError( - f"Could not parse retention time from spectrum_id with the " - f"{config['psm_id_rt_pattern']} regex pattern. " - f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n." - "Please make sure the retention time key is present in the spectrum_id " - "and the value is in a capturing group or disable the relevant feature generator." - ) - - if config["psm_id_im_pattern"]: - logger.debug( - "Parsing ion mobility from spectrum_id with regex pattern " - f"{config['psm_id_im_pattern']}" - ) - try: - im_pattern = re.compile(config["psm_id_im_pattern"]) - psm_list["ion_mobility"] = [ - float(im_pattern.search(psm.spectrum_id).group(1)) for psm in psm_list - ] - except AttributeError: - raise MS2RescoreConfigurationError( - f"Could not parse ion mobility from spectrum_id with the " - f"{config['psm_id_im_pattern']} regex pattern. " - "Please make sure the ion mobility key is present in the spectrum_id " - "and the value is in a capturing group or disable the relevant feature generator." + for pattern, label, key in zip( + [psm_id_rt_pattern, psm_id_im_pattern], + ["retention time", "ion mobility"], + ["retention_time", "ion_mobility"], + ): + if pattern: + logger.debug( + f"Parsing {label} from spectrum_id with regex pattern " f"{psm_id_rt_pattern}" ) + try: + pattern = re.compile(pattern) + psm_list[key] = [ + float(pattern.search(psm.spectrum_id).group(1)) for psm in psm_list + ] + except AttributeError: + raise MS2RescoreConfigurationError( + f"Could not parse {label} from spectrum_id with the " + f"{pattern} regex pattern. " + f"Example spectrum_id: '{psm_list[0].spectrum_id}'\n. " + f"Please make sure the {label} key is present in the spectrum_id " + "and the value is in a capturing group or disable the relevant feature generator." + ) def _remove_invalid_aa(psm_list: PSMList) -> PSMList: """Remove PSMs with invalid amino acids.""" - logger.debug("Removing PSMs with invalid amino acids...") invalid_psms = np.array( [any(aa in "BJOUXZ" for aa in psm.peptidoform.sequence) for psm in psm_list] ) diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py index 8313228..669bcc6 100644 --- a/ms2rescore/report/charts.py +++ b/ms2rescore/report/charts.py @@ -242,16 +242,22 @@ def score_scatter_plot( ce_psms = pd.concat([ce_psms_targets, ce_psms_decoys], axis=0) # Get score thresholds - score_threshold_before = ( - ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold] - .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] - .iloc[0] - ) - score_threshold_after = ( - ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold] - .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] - .iloc[0] - ) + try: + score_threshold_before = ( + ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold] + .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] + .iloc[0] + ) + except IndexError: # No PSMs below threshold + score_threshold_before = None + try: + score_threshold_after = ( + ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold] + .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] + .iloc[0] + ) + except IndexError: # No PSMs below threshold + score_threshold_after = None # Plot fig = px.scatter( @@ -268,10 +274,12 @@ def score_scatter_plot( }, ) # draw FDR thresholds - fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) - fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) - fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) - fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) + if score_threshold_before: + fig.add_vline(x=score_threshold_before, line_dash="dash", row=1, col=1) + fig.add_vline(x=score_threshold_before, line_dash="dash", row=2, col=1) + if score_threshold_after: + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=1) + fig.add_hline(y=score_threshold_after, line_dash="dash", row=1, col=2) return fig diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index 86dc20d..f02d877 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -223,7 +223,7 @@ def save_model_weights( def add_psm_confidence( psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence ) -> None: - """Add Mokapot PSM-level confidence estimates to PSM list.""" + """Add PSM-level confidence estimates to PSM list, updating score, qvalue, pep, and rank.""" # Reshape confidence estimates to match PSMList keys = ["mokapot score", "mokapot q-value", "mokapot PEP"] mokapot_values_targets = ( @@ -241,6 +241,9 @@ def add_psm_confidence( psm_list["qvalue"] = q[:, 1] psm_list["pep"] = q[:, 2] + # Reset ranks to match new scores + psm_list.set_ranks(lower_score_better=False) + def add_peptide_confidence( psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence diff --git a/ms2rescore/rescoring_engines/percolator.py b/ms2rescore/rescoring_engines/percolator.py index c6ea3d3..9952aed 100644 --- a/ms2rescore/rescoring_engines/percolator.py +++ b/ms2rescore/rescoring_engines/percolator.py @@ -175,6 +175,8 @@ def _update_psm_scores( original_psm["qvalue"] = new_psm["qvalue"] original_psm["pep"] = new_psm["pep"] + psm_list.set_ranks(lower_score_better=False) + def _write_pin_file(psm_list: psm_utils.PSMList, filepath: str): """Write PIN file for rescoring."""