From 3047f28e895bd0eb08a1e08698cf9413ccd07d7a Mon Sep 17 00:00:00 2001 From: RalfG Date: Thu, 4 Jul 2024 19:00:44 +0200 Subject: [PATCH] Added: - Add option to pass train_fdr to mokapot; Changed: - Split functions to add mokapot confidence - Implement workaround for broken PEP calculation if best PSM is decoy - Still write PSMs to file, even if rescoring step failed. Still skip report generation --- docs/source/config_schema.md | 2 + ms2rescore/core.py | 107 +++++++++++++++----- ms2rescore/exceptions.py | 6 ++ ms2rescore/package_data/config_default.json | 1 + ms2rescore/package_data/config_schema.json | 15 ++- ms2rescore/report/charts.py | 5 +- ms2rescore/rescoring_engines/mokapot.py | 102 ++++++++++++------- 7 files changed, 168 insertions(+), 70 deletions(-) diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index 37d98047..ad6938cc 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -66,6 +66,7 @@ - *string* - *null* - **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`. + - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`. ## Definitions - **`feature_generator`** *(object)*: Feature generator configuration. Can contain additional properties. @@ -87,6 +88,7 @@ - **`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*. - **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`. - **`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*. + - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`. - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`. - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`. - **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`. diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 4690ac91..176fa573 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -5,6 +5,7 @@ import numpy as np import psm_utils.io +from mokapot.dataset import LinearPsmDataset from psm_utils import PSMList from ms2rescore import exceptions @@ -13,6 +14,7 @@ from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator +from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence logger = logging.getLogger(__name__) @@ -104,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logging.debug(f"Creating USIs for {len(psm_list)} PSMs") psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list] - # If no rescoring engine is specified, write PSMs and features to PIN file - if not config["rescoring_engine"]: + # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file + if not config["rescoring_engine"] or config["log_level"] == "debug": logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin") psm_utils.io.write_file( psm_list, @@ -113,42 +115,52 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: filetype="percolator", feature_names=all_feature_names, ) + + if not config["rescoring_engine"]: + logger.info("No rescoring engine specified. Skipping rescoring.") return None # Rescore PSMs - if "percolator" in config["rescoring_engine"]: - percolator.rescore( - psm_list, - output_file_root=output_file_root, - log_level=config["log_level"], - processes=config["processes"], - percolator_kwargs=config["rescoring_engine"]["percolator"], - ) - elif "mokapot" in config["rescoring_engine"]: - if "fasta_file" not in config["rescoring_engine"]["mokapot"]: - config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] - if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: - protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") - else: - protein_kwargs = dict() - - mokapot.rescore( - psm_list, - output_file_root=output_file_root, - protein_kwargs=protein_kwargs, - **config["rescoring_engine"]["mokapot"], - ) + try: + if "percolator" in config["rescoring_engine"]: + percolator.rescore( + psm_list, + output_file_root=output_file_root, + log_level=config["log_level"], + processes=config["processes"], + percolator_kwargs=config["rescoring_engine"]["percolator"], + ) + elif "mokapot" in config["rescoring_engine"]: + if "fasta_file" not in config["rescoring_engine"]["mokapot"]: + config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] + if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: + protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") + else: + protein_kwargs = dict() + + mokapot.rescore( + psm_list, + output_file_root=output_file_root, + protein_kwargs=protein_kwargs, + **config["rescoring_engine"]["mokapot"], + ) + except exceptions.RescoringError as e: + logger.exception(e) + rescoring_succeeded = False else: - logger.info("No known rescoring engine specified. Skipping rescoring.") + rescoring_succeeded = True + _log_id_psms_after(psm_list, id_psms_before) - _log_id_psms_after(psm_list, id_psms_before) + # Workaround for broken PEP calculation if best PSM is decoy + if all(psm_list["pep"] == 1.0): + psm_list = _fix_constant_pep(psm_list) # Write output logger.info(f"Writing output to {output_file_root}.psms.tsv...") psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") # Write report - if config["write_report"]: + if config["write_report"] and rescoring_succeeded: try: generate.generate_report( output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True @@ -231,3 +243,44 @@ def _log_id_psms_after(psm_list, id_psms_before): logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.") return id_psms_after + + +def _fix_constant_pep(psm_list): + """Workaround for broken PEP calculation if best PSM is decoy.""" + logger.warning( + "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the " + "best target PSM." + ) + max_target_score = psm_list["score"][~psm_list["is_decoy"]].max() + higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score) + + if not higher_scoring_decoys.any(): + logger.warning("No decoys scoring higher than the best target found. Skipping fix.") + else: + logger.warning(f"Removing {higher_scoring_decoys.sum()} decoy PSMs.") + + psm_list = psm_list[~higher_scoring_decoys] + + # Minimal conversion to LinearPsmDataset + psm_df = psm_list.to_dataframe() + psm_df = psm_df.reset_index(drop=True).reset_index() + psm_df["peptide"] = ( + psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True) + ) + psm_df["is_target"] = ~psm_df["is_decoy"] + lin_psm_data = LinearPsmDataset( + psms=psm_df[["index", "peptide", "score", "is_target"]], + target_column="is_target", + spectrum_columns="index", # Use artificial index to allow multi-rank rescoring + peptide_column="peptide", + feature_columns=["score"], + ) + + # Recalculate confidence + new_confidence = lin_psm_data.assign_confidence() + + # Add new confidence estimations to PSMList + add_psm_confidence(psm_list, new_confidence) + add_peptide_confidence(psm_list, new_confidence) + + return psm_list diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py index a2c828ba..ba5492a2 100644 --- a/ms2rescore/exceptions.py +++ b/ms2rescore/exceptions.py @@ -35,3 +35,9 @@ class ReportGenerationError(MS2RescoreError): """Error while generating report.""" pass + + +class RescoringError(MS2RescoreError): + """Error while rescoring PSMs.""" + + pass diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json index 575724c2..765c1a00 100644 --- a/ms2rescore/package_data/config_default.json +++ b/ms2rescore/package_data/config_default.json @@ -14,6 +14,7 @@ }, "rescoring_engine": { "mokapot": { + "train_fdr": 0.01, "write_weights": true, "write_txt": true, "write_flashlfq": true diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index e07290a2..dfed7cac 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -68,7 +68,11 @@ }, "psm_file": { "description": "Path to file with peptide-spectrum matches.", - "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }] + "oneOf": [ + { "type": "string" }, + { "type": "null" }, + { "type": "array", "items": { "type": "string" } } + ] }, "psm_file_type": { "description": "PSM file type. By default inferred from file extension.", @@ -159,7 +163,7 @@ "default": false }, "profile": { - "description": "Write an txt report using cProfile for profiling", + "description": "Write a txt report using cProfile for profiling", "type": "boolean", "default": false } @@ -263,6 +267,13 @@ "type": "object", "additionalProperties": true, "properties": { + "train_fdr": { + "description": "FDR threshold for training Mokapot", + "type": "number", + "minimum": 0, + "maximum": 1, + "default": 0.01 + }, "write_weights": { "description": "Write Mokapot weights to a text file", "type": "boolean", diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py index 8d802b8b..83132281 100644 --- a/ms2rescore/report/charts.py +++ b/ms2rescore/report/charts.py @@ -198,6 +198,7 @@ def score_scatter_plot( after: mokapot.LinearConfidence, level: str = "psms", indexer: str = "index", + fdr_threshold: float = 0.01, ) -> go.Figure: """ Plot PSM scores before and after rescoring. @@ -242,12 +243,12 @@ def score_scatter_plot( # Get score thresholds score_threshold_before = ( - ce_psms[ce_psms["mokapot q-value before"] <= 0.01] + ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold] .sort_values("mokapot q-value before", ascending=False)["mokapot score before"] .iloc[0] ) score_threshold_after = ( - ce_psms[ce_psms["mokapot q-value after"] <= 0.01] + ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold] .sort_values("mokapot q-value after", ascending=False)["mokapot score after"] .iloc[0] ) diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index 7b54e2a9..86dc20da 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -29,8 +29,11 @@ import psm_utils from mokapot.brew import brew from mokapot.dataset import LinearPsmDataset +from mokapot.model import PercolatorModel from pyteomics.mass import nist_mass +from ms2rescore.exceptions import RescoringError + logger = logging.getLogger(__name__) logging.getLogger("numba").setLevel(logging.WARNING) @@ -39,6 +42,7 @@ def rescore( psm_list: psm_utils.PSMList, output_file_root: str = "ms2rescore", fasta_file: Optional[str] = None, + train_fdr: float = 0.01, write_weights: bool = False, write_txt: bool = False, write_flashlfq: bool = False, @@ -65,6 +69,8 @@ def rescore( fasta_file Path to FASTA file with protein sequences to use for protein inference. Defaults to ``None``. + train_fdr + FDR to use for training the Mokapot model. Defaults to ``0.01``. write_weights Write model weights to a text file. Defaults to ``False``. write_txt @@ -91,46 +97,15 @@ def rescore( # Rescore logger.debug(f"Mokapot brew options: `{kwargs}`") - confidence_results, models = brew(lin_psm_data, rng=8, **kwargs) - - # Reshape confidence estimates to match PSMList - keys = ["mokapot score", "mokapot q-value", "mokapot PEP"] - mokapot_values_targets = ( - confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys] - ) - mokapot_values_decoys = ( - confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys] - ) - q = np.full((len(psm_list), 3), np.nan) - q[mokapot_values_targets.index] = mokapot_values_targets.values - q[mokapot_values_decoys.index] = mokapot_values_decoys.values - - # Add Mokapot results to PSMList - psm_list["score"] = q[:, 0] - psm_list["qvalue"] = q[:, 1] - psm_list["pep"] = q[:, 2] - - # Repeat for peptide-level scores - peptides_targets = confidence_results.confidence_estimates["peptides"].set_index(["peptide"])[ - keys - ] - peptides_decoys = confidence_results.decoy_confidence_estimates["peptides"].set_index( - ["peptide"] - )[keys] - peptide_info = pd.concat([peptides_targets, peptides_decoys], axis=0).to_dict(orient="index") - - # Add peptide-level scores to PSM metadata - # run_key = "na" if not all(psm.run for psm in psm_list) else None - no_charge_pattern = re.compile(r"(/\d+$)") - for psm in psm_list: - peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))] - psm.metadata.update( - { - "peptide_score": peptide_scores["mokapot score"], - "peptide_qvalue": peptide_scores["mokapot q-value"], - "peptide_pep": peptide_scores["mokapot PEP"], - } + try: + confidence_results, models = brew( + lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs ) + except RuntimeError as e: + raise RescoringError("Mokapot could not be run. Please check the input data.") from e + + add_psm_confidence(psm_list, confidence_results) + add_peptide_confidence(psm_list, confidence_results) # Write results if write_weights: @@ -245,6 +220,55 @@ def save_model_weights( ) +def add_psm_confidence( + psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence +) -> None: + """Add Mokapot PSM-level confidence estimates to PSM list.""" + # Reshape confidence estimates to match PSMList + keys = ["mokapot score", "mokapot q-value", "mokapot PEP"] + mokapot_values_targets = ( + confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys] + ) + mokapot_values_decoys = ( + confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys] + ) + q = np.full((len(psm_list), 3), np.nan) + q[mokapot_values_targets.index] = mokapot_values_targets.values + q[mokapot_values_decoys.index] = mokapot_values_decoys.values + + # Add Mokapot results to PSMList + psm_list["score"] = q[:, 0] + psm_list["qvalue"] = q[:, 1] + psm_list["pep"] = q[:, 2] + + +def add_peptide_confidence( + psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence +) -> None: + """Add Mokapot peptide-level confidence estimates to PSM list.""" + keys = ["mokapot score", "mokapot q-value", "mokapot PEP"] + peptide_info = pd.concat( + [ + confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys], + confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys], + ], + axis=0, + ).to_dict(orient="index") + + # Add peptide-level scores to PSM metadata + # run_key = "na" if not all(psm.run for psm in psm_list) else None + no_charge_pattern = re.compile(r"(/\d+$)") + for psm in psm_list: + peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))] + psm.metadata.update( + { + "peptide_score": peptide_scores["mokapot score"], + "peptide_qvalue": peptide_scores["mokapot q-value"], + "peptide_pep": peptide_scores["mokapot PEP"], + } + ) + + def _mz_to_mass(mz: float, charge: int) -> float: """Convert m/z to mass.""" return mz * charge - charge * nist_mass["H"][1][0]