From 3047f28e895bd0eb08a1e08698cf9413ccd07d7a Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Thu, 4 Jul 2024 19:00:44 +0200
Subject: [PATCH] Added: - Add option to pass train_fdr to mokapot;

Changed:
- Split functions to add mokapot confidence
- Implement workaround for broken PEP calculation if best PSM is decoy
- Still write PSMs to file, even if rescoring step failed. Still skip report generation
---
 docs/source/config_schema.md                |   2 +
 ms2rescore/core.py                          | 107 +++++++++++++++-----
 ms2rescore/exceptions.py                    |   6 ++
 ms2rescore/package_data/config_default.json |   1 +
 ms2rescore/package_data/config_schema.json  |  15 ++-
 ms2rescore/report/charts.py                 |   5 +-
 ms2rescore/rescoring_engines/mokapot.py     | 102 ++++++++++++-------
 7 files changed, 168 insertions(+), 70 deletions(-)
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
index 37d98047..ad6938cc 100644
--- a/docs/source/config_schema.md
+++ b/docs/source/config_schema.md
@@ -66,6 +66,7 @@
       - *string*
       - *null*
   - **`write_report`** *(boolean)*: Write an HTML report with various QC metrics and charts. Default: `false`.
+  - **`profile`** *(boolean)*: Write a txt report using cProfile for profiling. Default: `false`.
 ## Definitions
 
 - <a id="definitions/feature_generator"></a>**`feature_generator`** *(object)*: Feature generator configuration. Can contain additional properties.
@@ -87,6 +88,7 @@
 - <a id="definitions/im2deep"></a>**`im2deep`** *(object)*: Ion mobility feature generator configuration using IM2Deep. Can contain additional properties. Refer to *[#/definitions/feature_generator](#definitions/feature_generator)*.
   - **`reference_dataset`** *(string)*: Path to IM2Deep reference dataset file. Default: `"Meier_unimod.parquet"`.
 - <a id="definitions/mokapot"></a>**`mokapot`** *(object)*: Mokapot rescoring engine configuration. Additional properties are passed to the Mokapot brew function. Can contain additional properties. Refer to *[#/definitions/rescoring_engine](#definitions/rescoring_engine)*.
+  - **`train_fdr`** *(number)*: FDR threshold for training Mokapot. Minimum: `0`. Maximum: `1`. Default: `0.01`.
   - **`write_weights`** *(boolean)*: Write Mokapot weights to a text file. Default: `false`.
   - **`write_txt`** *(boolean)*: Write Mokapot results to a text file. Default: `false`.
   - **`write_flashlfq`** *(boolean)*: Write Mokapot results to a FlashLFQ-compatible file. Default: `false`.
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index 4690ac91..176fa573 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import psm_utils.io
+from mokapot.dataset import LinearPsmDataset
 from psm_utils import PSMList
 
 from ms2rescore import exceptions
@@ -13,6 +14,7 @@
 from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
+from ms2rescore.rescoring_engines.mokapot import add_peptide_confidence, add_psm_confidence
 
 logger = logging.getLogger(__name__)
 
@@ -104,8 +106,8 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         logging.debug(f"Creating USIs for {len(psm_list)} PSMs")
         psm_list["spectrum_id"] = [psm.get_usi(as_url=False) for psm in psm_list]
 
-    # If no rescoring engine is specified, write PSMs and features to PIN file
-    if not config["rescoring_engine"]:
+    # If no rescoring engine is specified or DEBUG, write PSMs and features to PIN file
+    if not config["rescoring_engine"] or config["log_level"] == "debug":
         logger.info(f"Writing added features to PIN file: {output_file_root}.psms.pin")
         psm_utils.io.write_file(
             psm_list,
@@ -113,42 +115,52 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
             filetype="percolator",
             feature_names=all_feature_names,
         )
+
+    if not config["rescoring_engine"]:
+        logger.info("No rescoring engine specified. Skipping rescoring.")
         return None
 
     # Rescore PSMs
-    if "percolator" in config["rescoring_engine"]:
-        percolator.rescore(
-            psm_list,
-            output_file_root=output_file_root,
-            log_level=config["log_level"],
-            processes=config["processes"],
-            percolator_kwargs=config["rescoring_engine"]["percolator"],
-        )
-    elif "mokapot" in config["rescoring_engine"]:
-        if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
-            config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
-        if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
-            protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
-        else:
-            protein_kwargs = dict()
-
-        mokapot.rescore(
-            psm_list,
-            output_file_root=output_file_root,
-            protein_kwargs=protein_kwargs,
-            **config["rescoring_engine"]["mokapot"],
-        )
+    try:
+        if "percolator" in config["rescoring_engine"]:
+            percolator.rescore(
+                psm_list,
+                output_file_root=output_file_root,
+                log_level=config["log_level"],
+                processes=config["processes"],
+                percolator_kwargs=config["rescoring_engine"]["percolator"],
+            )
+        elif "mokapot" in config["rescoring_engine"]:
+            if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
+                config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
+            if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
+                protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
+            else:
+                protein_kwargs = dict()
+
+            mokapot.rescore(
+                psm_list,
+                output_file_root=output_file_root,
+                protein_kwargs=protein_kwargs,
+                **config["rescoring_engine"]["mokapot"],
+            )
+    except exceptions.RescoringError as e:
+        logger.exception(e)
+        rescoring_succeeded = False
     else:
-        logger.info("No known rescoring engine specified. Skipping rescoring.")
+        rescoring_succeeded = True
+        _log_id_psms_after(psm_list, id_psms_before)
 
-    _log_id_psms_after(psm_list, id_psms_before)
+    # Workaround for broken PEP calculation if best PSM is decoy
+    if all(psm_list["pep"] == 1.0):
+        psm_list = _fix_constant_pep(psm_list)
 
     # Write output
     logger.info(f"Writing output to {output_file_root}.psms.tsv...")
     psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
 
     # Write report
-    if config["write_report"]:
+    if config["write_report"] and rescoring_succeeded:
         try:
             generate.generate_report(
                 output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
@@ -231,3 +243,44 @@ def _log_id_psms_after(psm_list, id_psms_before):
     logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
 
     return id_psms_after
+
+
+def _fix_constant_pep(psm_list):
+    """Workaround for broken PEP calculation if best PSM is decoy."""
+    logger.warning(
+        "Attempting to fix constant PEP values by removing decoy PSMs that score higher than the "
+        "best target PSM."
+    )
+    max_target_score = psm_list["score"][~psm_list["is_decoy"]].max()
+    higher_scoring_decoys = psm_list["is_decoy"] & (psm_list["score"] > max_target_score)
+
+    if not higher_scoring_decoys.any():
+        logger.warning("No decoys scoring higher than the best target found. Skipping fix.")
+    else:
+        logger.warning(f"Removing {higher_scoring_decoys.sum()} decoy PSMs.")
+
+        psm_list = psm_list[~higher_scoring_decoys]
+
+        # Minimal conversion to LinearPsmDataset
+        psm_df = psm_list.to_dataframe()
+        psm_df = psm_df.reset_index(drop=True).reset_index()
+        psm_df["peptide"] = (
+            psm_df["peptidoform"].astype(str).str.replace(r"(/\d+$)", "", n=1, regex=True)
+        )
+        psm_df["is_target"] = ~psm_df["is_decoy"]
+        lin_psm_data = LinearPsmDataset(
+            psms=psm_df[["index", "peptide", "score", "is_target"]],
+            target_column="is_target",
+            spectrum_columns="index",  # Use artificial index to allow multi-rank rescoring
+            peptide_column="peptide",
+            feature_columns=["score"],
+        )
+
+        # Recalculate confidence
+        new_confidence = lin_psm_data.assign_confidence()
+
+        # Add new confidence estimations to PSMList
+        add_psm_confidence(psm_list, new_confidence)
+        add_peptide_confidence(psm_list, new_confidence)
+
+        return psm_list
diff --git a/ms2rescore/exceptions.py b/ms2rescore/exceptions.py
index a2c828ba..ba5492a2 100644
--- a/ms2rescore/exceptions.py
+++ b/ms2rescore/exceptions.py
@@ -35,3 +35,9 @@ class ReportGenerationError(MS2RescoreError):
     """Error while generating report."""
 
     pass
+
+
+class RescoringError(MS2RescoreError):
+    """Error while rescoring PSMs."""
+
+    pass
diff --git a/ms2rescore/package_data/config_default.json b/ms2rescore/package_data/config_default.json
index 575724c2..765c1a00 100644
--- a/ms2rescore/package_data/config_default.json
+++ b/ms2rescore/package_data/config_default.json
@@ -14,6 +14,7 @@
         },
         "rescoring_engine": {
             "mokapot": {
+                "train_fdr": 0.01,
                 "write_weights": true,
                 "write_txt": true,
                 "write_flashlfq": true
diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
index e07290a2..dfed7cac 100644
--- a/ms2rescore/package_data/config_schema.json
+++ b/ms2rescore/package_data/config_schema.json
@@ -68,7 +68,11 @@
                 },
                 "psm_file": {
                     "description": "Path to file with peptide-spectrum matches.",
-                    "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
+                    "oneOf": [
+                        { "type": "string" },
+                        { "type": "null" },
+                        { "type": "array", "items": { "type": "string" } }
+                    ]
                 },
                 "psm_file_type": {
                     "description": "PSM file type. By default inferred from file extension.",
@@ -159,7 +163,7 @@
                     "default": false
                 },
                 "profile": {
-                    "description": "Write an txt report using cProfile for profiling",
+                    "description": "Write a txt report using cProfile for profiling",
                     "type": "boolean",
                     "default": false
                 }
@@ -263,6 +267,13 @@
             "type": "object",
             "additionalProperties": true,
             "properties": {
+                "train_fdr": {
+                    "description": "FDR threshold for training Mokapot",
+                    "type": "number",
+                    "minimum": 0,
+                    "maximum": 1,
+                    "default": 0.01
+                },
                 "write_weights": {
                     "description": "Write Mokapot weights to a text file",
                     "type": "boolean",
diff --git a/ms2rescore/report/charts.py b/ms2rescore/report/charts.py
index 8d802b8b..83132281 100644
--- a/ms2rescore/report/charts.py
+++ b/ms2rescore/report/charts.py
@@ -198,6 +198,7 @@ def score_scatter_plot(
     after: mokapot.LinearConfidence,
     level: str = "psms",
     indexer: str = "index",
+    fdr_threshold: float = 0.01,
 ) -> go.Figure:
     """
     Plot PSM scores before and after rescoring.
@@ -242,12 +243,12 @@ def score_scatter_plot(
 
     # Get score thresholds
     score_threshold_before = (
-        ce_psms[ce_psms["mokapot q-value before"] <= 0.01]
+        ce_psms[ce_psms["mokapot q-value before"] <= fdr_threshold]
         .sort_values("mokapot q-value before", ascending=False)["mokapot score before"]
         .iloc[0]
     )
     score_threshold_after = (
-        ce_psms[ce_psms["mokapot q-value after"] <= 0.01]
+        ce_psms[ce_psms["mokapot q-value after"] <= fdr_threshold]
         .sort_values("mokapot q-value after", ascending=False)["mokapot score after"]
         .iloc[0]
     )
diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
index 7b54e2a9..86dc20da 100644
--- a/ms2rescore/rescoring_engines/mokapot.py
+++ b/ms2rescore/rescoring_engines/mokapot.py
@@ -29,8 +29,11 @@
 import psm_utils
 from mokapot.brew import brew
 from mokapot.dataset import LinearPsmDataset
+from mokapot.model import PercolatorModel
 from pyteomics.mass import nist_mass
 
+from ms2rescore.exceptions import RescoringError
+
 logger = logging.getLogger(__name__)
 logging.getLogger("numba").setLevel(logging.WARNING)
 
@@ -39,6 +42,7 @@ def rescore(
     psm_list: psm_utils.PSMList,
     output_file_root: str = "ms2rescore",
     fasta_file: Optional[str] = None,
+    train_fdr: float = 0.01,
     write_weights: bool = False,
     write_txt: bool = False,
     write_flashlfq: bool = False,
@@ -65,6 +69,8 @@ def rescore(
     fasta_file
         Path to FASTA file with protein sequences to use for protein inference. Defaults to
         ``None``.
+    train_fdr
+        FDR to use for training the Mokapot model. Defaults to ``0.01``.
     write_weights
         Write model weights to a text file. Defaults to ``False``.
     write_txt
@@ -91,46 +97,15 @@ def rescore(
 
     # Rescore
     logger.debug(f"Mokapot brew options: `{kwargs}`")
-    confidence_results, models = brew(lin_psm_data, rng=8, **kwargs)
-
-    # Reshape confidence estimates to match PSMList
-    keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
-    mokapot_values_targets = (
-        confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
-    )
-    mokapot_values_decoys = (
-        confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
-    )
-    q = np.full((len(psm_list), 3), np.nan)
-    q[mokapot_values_targets.index] = mokapot_values_targets.values
-    q[mokapot_values_decoys.index] = mokapot_values_decoys.values
-
-    # Add Mokapot results to PSMList
-    psm_list["score"] = q[:, 0]
-    psm_list["qvalue"] = q[:, 1]
-    psm_list["pep"] = q[:, 2]
-
-    # Repeat for peptide-level scores
-    peptides_targets = confidence_results.confidence_estimates["peptides"].set_index(["peptide"])[
-        keys
-    ]
-    peptides_decoys = confidence_results.decoy_confidence_estimates["peptides"].set_index(
-        ["peptide"]
-    )[keys]
-    peptide_info = pd.concat([peptides_targets, peptides_decoys], axis=0).to_dict(orient="index")
-
-    # Add peptide-level scores to PSM metadata
-    # run_key = "na" if not all(psm.run for psm in psm_list) else None
-    no_charge_pattern = re.compile(r"(/\d+$)")
-    for psm in psm_list:
-        peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
-        psm.metadata.update(
-            {
-                "peptide_score": peptide_scores["mokapot score"],
-                "peptide_qvalue": peptide_scores["mokapot q-value"],
-                "peptide_pep": peptide_scores["mokapot PEP"],
-            }
+    try:
+        confidence_results, models = brew(
+            lin_psm_data, model=PercolatorModel(train_fdr=train_fdr), rng=8, **kwargs
         )
+    except RuntimeError as e:
+        raise RescoringError("Mokapot could not be run. Please check the input data.") from e
+
+    add_psm_confidence(psm_list, confidence_results)
+    add_peptide_confidence(psm_list, confidence_results)
 
     # Write results
     if write_weights:
@@ -245,6 +220,55 @@ def save_model_weights(
     )
 
 
+def add_psm_confidence(
+    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+    """Add Mokapot PSM-level confidence estimates to PSM list."""
+    # Reshape confidence estimates to match PSMList
+    keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+    mokapot_values_targets = (
+        confidence_results.confidence_estimates["psms"].set_index("index").sort_index()[keys]
+    )
+    mokapot_values_decoys = (
+        confidence_results.decoy_confidence_estimates["psms"].set_index("index").sort_index()[keys]
+    )
+    q = np.full((len(psm_list), 3), np.nan)
+    q[mokapot_values_targets.index] = mokapot_values_targets.values
+    q[mokapot_values_decoys.index] = mokapot_values_decoys.values
+
+    # Add Mokapot results to PSMList
+    psm_list["score"] = q[:, 0]
+    psm_list["qvalue"] = q[:, 1]
+    psm_list["pep"] = q[:, 2]
+
+
+def add_peptide_confidence(
+    psm_list: psm_utils.PSMList, confidence_results: mokapot.confidence.Confidence
+) -> None:
+    """Add Mokapot peptide-level confidence estimates to PSM list."""
+    keys = ["mokapot score", "mokapot q-value", "mokapot PEP"]
+    peptide_info = pd.concat(
+        [
+            confidence_results.confidence_estimates["peptides"].set_index("peptide")[keys],
+            confidence_results.decoy_confidence_estimates["peptides"].set_index("peptide")[keys],
+        ],
+        axis=0,
+    ).to_dict(orient="index")
+
+    # Add peptide-level scores to PSM metadata
+    # run_key = "na" if not all(psm.run for psm in psm_list) else None
+    no_charge_pattern = re.compile(r"(/\d+$)")
+    for psm in psm_list:
+        peptide_scores = peptide_info[(no_charge_pattern.sub("", str(psm.peptidoform), 1))]
+        psm.metadata.update(
+            {
+                "peptide_score": peptide_scores["mokapot score"],
+                "peptide_qvalue": peptide_scores["mokapot q-value"],
+                "peptide_pep": peptide_scores["mokapot PEP"],
+            }
+        )
+
+
 def _mz_to_mass(mz: float, charge: int) -> float:
     """Convert m/z to mass."""
     return mz * charge - charge * nist_mass["H"][1][0]