From e16a3b102a9c920c1d0d3db54e33c185c8ec7f87 Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Thu, 12 Oct 2023 15:37:17 +0200
Subject: [PATCH] PR review

---
 README.md                               |   6 +-
 ms2rescore/config_parser.py             |   2 +-
 ms2rescore/core.py                      |  28 ++---
 ms2rescore/feature_generators/deeplc.py | 108 +++++++++---------
 ms2rescore/gui/__main__.py              |   1 +
 ms2rescore/gui/app.py                   |  54 ++++-----
 ms2rescore/gui/function2ctk.py          |   9 +-
 ms2rescore/parse_psms.py                |  15 ++-
 ms2rescore/parse_spectra.py             | 146 +++++++++++-------------
 ms2rescore/rescoring_engines/mokapot.py |   5 +-
 ms2rescore/utils.py                     |   2 +-
 11 files changed, 177 insertions(+), 199 deletions(-)
diff --git a/README.md b/README.md
index 2adfdb17..991bb436 100644
--- a/README.md
+++ b/README.md
@@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti
 files:
 
 - [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv`
+- [Sage](https://github.com/lazear/sage) `.sage.tsv`
 - [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid`
 - [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid`
+- [Mascot](https://www.matrixscience.com/) `.mzid`
 - [MaxQuant](https://www.maxquant.org/) `msms.txt`
 - [X!Tandem](https://www.thegpm.org/tandem/) `.xml`
 - [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid`
@@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to
 
 > **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.**
 > Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels.
-> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) > <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
+> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
 
 **Original publication describing the concept of rescoring with predicted spectra:**
 
 > **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.**
 > Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve.
-> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) > <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
+> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
 
 To replicate the experiments described in this article, check out the
 [publication branch][publication-branch] of the repository.
diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py
index 0d254fd3..ced80c28 100644
--- a/ms2rescore/config_parser.py
+++ b/ms2rescore/config_parser.py
@@ -69,7 +69,7 @@ def _validate_filenames(config: Dict) -> Dict:
         config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
     )
 
-    # Parse config_file as posix path #TODO: Is this necessary?
+    # Parse config_file as posix path to avoid a mix of forward and backward slashes
     if config["ms2rescore"]["config_file"]:
         config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()
 
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index b27d7f50..aee29c36 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -8,9 +8,9 @@
 
 from ms2rescore.feature_generators import FEATURE_GENERATORS
 from ms2rescore.parse_psms import parse_psms
+from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore.parse_spectra import get_missing_values
 
 logger = logging.getLogger(__name__)
 
@@ -27,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
 
     """
-    config = configuration["ms2rescore"]  # TODO: Remove top-level key?
+    config = configuration["ms2rescore"]
     output_file_root = config["output_path"]
 
     # Write full configuration including defaults to file
@@ -54,23 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
 
-    if ("deeplc" in config["feature_generators"] and None in psm_list["retention_time"]) or (
-        "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
-    ):
-        logger.warning(
-            "One or more PSMs are missing retention time and/or ion mobility values. These will be "
-            "parsed from the spectrum file."
-        )
-        get_missing_values(
-            config,
-            psm_list,
-            missing_rt_values=(
-                "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
-            ),
-            missing_im_values=(
-                "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
-            ),
-        )
+    # TODO: avoid hard coding feature generators in some way
+    rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
+    im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
+    if rt_required or im_required:
+        logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+        get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
         # TODO: Handle this somewhere else, more generally?
diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
index cb6f6a40..50b577ff 100644
--- a/ms2rescore/feature_generators/deeplc.py
+++ b/ms2rescore/feature_generators/deeplc.py
@@ -28,9 +28,7 @@
 from psm_utils import PSMList
 from psm_utils.io import peptide_record
 
-from ms2rescore.exceptions import MS2RescoreError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
-from ms2rescore.utils import infer_spectrum_path
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 logger = logging.getLogger(__name__)
@@ -146,59 +144,65 @@ def add_features(self, psm_list: PSMList) -> None:
                     f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
                 )
 
-                psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-
-                psm_list_calibration = self._get_calibration_psms(psm_list_run)
-
-                logger.debug("Calibrating DeepLC")
-                self.deeplc_predictor = self.DeepLC(
-                    n_jobs=self.processes,
-                    verbose=self._verbose,
-                    path_model=self.selected_model or self.user_model,
-                    **self.deeplc_kwargs,
-                )
-                self.deeplc_predictor.calibrate_preds(
-                    seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
-                )
-                # Still calibrate for each run, but do not try out all model options.
-                # Just use model that was selected based on first run
-                if not self.selected_model:
-                    self.selected_model = list(self.deeplc_predictor.model.keys())
-                    self.deeplc_kwargs["deeplc_retrain"] = False
-                    logger.debug(
-                        f"Selected DeepLC model {self.selected_model} based on "
-                        "calibration of first run. Using this model (after new "
-                        "calibrations) for the remaining runs."
+                # Disable wild logging to stdout by Tensorflow, unless in debug mode
+                with contextlib.redirect_stdout(
+                    open(os.devnull, "w")
+                ) if not self._verbose else contextlib.nullcontext():
+                    # Make new PSM list for this run (chain PSMs per spectrum to flat list)
+                    psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+
+                    logger.debug("Calibrating DeepLC...")
+                    psm_list_calibration = self._get_calibration_psms(psm_list_run)
+                    self.deeplc_predictor = self.DeepLC(
+                        n_jobs=self.processes,
+                        verbose=self._verbose,
+                        path_model=self.selected_model or self.user_model,
+                        **self.deeplc_kwargs,
                     )
-
-                predictions = np.array(
-                    self.deeplc_predictor.make_preds(
-                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
-                    )
-                )
-                observations = psm_list_run["retention_time"]
-                rt_diffs_run = np.abs(predictions - observations)
-
-                for i, psm in enumerate(psm_list_run):
-                    psm["rescoring_features"].update(
-                        {
-                            "observed_retention_time": observations[i],
-                            "predicted_retention_time": predictions[i],
-                            "rt_diff": rt_diffs_run[i],
-                        }
+                    self.deeplc_predictor.calibrate_preds(
+                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
                     )
-                    peptide = psm.peptidoform.proforma.split("\\")[0]  # remove charge
-                    if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
-                        peptide_rt_diff_dict[peptide] = {
-                            "observed_retention_time_best": observations[i],
-                            "predicted_retention_time_best": predictions[i],
-                            "rt_diff_best": rt_diffs_run[i],
-                        }
-                for psm in psm_list_run:
-                    psm["rescoring_features"].update(
-                        peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
+                    # Still calibrate for each run, but do not try out all model options.
+                    # Just use model that was selected based on first run
+                    if not self.selected_model:
+                        self.selected_model = list(self.deeplc_predictor.model.keys())
+                        self.deeplc_kwargs["deeplc_retrain"] = False
+                        logger.debug(
+                            f"Selected DeepLC model {self.selected_model} based on "
+                            "calibration of first run. Using this model (after new "
+                            "calibrations) for the remaining runs."
+                        )
+
+                    logger.debug("Predicting retention times...")
+                    predictions = np.array(
+                        self.deeplc_predictor.make_preds(
+                            seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
+                        )
                     )
-            current_run += 1
+                    observations = psm_list_run["retention_time"]
+                    rt_diffs_run = np.abs(predictions - observations)
+
+                    logger.debug("Adding features to PSMs...")
+                    for i, psm in enumerate(psm_list_run):
+                        psm["rescoring_features"].update(
+                            {
+                                "observed_retention_time": observations[i],
+                                "predicted_retention_time": predictions[i],
+                                "rt_diff": rt_diffs_run[i],
+                            }
+                        )
+                        peptide = psm.peptidoform.proforma.split("\\")[0]  # remove charge
+                        if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
+                            peptide_rt_diff_dict[peptide] = {
+                                "observed_retention_time_best": observations[i],
+                                "predicted_retention_time_best": predictions[i],
+                                "rt_diff_best": rt_diffs_run[i],
+                            }
+                    for psm in psm_list_run:
+                        psm["rescoring_features"].update(
+                            peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
+                        )
+                current_run += 1
 
     # TODO: Remove when DeepLC supports PSMList directly
     @staticmethod
diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py
index 03ccd0e8..429e117f 100644
--- a/ms2rescore/gui/__main__.py
+++ b/ms2rescore/gui/__main__.py
@@ -10,6 +10,7 @@
 def main():
     """Entrypoint for MS²Rescore GUI."""
     multiprocessing.freeze_support()
+    # Redirect stdout when running GUI (packaged app might not have console attached)
     with contextlib.redirect_stdout(open(os.devnull, "w")):
         app()
 
diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py
index c322a77e..3196659e 100644
--- a/ms2rescore/gui/app.py
+++ b/ms2rescore/gui/app.py
@@ -8,9 +8,9 @@
 import webbrowser
 from pathlib import Path
 from typing import Dict, List, Tuple
-from joblib import parallel_backend
 
 import customtkinter as ctk
+from joblib import parallel_backend
 from ms2pip.constants import MODELS as ms2pip_models
 from PIL import Image
 from psm_utils.io import FILETYPES
@@ -41,6 +41,8 @@
     pass
 
 ctk.set_default_color_theme(_THEME_FILE)
+
+# TODO Does this disable multiprocessing everywhere?
 parallel_backend("threading")
 
 
@@ -163,18 +165,11 @@ def get(self):
         main_config = self.main_config.get()
         advanced_config = self.advanced_config.get()
 
-        # TODO Move to rescoring engine config
-        percolator_config = {"init-weights": advanced_config.pop("weightsfile")}
-
         config = {"ms2rescore": main_config}
         config["ms2rescore"].update(advanced_config)
         config["ms2rescore"]["feature_generators"] = self.fgen_config.get()
         config["ms2rescore"]["rescoring_engine"] = self.rescoring_engine_config.get()
 
-        # TODO See above
-        if "percolator" in config["ms2rescore"]["rescoring_engine"]:
-            config["ms2rescore"]["rescoring_engine"]["percolator"] = percolator_config
-
         args = (config,)  # Comma required to wrap in tuple
         kwargs = {}
 
@@ -284,11 +279,13 @@ def __init__(self, *args, **kwargs):
     def get(self) -> Dict:
         """Get the configured values as a dictionary."""
         try:
+            # Paths are split on spaces, so an individual file path cannot contain spaces
+            # TODO: Fix this in widgets.LabeledFileSelect
             psm_files = self.psm_file.get().split(" ")
         except AttributeError:
             raise MS2RescoreConfigurationError("No PSM file provided. Please select a file.")
         return {
-            "psm_file": psm_files,  # there cannot be spaces in the file path
+            "psm_file": psm_files,
             "psm_file_type": self.psm_file_type.get(),
         }
 
@@ -321,11 +318,6 @@ def __init__(self, *args, **kwargs):
         self.spectrum_id_pattern = widgets.LabeledEntry(self, label="Spectrum ID regex pattern")
         self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")
 
-        self.weightsfile = widgets.LabeledFileSelect(
-            self, label="Pretrained Percolator weights", file_option="openfile"
-        )
-        self.weightsfile.grid(row=6, column=0, columnspan=2, sticky="nsew")
-
         self.file_prefix = widgets.LabeledFileSelect(
             self, label="Filename for output files", file_option="savefile"
         )
@@ -344,7 +336,6 @@ def get(self) -> Dict:
             "id_decoy_pattern": self.id_decoy_pattern.get(),
             "psm_id_pattern": self.psm_id_pattern.get(),
             "spectrum_id_pattern": self.spectrum_id_pattern.get(),
-            "weightsfile": self.weightsfile.get(),
             "output_path": self.file_prefix.get(),
             "config_file": self.config_file.get(),
             "write_report": self.generate_report.get(),
@@ -466,7 +457,7 @@ def __init__(self, *args, **kwargs):
 
         self.num_epochs = widgets.LabeledFloatSpinbox(
             self,
-            label="Number of epochs",
+            label="Number of transfer learning epochs",
             step_size=5,
             initial_value=20,
         )  # way to remove float in spinbox label?
@@ -569,25 +560,29 @@ def __init__(self, *args, **kwargs):
         self.configure(fg_color="transparent")
         self.grid_columnconfigure(0, weight=1)
 
-        self.title = widgets.Heading(self, text="Mokapot cofiguration")
+        self.title = widgets.Heading(self, text="Mokapot configuration")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
 
-        self.write_weights = widgets.LabeledSwitch(self, label="Write weightsfile", default=True)
+        self.write_weights = widgets.LabeledSwitch(
+            self, label="Write model weights to file", default=True
+        )
         self.write_weights.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
 
-        self.write_txt = widgets.LabeledSwitch(self, label="Write txt output file", default=True)
+        self.write_txt = widgets.LabeledSwitch(self, label="Write TXT output files", default=True)
         self.write_txt.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
 
-        self.write_flashlfq = widgets.LabeledSwitch(self, label="Write flashlfq", default=False)
+        self.write_flashlfq = widgets.LabeledSwitch(
+            self, label="Write file for FlashLFQ", default=False
+        )
         self.write_flashlfq.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
 
         self.protein_kwargs = widgets.TableInput(
             self,
-            label="mokapot protein kwargs",
+            label="`mokapot.read_fasta` options (see Mokapot documentation)",
             columns=2,
-            header_labels=["keyword", "value"],
+            header_labels=["Parameter", "Value"],
         )
-        self.protein_kwargs.grid(row=4, column=0, sticky="new")  # leave this in?
+        self.protein_kwargs.grid(row=4, column=0, sticky="nsew")
 
     def get(self) -> Dict:
         """Return the configuration as a dictionary."""
@@ -617,12 +612,17 @@ def __init__(self, *args, **kwargs):
         self.configure(fg_color="transparent")
         self.grid_columnconfigure(0, weight=1)
 
-        self.title = widgets.Heading(self, text="Percolator cofiguration")
+        self.title = widgets.Heading(self, text="Percolator configuration")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
 
+        self.weights_file = widgets.LabeledFileSelect(
+            self, label="Pretrained Percolator model weights", file_option="openfile"
+        )
+        self.weights_file.grid(row=1, column=0, columnspan=2, sticky="nsew")
+
     def get(self) -> Dict:
         """Return the configuration as a dictionary."""
-        config = {}
+        config = {"init-weights": self.weights_file.get()}
         return config
 
 
@@ -641,8 +641,8 @@ def app():
         function=function,
     )
     root.protocol("WM_DELETE_WINDOW", sys.exit)
-    root.geometry(f"{1250}x{700}")
-    root.minsize(1000, 700)
+    dpi = root.winfo_fpixels("1i")
+    root.geometry(f"{int(15*dpi)}x{int(10*dpi)}")
     root.title("MS²Rescore")
     root.wm_iconbitmap(os.path.join(str(_IMG_DIR), "program_icon.ico"))
 
diff --git a/ms2rescore/gui/function2ctk.py b/ms2rescore/gui/function2ctk.py
index 9ffe38bd..60bad120 100644
--- a/ms2rescore/gui/function2ctk.py
+++ b/ms2rescore/gui/function2ctk.py
@@ -10,7 +10,6 @@
 
 import customtkinter as ctk
 
-
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
@@ -51,13 +50,9 @@ def __init__(
 
         self.function = function
 
-        # # App config
-        self.geometry(f"{1250}x{700}")
-        self.minsize(1000, 700)
-
         # 2x3 grid, only logging column expands with window
-        self.grid_columnconfigure(0, weight=0, minsize=500)  # Left: Sidebar
-        self.grid_columnconfigure(1, weight=0, minsize=1000)  # Middle: Configuration
+        self.grid_columnconfigure(0, weight=0)  # Left: Sidebar
+        self.grid_columnconfigure(1, weight=2)  # Middle: Configuration
         self.grid_columnconfigure(2, weight=1)  # Right: Logging
         self.grid_rowconfigure(0, weight=1)
 
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 8ee47a7e..8cf1b19a 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -1,16 +1,17 @@
 import logging
 import re
 from typing import Dict, Union
+from itertools import chain
 
 import psm_utils.io
 from psm_utils import PSMList
 
-from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError
+from ms2rescore.exceptions import MS2RescoreConfigurationError
 
 logger = logging.getLogger(__name__)
 
 
-def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList:
+def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     """
     Parse PSMs and prepare for rescoring.
 
@@ -21,8 +22,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s
         top-level key).
     psm_list
         PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
-    output_file_root
-        Path to output file root (without file extension). #TODO doesn't get used?
 
     """
     # Read PSMs, find decoys, calculate q-values
@@ -61,12 +60,12 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s
 
 def _read_psms(config, psm_list):
     if isinstance(psm_list, PSMList):
-        logger.info("Reading PSMs...")
         return psm_list
     else:
+        logger.info("Reading PSMs from file...")
         current_file = 1
         total_files = len(config["psm_file"])
-        all_psms = []
+        psm_list_list = []
         for psm_file in config["psm_file"]:
             logger.info(
                 f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
@@ -86,10 +85,10 @@ def _read_psms(config, psm_list):
                     " for more information."
                 )
 
-            all_psms = all_psms + id_file_psm_list.psm_list
+            psm_list_list.append(id_file_psm_list)
             current_file += 1
 
-        return PSMList(psm_list=all_psms)
+        return PSMList(psm_list=list(chain.from_iterable(p.psm_list for p in psm_list_list)))
 
 
 def _find_decoys(config, psm_list):
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
index d10a436f..130d0d5c 100644
--- a/ms2rescore/parse_spectra.py
+++ b/ms2rescore/parse_spectra.py
@@ -2,53 +2,37 @@
 
 import logging
 import re
-from typing import Union, Tuple, Dict
+from itertools import chain
+from typing import Dict, Tuple
 
-from rich.progress import track
+from psm_utils import PSMList
 from pyteomics.mgf import MGF
 from pyteomics.mzml import MzML
-from itertools import chain
-from pathlib import Path
+from rich.progress import track
 
 from ms2rescore.exceptions import MS2RescoreError
 from ms2rescore.utils import infer_spectrum_path
-from psm_utils import PSMList
 
 logger = logging.getLogger(__name__)
 
 
-class ParseMGFError(MS2RescoreError):
-    """Error parsing MGF file."""
-
-    pass
-
-
-class ParsingError(MS2RescoreError):
-    """Error parsing retention time from spectrum file."""
-
-    pass
-
-
-def get_missing_values(config, psm_list, missing_rt_values=False, missing_im_values=False):
-    """Get missing features from spectrum file."""
-    logger.info("Parsing missing values from spectrum file.")
+def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
+    """Get missing RT/IM features from spectrum file."""
+    logger.debug("Extracting missing RT/IM values from spectrum file(s).")
 
     psm_dict = psm_list.get_psm_dict()
-    logger.debug(f"Extracting missing values from spectrum files.")
     for runs in psm_dict.values():
-        for run, psms in track(runs.items(), description="Parsing missing values"):
+        for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
             psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
             spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
-            if isinstance(spectrum_file, str):
-                spectrum_file = Path(spectrum_file)
 
             if spectrum_file.suffix.lower() == ".mzml":
                 rt_dict, im_dict = _parse_values_from_mzml(
-                    spectrum_file, config, run, missing_rt_values, missing_im_values
+                    spectrum_file, config, run, missing_rt, missing_im
                 )
             elif spectrum_file.suffix.lower() == ".mgf":
                 rt_dict, im_dict = _parse_values_from_mgf(
-                    spectrum_file, config, run, missing_rt_values, missing_im_values
+                    spectrum_file, config, run, missing_rt, missing_im
                 )
 
             for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
@@ -62,90 +46,92 @@ def get_missing_values(config, psm_list, missing_rt_values=False, missing_im_val
 
 
 def _parse_values_from_mgf(
-    spectrum_file, config, run, missing_rt_values, missing_im_values
+    spectrum_file, config, run, missing_rt, missing_im
 ) -> Tuple[Dict, Dict]:
-    """Parse retention time and/or ion mobility from MGF file."""
+    """
+    Parse retention time and/or ion mobility from an MGF file.
+
+    Notes
+    -----
+    - Extracting values (e.g., ion mobility) according to the Matrix Science documentation:
+      http://www.matrixscience.com/help/data_file_help.html
+
+    """
     rt_dict = {}
     im_dict = {}
 
+    spectrum_id_pattern = re.compile(
+        config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
+    )
+
     for spectrum in MGF(str(spectrum_file)):
-        if missing_rt_values:
+        matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
+        if missing_rt:
             try:
-                rt_dict[
-                    re.match(
-                        config["spectrum_id_pattern"]
-                        if config["spectrum_id_pattern"]
-                        else r"(.*)",
-                        spectrum["params"]["title"],
-                    ).group()
-                ] = float(spectrum["params"]["rtinseconds"])
+                rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
             except KeyError:
                 raise ParsingError(
-                    f"Could not parse retention time key `rtinsecondes` from spectrum file for run {run}."
-                    "Please make sure that the retention time key is present in the spectrum file."
+                    "Could not parse retention time (`rtinseconds`) from spectrum file for "
+                    f"run {run}. Please make sure that the retention time key is present in the "
+                    "spectrum file or disable the relevant feature generator."
                 )
-        if missing_im_values:
+        if missing_im:
             try:
-                im_dict[
-                    re.match(
-                        config["spectrum_id_pattern"]
-                        if config["spectrum_id_pattern"]
-                        else r"(.*)",
-                        spectrum["params"]["title"],
-                    ).group()
-                ] = float(
-                    spectrum["params"]["ion_mobility"]
-                )  # http://www.matrixscience.com/help/data_file_help.html
+                im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
             except KeyError:
                 raise ParsingError(
-                    f"Could not parse ion mobility key `ionmobility` from spectrum file for run {run}."
-                    "Please make sure that the ion mobility key is present in the spectrum file."
+                    "Could not parse ion mobility (`ion_mobility`) from spectrum file "
+                    f"for run {run}. Please make sure that the ion mobility key is present in the "
+                    "spectrum file or disable the relevant feature generator."
                 )
 
     return rt_dict, im_dict
 
 
 def _parse_values_from_mzml(
-    spectrum_file, config, run, missing_rt_values, missing_im_values
+    spectrum_file, config, run, missing_rt, missing_im
 ) -> Tuple[Dict, Dict]:
-    """Parse retention time and/or ion mobility from MGF file."""
+    """Parse retention time and/or ion mobility from an mzML file."""
     rt_dict = {}
     im_dict = {}
 
+    spectrum_id_pattern = re.compile(
+        config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
+    )
+
     for spectrum in MzML(str(spectrum_file)):
-        if missing_rt_values:
+        matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
+        if missing_rt:
             try:
-                rt_dict[
-                    re.match(
-                        config["spectrum_id_pattern"]
-                        if config["spectrum_id_pattern"]
-                        else r"(.*)",
-                        spectrum["id"],
-                    ).group()
-                ] = float(
-                    spectrum["scanList"]["scan"][0][
-                        "scan start time"
-                    ]  # is rt in minutes by default?
-                )
+                rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
             except KeyError:
                 raise ParsingError(
-                    f"Could not parse retention time key `scan start time` from spectrum file for run {run}."
-                    "Please make sure that the retention time key is present in the spectrum file."
+                    "Could not parse retention time (`scan start time`) from spectrum file for "
+                    f"run {run}. Please make sure that the retention time key is present in the "
+                    "spectrum file or disable the relevant feature generator."
                 )
-        if missing_im_values:
+        if missing_im:
             try:
-                im_dict[
-                    re.match(
-                        config["spectrum_id_pattern"]
-                        if config["spectrum_id_pattern"]
-                        else r"(.*)",
-                        spectrum["id"],
-                    ).group()
-                ] = float(spectrum["scanList"]["scan"][0]["reverse ion mobility"])
+                im_dict[matched_id] = float(
+                    spectrum["scanList"]["scan"][0]["reverse ion mobility"]
+                )
             except KeyError:
                 raise ParsingError(
-                    f"Could not parse ion mobility key `reverse ion mobility` from spectrum file for run {run}."
-                    "Please make sure that the ion mobility key is present in the spectrum file."
+                    "Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
+                    f"for run {run}. Please make sure that the ion mobility key is present in the "
+                    "spectrum file or disable the relevant feature generator."
                 )
 
     return rt_dict, im_dict
+
+
+class ParseMGFError(MS2RescoreError):
+    """Error parsing MGF file."""
+
+    pass
+
+
+class ParsingError(MS2RescoreError):
+    """Error parsing retention time or ion mobility from spectrum file."""
+
+    pass
diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
index 58cea069..f3927f47 100644
--- a/ms2rescore/rescoring_engines/mokapot.py
+++ b/ms2rescore/rescoring_engines/mokapot.py
@@ -84,11 +84,11 @@ def rescore(
 
     # Add proteins
     if fasta_file:
-        logger.debug(f"Mokapot read fasta keyword arguments : {protein_kwargs}")
+        logger.debug(f"Adding protein info from {fasta_file} with options: `{protein_kwargs}`")
         lin_psm_data.add_proteins(fasta_file, **protein_kwargs)
 
     # Rescore
-    logger.debug(f"Mokapot brew keyword arguments : {kwargs}")
+    logger.debug(f"Mokapot brew options: `{kwargs}`")
     confidence_results, models = brew(lin_psm_data, **kwargs)
 
     # Reshape confidence estimates to match PSMList
@@ -124,6 +124,7 @@ def rescore(
     if write_txt:
         confidence_results.to_txt(file_root=output_file_root, decoys=True)
     if write_flashlfq:
+        # TODO: How do we validate that the RTs are in minutes?
         confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60
         confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt")
 
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 26c88a7e..70417b56 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -75,4 +75,4 @@ def infer_spectrum_path(
                 "files."
             )
 
-    return resolved_path
+    return Path(resolved_path)