PR review

compomics · Oct 12, 2023 · e16a3b1 · e16a3b1
1 parent 28b4847
commit e16a3b1
Show file tree

Hide file tree

Showing 11 changed files with 177 additions and 199 deletions.
diff --git a/README.md b/README.md
@@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti
 files:
 
 - [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv`
+- [Sage](https://github.com/lazear/sage) `.sage.tsv`
 - [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid`
 - [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid`
+- [Mascot](https://www.matrixscience.com/) `.mzid`
 - [MaxQuant](https://www.maxquant.org/) `msms.txt`
 - [X!Tandem](https://www.thegpm.org/tandem/) `.xml`
 - [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid`
@@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to
 
 > **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.**
 > Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels.
-> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) > <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
+> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
 
 **Original publication describing the concept of rescoring with predicted spectra:**
 
 > **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.**
 > Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve.
-> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) > <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
+> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
 
 To replicate the experiments described in this article, check out the
 [publication branch][publication-branch] of the repository.

diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py
@@ -69,7 +69,7 @@ def _validate_filenames(config: Dict) -> Dict:
         config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
     )
 
-    # Parse config_file as posix path #TODO: Is this necessary?
+    # Convert config_file to a POSIX-style path to avoid mixing forward slashes and backslashes
     if config["ms2rescore"]["config_file"]:
         config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()
 

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
@@ -8,9 +8,9 @@
 
 from ms2rescore.feature_generators import FEATURE_GENERATORS
 from ms2rescore.parse_psms import parse_psms
+from ms2rescore.parse_spectra import get_missing_values
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
-from ms2rescore.parse_spectra import get_missing_values
 
 logger = logging.getLogger(__name__)
 
@@ -27,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
 
     """
-    config = configuration["ms2rescore"]  # TODO: Remove top-level key?
+    config = configuration["ms2rescore"]
     output_file_root = config["output_path"]
 
     # Write full configuration including defaults to file
@@ -54,23 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
 
-    if ("deeplc" in config["feature_generators"] and None in psm_list["retention_time"]) or (
-        "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
-    ):
-        logger.warning(
-            "One or more PSMs are missing retention time and/or ion mobility values. These will be "
-            "parsed from the spectrum file."
-        )
-        get_missing_values(
-            config,
-            psm_list,
-            missing_rt_values=(
-                "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
-            ),
-            missing_im_values=(
-                "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
-            ),
-        )
+    # TODO: avoid hard coding feature generators in some way
+    rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
+    im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
+    if rt_required or im_required:
+        logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+        get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
         # TODO: Handle this somewhere else, more generally?

diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
@@ -28,9 +28,7 @@
 from psm_utils import PSMList
 from psm_utils.io import peptide_record
 
-from ms2rescore.exceptions import MS2RescoreError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
-from ms2rescore.utils import infer_spectrum_path
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 logger = logging.getLogger(__name__)
@@ -146,59 +144,65 @@ def add_features(self, psm_list: PSMList) -> None:
                     f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
                 )
 
-                psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-
-                psm_list_calibration = self._get_calibration_psms(psm_list_run)
-
-                logger.debug("Calibrating DeepLC")
-                self.deeplc_predictor = self.DeepLC(
-                    n_jobs=self.processes,
-                    verbose=self._verbose,
-                    path_model=self.selected_model or self.user_model,
-                    **self.deeplc_kwargs,
-                )
-                self.deeplc_predictor.calibrate_preds(
-                    seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
-                )
-                # Still calibrate for each run, but do not try out all model options.
-                # Just use model that was selected based on first run
-                if not self.selected_model:
-                    self.selected_model = list(self.deeplc_predictor.model.keys())
-                    self.deeplc_kwargs["deeplc_retrain"] = False
-                    logger.debug(
-                        f"Selected DeepLC model {self.selected_model} based on "
-                        "calibration of first run. Using this model (after new "
-                        "calibrations) for the remaining runs."
+                # Suppress TensorFlow's noisy stdout output, unless verbose (debug mode)
+                with contextlib.redirect_stdout(
+                    open(os.devnull, "w")
+                ) if not self._verbose else contextlib.nullcontext():
+                    # Make new PSM list for this run (chain PSMs per spectrum to flat list)
+                    psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+
+                    logger.debug("Calibrating DeepLC...")
+                    psm_list_calibration = self._get_calibration_psms(psm_list_run)
+                    self.deeplc_predictor = self.DeepLC(
+                        n_jobs=self.processes,
+                        verbose=self._verbose,
+                        path_model=self.selected_model or self.user_model,
+                        **self.deeplc_kwargs,
                     )
-
-                predictions = np.array(
-                    self.deeplc_predictor.make_preds(
-                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
-                    )
-                )
-                observations = psm_list_run["retention_time"]
-                rt_diffs_run = np.abs(predictions - observations)
-
-                for i, psm in enumerate(psm_list_run):
-                    psm["rescoring_features"].update(
-                        {
-                            "observed_retention_time": observations[i],
-                            "predicted_retention_time": predictions[i],
-                            "rt_diff": rt_diffs_run[i],
-                        }
+                    self.deeplc_predictor.calibrate_preds(
+                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
                     )
-                    peptide = psm.peptidoform.proforma.split("\\")[0]  # remove charge
-                    if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
-                        peptide_rt_diff_dict[peptide] = {
-                            "observed_retention_time_best": observations[i],
-                            "predicted_retention_time_best": predictions[i],
-                            "rt_diff_best": rt_diffs_run[i],
-                        }
-                for psm in psm_list_run:
-                    psm["rescoring_features"].update(
-                        peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
+                    # Still calibrate for each run, but do not try out all model options.
+                    # Just use model that was selected based on first run
+                    if not self.selected_model:
+                        self.selected_model = list(self.deeplc_predictor.model.keys())
+                        self.deeplc_kwargs["deeplc_retrain"] = False
+                        logger.debug(
+                            f"Selected DeepLC model {self.selected_model} based on "
+                            "calibration of first run. Using this model (after new "
+                            "calibrations) for the remaining runs."
+                        )
+
+                    logger.debug("Predicting retention times...")
+                    predictions = np.array(
+                        self.deeplc_predictor.make_preds(
+                            seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
+                        )
                     )
-            current_run += 1
+                    observations = psm_list_run["retention_time"]
+                    rt_diffs_run = np.abs(predictions - observations)
+
+                    logger.debug("Adding features to PSMs...")
+                    for i, psm in enumerate(psm_list_run):
+                        psm["rescoring_features"].update(
+                            {
+                                "observed_retention_time": observations[i],
+                                "predicted_retention_time": predictions[i],
+                                "rt_diff": rt_diffs_run[i],
+                            }
+                        )
+                        peptide = psm.peptidoform.proforma.split("\\")[0]  # remove charge
+                        if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
+                            peptide_rt_diff_dict[peptide] = {
+                                "observed_retention_time_best": observations[i],
+                                "predicted_retention_time_best": predictions[i],
+                                "rt_diff_best": rt_diffs_run[i],
+                            }
+                    for psm in psm_list_run:
+                        psm["rescoring_features"].update(
+                            peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
+                        )
+                current_run += 1
 
     # TODO: Remove when DeepLC supports PSMList directly
     @staticmethod

diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py
@@ -10,6 +10,7 @@
 def main():
     """Entrypoint for MS²Rescore GUI."""
     multiprocessing.freeze_support()
+    # Redirect stdout when running GUI (packaged app might not have console attached)
     with contextlib.redirect_stdout(open(os.devnull, "w")):
         app()
 

diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py
@@ -8,9 +8,9 @@
 import webbrowser
 from pathlib import Path
 from typing import Dict, List, Tuple
-from joblib import parallel_backend
 
 import customtkinter as ctk
+from joblib import parallel_backend
 from ms2pip.constants import MODELS as ms2pip_models
 from PIL import Image
 from psm_utils.io import FILETYPES
@@ -41,6 +41,8 @@
     pass
 
 ctk.set_default_color_theme(_THEME_FILE)
+
+# TODO: Does setting the joblib backend to threading disable multiprocessing everywhere?
 parallel_backend("threading")
 
 
@@ -163,18 +165,11 @@ def get(self):
         main_config = self.main_config.get()
         advanced_config = self.advanced_config.get()
 
-        # TODO Move to rescoring engine config
-        percolator_config = {"init-weights": advanced_config.pop("weightsfile")}
-
         config = {"ms2rescore": main_config}
         config["ms2rescore"].update(advanced_config)
         config["ms2rescore"]["feature_generators"] = self.fgen_config.get()
         config["ms2rescore"]["rescoring_engine"] = self.rescoring_engine_config.get()
 
-        # TODO See above
-        if "percolator" in config["ms2rescore"]["rescoring_engine"]:
-            config["ms2rescore"]["rescoring_engine"]["percolator"] = percolator_config
-
         args = (config,)  # Comma required to wrap in tuple
         kwargs = {}
 
@@ -284,11 +279,13 @@ def __init__(self, *args, **kwargs):
     def get(self) -> Dict:
         """Get the configured values as a dictionary."""
         try:
+            # Paths are split on spaces below, so a file path itself cannot contain spaces
+            # TODO: Fix this in widgets.LabeledFileSelect
             psm_files = self.psm_file.get().split(" ")
         except AttributeError:
             raise MS2RescoreConfigurationError("No PSM file provided. Please select a file.")
         return {
-            "psm_file": psm_files,  # there cannot be spaces in the file path
+            "psm_file": psm_files,
             "psm_file_type": self.psm_file_type.get(),
         }
 
@@ -321,11 +318,6 @@ def __init__(self, *args, **kwargs):
         self.spectrum_id_pattern = widgets.LabeledEntry(self, label="Spectrum ID regex pattern")
         self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")
 
-        self.weightsfile = widgets.LabeledFileSelect(
-            self, label="Pretrained Percolator weights", file_option="openfile"
-        )
-        self.weightsfile.grid(row=6, column=0, columnspan=2, sticky="nsew")
-
         self.file_prefix = widgets.LabeledFileSelect(
             self, label="Filename for output files", file_option="savefile"
         )
@@ -344,7 +336,6 @@ def get(self) -> Dict:
             "id_decoy_pattern": self.id_decoy_pattern.get(),
             "psm_id_pattern": self.psm_id_pattern.get(),
             "spectrum_id_pattern": self.spectrum_id_pattern.get(),
-            "weightsfile": self.weightsfile.get(),
             "output_path": self.file_prefix.get(),
             "config_file": self.config_file.get(),
             "write_report": self.generate_report.get(),
@@ -466,7 +457,7 @@ def __init__(self, *args, **kwargs):
 
         self.num_epochs = widgets.LabeledFloatSpinbox(
             self,
-            label="Number of epochs",
+            label="Number of transfer learning epochs",
             step_size=5,
             initial_value=20,
         )  # way to remove float in spinbox label?
@@ -569,25 +560,29 @@ def __init__(self, *args, **kwargs):
         self.configure(fg_color="transparent")
         self.grid_columnconfigure(0, weight=1)
 
-        self.title = widgets.Heading(self, text="Mokapot cofiguration")
+        self.title = widgets.Heading(self, text="Mokapot coffeeguration")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
 
-        self.write_weights = widgets.LabeledSwitch(self, label="Write weightsfile", default=True)
+        self.write_weights = widgets.LabeledSwitch(
+            self, label="Write model weights to file", default=True
+        )
         self.write_weights.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
 
-        self.write_txt = widgets.LabeledSwitch(self, label="Write txt output file", default=True)
+        self.write_txt = widgets.LabeledSwitch(self, label="Write TXT output files", default=True)
         self.write_txt.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
 
-        self.write_flashlfq = widgets.LabeledSwitch(self, label="Write flashlfq", default=False)
+        self.write_flashlfq = widgets.LabeledSwitch(
+            self, label="Write file for FlashLFQ", default=False
+        )
         self.write_flashlfq.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
 
         self.protein_kwargs = widgets.TableInput(
             self,
-            label="mokapot protein kwargs",
+            label="`mokapot.read_fasta` options (see Mokapot documentation)",
             columns=2,
-            header_labels=["keyword", "value"],
+            header_labels=["Parameter", "Value"],
         )
-        self.protein_kwargs.grid(row=4, column=0, sticky="new")  # leave this in?
+        self.protein_kwargs.grid(row=4, column=0, sticky="nsew")
 
     def get(self) -> Dict:
         """Return the configuration as a dictionary."""
@@ -617,12 +612,17 @@ def __init__(self, *args, **kwargs):
         self.configure(fg_color="transparent")
         self.grid_columnconfigure(0, weight=1)
 
-        self.title = widgets.Heading(self, text="Percolator cofiguration")
+        self.title = widgets.Heading(self, text="Percolator configuration")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
 
+        self.weights_file = widgets.LabeledFileSelect(
+            self, label="Pretrained Percolator model weights", file_option="openfile"
+        )
+        self.weights_file.grid(row=1, column=0, columnspan=2, sticky="nsew")
+
     def get(self) -> Dict:
         """Return the configuration as a dictionary."""
-        config = {}
+        config = {"init-weights": self.weights_file.get()}
         return config
 
 
@@ -641,8 +641,8 @@ def app():
         function=function,
     )
     root.protocol("WM_DELETE_WINDOW", sys.exit)
-    root.geometry(f"{1250}x{700}")
-    root.minsize(1000, 700)
+    dpi = root.winfo_fpixels("1i")
+    root.geometry(f"{int(15*dpi)}x{int(10*dpi)}")
     root.title("MS²Rescore")
     root.wm_iconbitmap(os.path.join(str(_IMG_DIR), "program_icon.ico"))
 

diff --git a/ms2rescore/gui/function2ctk.py b/ms2rescore/gui/function2ctk.py
@@ -10,7 +10,6 @@
 
 import customtkinter as ctk
 
-
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 
@@ -51,13 +50,9 @@ def __init__(
 
         self.function = function
 
-        # # App config
-        self.geometry(f"{1250}x{700}")
-        self.minsize(1000, 700)
-
         # 2x3 grid, only logging column expands with window
-        self.grid_columnconfigure(0, weight=0, minsize=500)  # Left: Sidebar
-        self.grid_columnconfigure(1, weight=0, minsize=1000)  # Middle: Configuration
+        self.grid_columnconfigure(0, weight=0)  # Left: Sidebar
+        self.grid_columnconfigure(1, weight=2)  # Middle: Configuration
         self.grid_columnconfigure(2, weight=1)  # Right: Logging
         self.grid_rowconfigure(0, weight=1)