Skip to content

Commit

Permalink
PR review
Browse files Browse the repository at this point in the history
  • Loading branch information
RalfG committed Oct 12, 2023
1 parent 28b4847 commit e16a3b1
Show file tree
Hide file tree
Showing 11 changed files with 177 additions and 199 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti
files:

- [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv`
- [Sage](https://github.com/lazear/sage) `.sage.tsv`
- [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid`
- [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid`
- [Mascot](https://www.matrixscience.com/) `.mzid`
- [MaxQuant](https://www.maxquant.org/) `msms.txt`
- [X!Tandem](https://www.thegpm.org/tandem/) `.xml`
- [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid`
Expand All @@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to

> **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.**
> Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels.
> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) > <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
> _Molecular & Cellular Proteomics_ (2022) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
**Original publication describing the concept of rescoring with predicted spectra:**

> **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.**
> Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve.
> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) > <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
To replicate the experiments described in this article, check out the
[publication branch][publication-branch] of the repository.
Expand Down
2 changes: 1 addition & 1 deletion ms2rescore/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def _validate_filenames(config: Dict) -> Dict:
config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
)

# Parse config_file as posix path #TODO: Is this necessary?
# Parse config_file as posix path to avoid combination of forward and backward slashes
if config["ms2rescore"]["config_file"]:
config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()

Expand Down
28 changes: 9 additions & 19 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@

from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore.parse_spectra import get_missing_values

logger = logging.getLogger(__name__)

Expand All @@ -27,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
"""
config = configuration["ms2rescore"] # TODO: Remove top-level key?
config = configuration["ms2rescore"]
output_file_root = config["output_path"]

# Write full configuration including defaults to file
Expand All @@ -54,23 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)

if ("deeplc" in config["feature_generators"] and None in psm_list["retention_time"]) or (
"ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
):
logger.warning(
"One or more PSMs are missing retention time and/or ion mobility values. These will be "
"parsed from the spectrum file."
)
get_missing_values(
config,
psm_list,
missing_rt_values=(
"deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
),
missing_im_values=(
"ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
),
)
# TODO: avoid hard coding feature generators in some way
rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally?
Expand Down
108 changes: 56 additions & 52 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@
from psm_utils import PSMList
from psm_utils.io import peptide_record

from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.utils import infer_spectrum_path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -146,59 +144,65 @@ def add_features(self, psm_list: PSMList) -> None:
f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)

psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

psm_list_calibration = self._get_calibration_psms(psm_list_run)

logger.debug("Calibrating DeepLC")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)
self.deeplc_predictor.calibrate_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.selected_model:
self.selected_model = list(self.deeplc_predictor.model.keys())
self.deeplc_kwargs["deeplc_retrain"] = False
logger.debug(
f"Selected DeepLC model {self.selected_model} based on "
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
# Disable wild logging to stdout by Tensorflow, unless in debug mode
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

logger.debug("Calibrating DeepLC...")
psm_list_calibration = self._get_calibration_psms(psm_list_run)
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)

predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
)
)
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)

for i, psm in enumerate(psm_list_run):
psm["rescoring_features"].update(
{
"observed_retention_time": observations[i],
"predicted_retention_time": predictions[i],
"rt_diff": rt_diffs_run[i],
}
self.deeplc_predictor.calibrate_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
)
peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge
if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
peptide_rt_diff_dict[peptide] = {
"observed_retention_time_best": observations[i],
"predicted_retention_time_best": predictions[i],
"rt_diff_best": rt_diffs_run[i],
}
for psm in psm_list_run:
psm["rescoring_features"].update(
peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.selected_model:
self.selected_model = list(self.deeplc_predictor.model.keys())
self.deeplc_kwargs["deeplc_retrain"] = False
logger.debug(
f"Selected DeepLC model {self.selected_model} based on "
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
)

logger.debug("Predicting retention times...")
predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
)
)
current_run += 1
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)

logger.debug("Adding features to PSMs...")
for i, psm in enumerate(psm_list_run):
psm["rescoring_features"].update(
{
"observed_retention_time": observations[i],
"predicted_retention_time": predictions[i],
"rt_diff": rt_diffs_run[i],
}
)
peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge
if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
peptide_rt_diff_dict[peptide] = {
"observed_retention_time_best": observations[i],
"predicted_retention_time_best": predictions[i],
"rt_diff_best": rt_diffs_run[i],
}
for psm in psm_list_run:
psm["rescoring_features"].update(
peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
)
current_run += 1

# TODO: Remove when DeepLC supports PSMList directly
@staticmethod
Expand Down
1 change: 1 addition & 0 deletions ms2rescore/gui/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
def main():
"""Entrypoint for MS²Rescore GUI."""
multiprocessing.freeze_support()
# Redirect stdout when running GUI (packaged app might not have console attached)
with contextlib.redirect_stdout(open(os.devnull, "w")):
app()

Expand Down
54 changes: 27 additions & 27 deletions ms2rescore/gui/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import webbrowser
from pathlib import Path
from typing import Dict, List, Tuple
from joblib import parallel_backend

import customtkinter as ctk
from joblib import parallel_backend
from ms2pip.constants import MODELS as ms2pip_models
from PIL import Image
from psm_utils.io import FILETYPES
Expand Down Expand Up @@ -41,6 +41,8 @@
pass

ctk.set_default_color_theme(_THEME_FILE)

# TODO Does this disable multiprocessing everywhere?
parallel_backend("threading")


Expand Down Expand Up @@ -163,18 +165,11 @@ def get(self):
main_config = self.main_config.get()
advanced_config = self.advanced_config.get()

# TODO Move to rescoring engine config
percolator_config = {"init-weights": advanced_config.pop("weightsfile")}

config = {"ms2rescore": main_config}
config["ms2rescore"].update(advanced_config)
config["ms2rescore"]["feature_generators"] = self.fgen_config.get()
config["ms2rescore"]["rescoring_engine"] = self.rescoring_engine_config.get()

# TODO See above
if "percolator" in config["ms2rescore"]["rescoring_engine"]:
config["ms2rescore"]["rescoring_engine"]["percolator"] = percolator_config

args = (config,) # Comma required to wrap in tuple
kwargs = {}

Expand Down Expand Up @@ -284,11 +279,13 @@ def __init__(self, *args, **kwargs):
def get(self) -> Dict:
"""Get the configured values as a dictionary."""
try:
# The selected value is split on spaces below, so individual file paths cannot contain spaces
# TODO: Fix this in widgets.LabeledFileSelect
psm_files = self.psm_file.get().split(" ")
except AttributeError:
raise MS2RescoreConfigurationError("No PSM file provided. Please select a file.")
return {
"psm_file": psm_files, # there cannot be spaces in the file path
"psm_file": psm_files,
"psm_file_type": self.psm_file_type.get(),
}

Expand Down Expand Up @@ -321,11 +318,6 @@ def __init__(self, *args, **kwargs):
self.spectrum_id_pattern = widgets.LabeledEntry(self, label="Spectrum ID regex pattern")
self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")

self.weightsfile = widgets.LabeledFileSelect(
self, label="Pretrained Percolator weights", file_option="openfile"
)
self.weightsfile.grid(row=6, column=0, columnspan=2, sticky="nsew")

self.file_prefix = widgets.LabeledFileSelect(
self, label="Filename for output files", file_option="savefile"
)
Expand All @@ -344,7 +336,6 @@ def get(self) -> Dict:
"id_decoy_pattern": self.id_decoy_pattern.get(),
"psm_id_pattern": self.psm_id_pattern.get(),
"spectrum_id_pattern": self.spectrum_id_pattern.get(),
"weightsfile": self.weightsfile.get(),
"output_path": self.file_prefix.get(),
"config_file": self.config_file.get(),
"write_report": self.generate_report.get(),
Expand Down Expand Up @@ -466,7 +457,7 @@ def __init__(self, *args, **kwargs):

self.num_epochs = widgets.LabeledFloatSpinbox(
self,
label="Number of epochs",
label="Number of transfer learning epochs",
step_size=5,
initial_value=20,
) # way to remove float in spinbox label?
Expand Down Expand Up @@ -569,25 +560,29 @@ def __init__(self, *args, **kwargs):
self.configure(fg_color="transparent")
self.grid_columnconfigure(0, weight=1)

self.title = widgets.Heading(self, text="Mokapot cofiguration")
self.title = widgets.Heading(self, text="Mokapot coffeeguration")
self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")

self.write_weights = widgets.LabeledSwitch(self, label="Write weightsfile", default=True)
self.write_weights = widgets.LabeledSwitch(
self, label="Write model weights to file", default=True
)
self.write_weights.grid(row=1, column=0, pady=(0, 10), sticky="nsew")

self.write_txt = widgets.LabeledSwitch(self, label="Write txt output file", default=True)
self.write_txt = widgets.LabeledSwitch(self, label="Write TXT output files", default=True)
self.write_txt.grid(row=2, column=0, pady=(0, 10), sticky="nsew")

self.write_flashlfq = widgets.LabeledSwitch(self, label="Write flashlfq", default=False)
self.write_flashlfq = widgets.LabeledSwitch(
self, label="Write file for FlashLFQ", default=False
)
self.write_flashlfq.grid(row=3, column=0, pady=(0, 10), sticky="nsew")

self.protein_kwargs = widgets.TableInput(
self,
label="mokapot protein kwargs",
label="`mokapot.read_fasta` options (see Mokapot documentation)",
columns=2,
header_labels=["keyword", "value"],
header_labels=["Parameter", "Value"],
)
self.protein_kwargs.grid(row=4, column=0, sticky="new") # leave this in?
self.protein_kwargs.grid(row=4, column=0, sticky="nsew")

def get(self) -> Dict:
"""Return the configuration as a dictionary."""
Expand Down Expand Up @@ -617,12 +612,17 @@ def __init__(self, *args, **kwargs):
self.configure(fg_color="transparent")
self.grid_columnconfigure(0, weight=1)

self.title = widgets.Heading(self, text="Percolator cofiguration")
self.title = widgets.Heading(self, text="Percolator configuration")
self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")

self.weights_file = widgets.LabeledFileSelect(
self, label="Pretrained Percolator model weights", file_option="openfile"
)
self.weights_file.grid(row=1, column=0, columnspan=2, sticky="nsew")

def get(self) -> Dict:
"""Return the configuration as a dictionary."""
config = {}
config = {"init-weights": self.weights_file.get()}
return config


Expand All @@ -641,8 +641,8 @@ def app():
function=function,
)
root.protocol("WM_DELETE_WINDOW", sys.exit)
root.geometry(f"{1250}x{700}")
root.minsize(1000, 700)
dpi = root.winfo_fpixels("1i")
root.geometry(f"{int(15*dpi)}x{int(10*dpi)}")
root.title("MS²Rescore")
root.wm_iconbitmap(os.path.join(str(_IMG_DIR), "program_icon.ico"))

Expand Down
9 changes: 2 additions & 7 deletions ms2rescore/gui/function2ctk.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import customtkinter as ctk


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

Expand Down Expand Up @@ -51,13 +50,9 @@ def __init__(

self.function = function

# # App config
self.geometry(f"{1250}x{700}")
self.minsize(1000, 700)

# 2x3 grid, only logging column expands with window
self.grid_columnconfigure(0, weight=0, minsize=500) # Left: Sidebar
self.grid_columnconfigure(1, weight=0, minsize=1000) # Middle: Configuration
self.grid_columnconfigure(0, weight=0) # Left: Sidebar
self.grid_columnconfigure(1, weight=2) # Middle: Configuration
self.grid_columnconfigure(2, weight=1) # Right: Logging
self.grid_rowconfigure(0, weight=1)

Expand Down
Loading

0 comments on commit e16a3b1

Please sign in to comment.