diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ae565344..4820d14e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -7,7 +7,7 @@ on:
pull_request:
jobs:
- test:
+ test-python-package:
runs-on: ubuntu-latest
strategy:
matrix:
@@ -42,3 +42,30 @@ jobs:
- name: Test installation
run: |
ms2rescore --help
+
+ test-windows-installer:
+ # Only run on push to main (e.g., after PR merge)
+ if: ${{ github.ref == 'refs/heads/main' }}
+ runs-on: windows-latest
+ steps:
+ - uses: actions/checkout@v2
+
+ - uses: actions/setup-python@v2
+ with:
+ python-version: "3.11"
+
+ - name: Install package and dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install . pyinstaller
+
+ - name: Install Inno Setup
+ uses: crazy-max/ghaction-chocolatey@v1
+ with:
+ args: install innosetup -y --allow-unofficial --force
+
+ - name: Run pyinstaller
+ run: pyinstaller ./ms2rescore.spec --clean --noconfirm
+
+ - name: Test built exe
+ run: dist/ms2rescore/ms2rescore.exe
diff --git a/README.md b/README.md
index 2adfdb17..991bb436 100644
--- a/README.md
+++ b/README.md
@@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti
files:
- [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv`
+- [Sage](https://github.com/lazear/sage) `.sage.tsv`
- [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid`
- [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid`
+- [Mascot](https://www.matrixscience.com/) `.mzid`
- [MaxQuant](https://www.maxquant.org/) `msms.txt`
- [X!Tandem](https://www.thegpm.org/tandem/) `.xml`
- [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid`
@@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to
> **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.**
> Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels.
-> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) >
+> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266)
**Original publication describing the concept of rescoring with predicted spectra:**
> **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.**
> Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve.
-> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) >
+> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383)
To replicate the experiments described in this article, check out the
[publication branch][publication-branch] of the repository.
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
index b7d12c32..2b523aa5 100644
--- a/docs/source/config_schema.md
+++ b/docs/source/config_schema.md
@@ -22,6 +22,8 @@
- **One of**
- *string*
- *null*
+ - *array*
+ - **Items** *(string)*
- **`psm_file_type`** *(string)*: PSM file type. By default inferred from file extension. Default: `"infer"`.
- **`psm_reader_kwargs`** *(object)*: Keyword arguments passed to the PSM reader. Default: `{}`.
- **`spectrum_path`**: Path to spectrum file or directory with spectrum files.
diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py
index 5665de1e..4cac9122 100644
--- a/ms2rescore/__main__.py
+++ b/ms2rescore/__main__.py
@@ -62,6 +62,7 @@ def _argument_parser() -> argparse.ArgumentParser:
metavar="FILE",
action="store",
type=str,
+ nargs="*",
dest="psm_file",
help="path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML...)",
)
diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py
index bc86782e..ced80c28 100644
--- a/ms2rescore/config_parser.py
+++ b/ms2rescore/config_parser.py
@@ -44,11 +44,18 @@ def _validate_filenames(config: Dict) -> Dict:
if not config["ms2rescore"]["psm_file"]:
raise MS2RescoreConfigurationError("PSM file should be provided.")
- # psm_file should exist
- id_file = Path(config["ms2rescore"]["psm_file"])
- if not id_file.is_file():
- raise FileNotFoundError(id_file)
- config["ms2rescore"]["psm_file"] = id_file.as_posix()
+ # If psm_file is a string, turn it into a list; otherwise leave it as is
+ if isinstance(config["ms2rescore"]["psm_file"], str):
+ config["ms2rescore"]["psm_file"] = [config["ms2rescore"]["psm_file"]]
+
+ # all provided psm_file(s) should exist
+ psm_files = []
+ for psm_file in config["ms2rescore"]["psm_file"]:
+ id_file = Path(psm_file)
+ if not id_file.is_file():
+ raise FileNotFoundError(id_file)
+ psm_files.append(id_file.as_posix())
+ config["ms2rescore"]["psm_file"] = psm_files
# spectrum_path should either be None, or existing path to file or dir
if config["ms2rescore"]["spectrum_path"]:
@@ -59,10 +66,10 @@ def _validate_filenames(config: Dict) -> Dict:
# Parse output_path
config["ms2rescore"]["output_path"] = _parse_output_path(
- config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"]
+ config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
)
- # Parse config_file as posix path
+ # Parse config_file as posix path to avoid combination of forward and backward slashes
if config["ms2rescore"]["config_file"]:
config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index 0f1bd194..fffb1902 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -8,6 +8,7 @@
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
+from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
@@ -26,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
"""
- config = configuration["ms2rescore"] # TODO: Remove top-level key?
+ config = configuration["ms2rescore"]
output_file_root = config["output_path"]
# Write full configuration including defaults to file
@@ -36,7 +37,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logger.debug("Using %i of %i available CPUs.", int(config["processes"]), int(cpu_count()))
# Parse PSMs
- psm_list = parse_psms(config, psm_list, output_file_root)
+ psm_list = parse_psms(config, psm_list)
# Log #PSMs identified before rescoring
id_psms_before = _log_id_psms_before(psm_list)
@@ -53,6 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)
+ # TODO: Avoid hard-coding feature generator names here
+ rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
+ im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
+ if rt_required or im_required:
+ logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
+ get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)
+
# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally?
@@ -118,9 +126,15 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
elif "mokapot" in config["rescoring_engine"]:
if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
+ if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
+ protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
+ else:
+ protein_kwargs = dict()
+
mokapot.rescore(
psm_list,
output_file_root=output_file_root,
+ protein_kwargs=protein_kwargs,
**config["rescoring_engine"]["mokapot"],
)
else:
diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
index dcf84ea6..50b577ff 100644
--- a/ms2rescore/feature_generators/deeplc.py
+++ b/ms2rescore/feature_generators/deeplc.py
@@ -28,10 +28,7 @@
from psm_utils import PSMList
from psm_utils.io import peptide_record
-from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.feature_generators.base import FeatureGeneratorBase
-from ms2rescore.parse_mgf import parse_mgf_title_rt
-from ms2rescore.utils import infer_spectrum_path
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
@@ -146,35 +143,20 @@ def add_features(self, psm_list: PSMList) -> None:
logger.info(
f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
- # Prepare PSM file
+
+ # Suppress TensorFlow's unstructured logging to stdout, unless in debug mode
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
+ # Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
- if not all(psm_list["retention_time"]):
- # Prepare spectrum filenames
- spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
- retention_time_dict = parse_mgf_title_rt(
- spectrum_filename
- ) # TODO Add mzML support
- try:
- psm_list_run["retention_time"] = [
- retention_time_dict[psm_id]
- for psm_id in psm_list_run["spectrum_id"]
- ]
- except KeyError:
- raise MS2RescoreError(
- "Could not map all spectrum ids to retention times"
- )
-
+ logger.debug("Calibrating DeepLC...")
psm_list_calibration = self._get_calibration_psms(psm_list_run)
-
- logger.debug("Calibrating DeepLC")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
- path_model=self.user_model or self.selected_model,
+ path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)
self.deeplc_predictor.calibrate_preds(
@@ -182,14 +164,16 @@ def add_features(self, psm_list: PSMList) -> None:
)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
- if not self.user_model and not self.selected_model:
+ if not self.selected_model:
self.selected_model = list(self.deeplc_predictor.model.keys())
+ self.deeplc_kwargs["deeplc_retrain"] = False
logger.debug(
f"Selected DeepLC model {self.selected_model} based on "
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
)
+ logger.debug("Predicting retention times...")
predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
@@ -198,6 +182,7 @@ def add_features(self, psm_list: PSMList) -> None:
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)
+ logger.debug("Adding features to PSMs...")
for i, psm in enumerate(psm_list_run):
psm["rescoring_features"].update(
{
diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py
index 12f75f82..e55e6cc6 100644
--- a/ms2rescore/feature_generators/ionmob.py
+++ b/ms2rescore/feature_generators/ionmob.py
@@ -129,64 +129,61 @@ def add_features(self, psm_list: PSMList) -> None:
logger.info(
f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
- with contextlib.redirect_stdout(
- open(os.devnull, "w")
- ) if not self._verbose else contextlib.nullcontext():
- psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
- psm_list_run_df = psm_list_run.to_dataframe()
-
- # prepare data frames for CCS prediction
- psm_list_run_df["charge"] = [
- peptidoform.precursor_charge
- for peptidoform in psm_list_run_df["peptidoform"]
- ]
- psm_list_run_df = psm_list_run_df[
- psm_list_run_df["charge"] < 5
- ] # predictions do not go higher for ionmob
-
- psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
- lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
- )
- psm_list_run_df = psm_list_run_df[
- psm_list_run_df.apply(
- lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
- axis=1,
- )
- ]
-
- psm_list_run_df["mz"] = psm_list_run_df.apply(
- lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
- ) # use precursor m/z from PSMs?
-
- psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
- lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
+
+ psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+ psm_list_run_df = psm_list_run.to_dataframe()
+
+ # prepare data frames for CCS prediction
+ psm_list_run_df["charge"] = [
+ peptidoform.precursor_charge for peptidoform in psm_list_run_df["peptidoform"]
+ ]
+ psm_list_run_df = psm_list_run_df[
+ psm_list_run_df["charge"] < 5
+ ] # predictions do not go higher for ionmob
+
+ psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
+ lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
+ )
+ psm_list_run_df = psm_list_run_df[
+ psm_list_run_df.apply(
+ lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
axis=1,
)
- # calibrate CCS values
- shift_factor = self.calculate_ccs_shift(psm_list_run_df)
- psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
- lambda x: x["ccs_observed"] + shift_factor, axis=1
- )
- # predict CCS values
- tf_ds = to_tf_dataset_inference(
- psm_list_run_df["mz"],
- psm_list_run_df["charge"],
- psm_list_run_df["sequence-tokenized"],
- self.tokenizer,
- )
+ ]
+
+ psm_list_run_df["mz"] = psm_list_run_df.apply(
+ lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
+ ) # use precursor m/z from PSMs?
+
+ psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
+ lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
+ axis=1,
+ )
+ # calibrate CCS values
+ shift_factor = self.calculate_ccs_shift(psm_list_run_df)
+ psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
+ lambda x: x["ccs_observed"] + shift_factor, axis=1
+ )
+ # predict CCS values
+ tf_ds = to_tf_dataset_inference(
+ psm_list_run_df["mz"],
+ psm_list_run_df["charge"],
+ psm_list_run_df["sequence-tokenized"],
+ self.tokenizer,
+ )
- psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)
+ psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)
- # calculate CCS features
- ccs_features = self._calculate_features(psm_list_run_df)
+ # calculate CCS features
+ ccs_features = self._calculate_features(psm_list_run_df)
- # add CCS features to PSMs
- for psm in psm_list_run:
- try:
- psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
- except KeyError:
- psm["rescoring_features"].update({})
- current_run += 1
+ # add CCS features to PSMs
+ for psm in psm_list_run:
+ try:
+ psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
+ except KeyError:
+ psm["rescoring_features"].update({})
+ current_run += 1
def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
"""Get CCS features for PSMs."""
diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py
index b5b4a9eb..429e117f 100644
--- a/ms2rescore/gui/__main__.py
+++ b/ms2rescore/gui/__main__.py
@@ -1,6 +1,8 @@
"""Entrypoint for MS²Rescore GUI."""
import multiprocessing
+import os
+import contextlib
from ms2rescore.gui.app import app
@@ -8,7 +10,9 @@
def main():
"""Entrypoint for MS²Rescore GUI."""
multiprocessing.freeze_support()
- app()
+ # Redirect stdout when running the GUI (the packaged app might not have a console attached)
+ with contextlib.redirect_stdout(open(os.devnull, "w")):
+ app()
if __name__ == "__main__":
diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py
index 03e8dcfc..caafd537 100644
--- a/ms2rescore/gui/app.py
+++ b/ms2rescore/gui/app.py
@@ -10,6 +10,7 @@
from typing import Dict, List, Tuple
import customtkinter as ctk
+from joblib import parallel_backend
from ms2pip.constants import MODELS as ms2pip_models
from PIL import Image
from psm_utils.io import FILETYPES
@@ -41,6 +42,9 @@
ctk.set_default_color_theme(_THEME_FILE)
+# TODO Does this disable multiprocessing everywhere?
+parallel_backend("threading")
+
class SideBar(ctk.CTkFrame):
def __init__(self, *args, **kwargs):
@@ -161,18 +165,11 @@ def get(self):
main_config = self.main_config.get()
advanced_config = self.advanced_config.get()
- # TODO Move to rescoring engine config
- percolator_config = {"init-weights": advanced_config.pop("weightsfile")}
-
config = {"ms2rescore": main_config}
config["ms2rescore"].update(advanced_config)
config["ms2rescore"]["feature_generators"] = self.fgen_config.get()
config["ms2rescore"]["rescoring_engine"] = self.rescoring_engine_config.get()
- # TODO See above
- if "percolator" in config["ms2rescore"]["rescoring_engine"]:
- config["ms2rescore"]["rescoring_engine"]["percolator"] = percolator_config
-
args = (config,) # Comma required to wrap in tuple
kwargs = {}
@@ -245,7 +242,7 @@ def _parse_modification_mapping(table_output):
for mod in table_output:
if mod[0] and mod[1]:
modification_map[mod[0].strip()] = mod[1].strip()
- return modification_map
+ return modification_map or None
@staticmethod
def _parse_fixed_modifications(table_output):
@@ -255,7 +252,7 @@ def _parse_fixed_modifications(table_output):
if mod[0] and mod[1]:
amino_acids = [aa.upper() for aa in mod[1].strip().split(",")]
fixed_modifications[mod[0]] = amino_acids
- return fixed_modifications
+ return fixed_modifications or None
class PSMFileConfigFrame(ctk.CTkFrame):
@@ -267,7 +264,7 @@ def __init__(self, *args, **kwargs):
self.grid_columnconfigure(0, weight=1)
self.psm_file = widgets.LabeledFileSelect(
- self, label="Select identification file", file_option="openfile"
+ self, label="Select identification file", file_option="openfiles"
)
self.psm_file.grid(row=0, column=0, pady=0, padx=(0, 5), sticky="nsew")
@@ -281,8 +278,14 @@ def __init__(self, *args, **kwargs):
def get(self) -> Dict:
"""Get the configured values as a dictionary."""
+ try:
+ # Paths are space-separated, so individual file paths must not contain spaces
+ # TODO: Fix this in widgets.LabeledFileSelect
+ psm_files = self.psm_file.get().split(" ")
+ except AttributeError:
+ raise MS2RescoreConfigurationError("No PSM file provided. Please select a file.")
return {
- "psm_file": self.psm_file.get(),
+ "psm_file": psm_files,
"psm_file_type": self.psm_file_type.get(),
}
@@ -315,11 +318,6 @@ def __init__(self, *args, **kwargs):
self.spectrum_id_pattern = widgets.LabeledEntry(self, label="Spectrum ID regex pattern")
self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew")
- self.weightsfile = widgets.LabeledFileSelect(
- self, label="Pretrained Percolator weights", file_option="openfile"
- )
- self.weightsfile.grid(row=6, column=0, columnspan=2, sticky="nsew")
-
self.file_prefix = widgets.LabeledFileSelect(
self, label="Filename for output files", file_option="savefile"
)
@@ -338,7 +336,6 @@ def get(self) -> Dict:
"id_decoy_pattern": self.id_decoy_pattern.get(),
"psm_id_pattern": self.psm_id_pattern.get(),
"spectrum_id_pattern": self.spectrum_id_pattern.get(),
- "weightsfile": self.weightsfile.get(),
"output_path": self.file_prefix.get(),
"config_file": self.config_file.get(),
"write_report": self.generate_report.get(),
@@ -458,12 +455,20 @@ def __init__(self, *args, **kwargs):
self.transfer_learning = widgets.LabeledSwitch(self, label="Use transfer learning")
self.transfer_learning.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
+ self.num_epochs = widgets.LabeledFloatSpinbox(
+ self,
+ label="Number of transfer learning epochs",
+ step_size=5,
+ initial_value=20,
+ ) # TODO: Avoid displaying the value as a float in the spinbox label?
+ self.num_epochs.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
+
self.calibration_set_size = widgets.LabeledEntry(
self,
label="Set calibration set size (fraction or number of PSMs)",
placeholder_text="0.15",
)
- self.calibration_set_size.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
+ self.calibration_set_size.grid(row=4, column=0, pady=(0, 10), sticky="nsew")
def get(self) -> Dict:
"""Return the configuration as a dictionary."""
@@ -482,6 +487,7 @@ def get(self) -> Dict:
enabled = self.enabled.get()
config = {
"deeplc_retrain": self.transfer_learning.get(),
+ "n_epochs": int(self.num_epochs.get()),
"calibration_set_size": calibration_set_size,
}
return enabled, config
@@ -498,7 +504,7 @@ def __init__(self, *args, **kwargs):
self.title = widgets.Heading(self, text="Ionmob")
self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
- self.enabled = widgets.LabeledSwitch(self, label="Enable Ionmob", default=True)
+ self.enabled = widgets.LabeledSwitch(self, label="Enable Ionmob", default=False)
self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
self.model = widgets.LabeledEntry(
@@ -554,27 +560,49 @@ def __init__(self, *args, **kwargs):
self.configure(fg_color="transparent")
self.grid_columnconfigure(0, weight=1)
- self.title = widgets.Heading(self, text="Mokapot cofiguration")
+ self.title = widgets.Heading(self, text="Mokapot coffeeguration")
self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
- self.write_weights = widgets.LabeledSwitch(self, label="Write weightsfile", default=True)
+ self.write_weights = widgets.LabeledSwitch(
+ self, label="Write model weights to file", default=True
+ )
self.write_weights.grid(row=1, column=0, pady=(0, 10), sticky="nsew")
- self.write_txt = widgets.LabeledSwitch(self, label="Write txt output file", default=True)
+ self.write_txt = widgets.LabeledSwitch(self, label="Write TXT output files", default=True)
self.write_txt.grid(row=2, column=0, pady=(0, 10), sticky="nsew")
- self.write_flashlfq = widgets.LabeledSwitch(self, label="Write flashlfq", default=False)
+ self.write_flashlfq = widgets.LabeledSwitch(
+ self, label="Write file for FlashLFQ", default=False
+ )
self.write_flashlfq.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
+ self.protein_kwargs = widgets.TableInput(
+ self,
+ label="`mokapot.read_fasta` options (see Mokapot documentation)",
+ columns=2,
+ header_labels=["Parameter", "Value"],
+ )
+ self.protein_kwargs.grid(row=4, column=0, sticky="nsew")
+
def get(self) -> Dict:
"""Return the configuration as a dictionary."""
config = {
"write_weights": self.write_weights.get(),
"write_txt": self.write_txt.get(),
"write_flashlfq": self.write_flashlfq.get(),
+ "protein_kwargs": self._parse_protein_kwargs(self.protein_kwargs.get()),
}
return config
+ @staticmethod
+ def _parse_protein_kwargs(table_output):
+ """Parse text input modifications mapping"""
+ protein_kwargs = {}
+ for mod in table_output:
+ if mod[0] and mod[1]:
+ protein_kwargs[mod[0].strip()] = mod[1].strip()
+ return protein_kwargs
+
class PercolatorRescoringConfiguration(ctk.CTkFrame):
def __init__(self, *args, **kwargs):
@@ -584,19 +612,24 @@ def __init__(self, *args, **kwargs):
self.configure(fg_color="transparent")
self.grid_columnconfigure(0, weight=1)
- self.title = widgets.Heading(self, text="Percolator cofiguration")
+ self.title = widgets.Heading(self, text="Percolator coffeeguration")
self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")
+ self.weights_file = widgets.LabeledFileSelect(
+ self, label="Pretrained Percolator model weights", file_option="openfile"
+ )
+ self.weights_file.grid(row=1, column=0, columnspan=2, sticky="nsew")
+
def get(self) -> Dict:
"""Return the configuration as a dictionary."""
- config = {}
+ config = {"init-weights": self.weights_file.get()}
return config
def function(config):
"""Function to be executed in a separate process."""
config = config.copy()
- config = parse_configurations(config)
+ config = parse_configurations([config["ms2rescore"]["config_file"], config])
rescore(configuration=config)
@@ -608,8 +641,8 @@ def app():
function=function,
)
root.protocol("WM_DELETE_WINDOW", sys.exit)
- root.geometry(f"{1250}x{700}")
- root.minsize(1000, 700)
+ dpi = root.winfo_fpixels("1i")
+ root.geometry(f"{int(15*dpi)}x{int(10*dpi)}")
root.title("MS²Rescore")
root.wm_iconbitmap(os.path.join(str(_IMG_DIR), "program_icon.ico"))
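The fixed 1250x700 window is replaced by a DPI-scaled geometry. A stand-alone sketch of the same idea with plain tkinter; the 15 x 10 inch target mirrors the values used above:

```python
# DPI-aware window sizing (plain tkinter, independent of the MS²Rescore GUI).
import tkinter as tk

root = tk.Tk()
dpi = root.winfo_fpixels("1i")  # screen pixels per inch
root.geometry(f"{int(15 * dpi)}x{int(10 * dpi)}")  # roughly 15 x 10 inches
root.mainloop()
```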
diff --git a/ms2rescore/gui/function2ctk.py b/ms2rescore/gui/function2ctk.py
index 9ffe38bd..60bad120 100644
--- a/ms2rescore/gui/function2ctk.py
+++ b/ms2rescore/gui/function2ctk.py
@@ -10,7 +10,6 @@
import customtkinter as ctk
-
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@@ -51,13 +50,9 @@ def __init__(
self.function = function
- # # App config
- self.geometry(f"{1250}x{700}")
- self.minsize(1000, 700)
-
# 2x3 grid, only logging column expands with window
- self.grid_columnconfigure(0, weight=0, minsize=500) # Left: Sidebar
- self.grid_columnconfigure(1, weight=0, minsize=1000) # Middle: Configuration
+ self.grid_columnconfigure(0, weight=0) # Left: Sidebar
+ self.grid_columnconfigure(1, weight=2) # Middle: Configuration
self.grid_columnconfigure(2, weight=1) # Right: Logging
self.grid_rowconfigure(0, weight=1)
diff --git a/ms2rescore/gui/widgets.py b/ms2rescore/gui/widgets.py
index 06bcce9a..ca3d03f1 100644
--- a/ms2rescore/gui/widgets.py
+++ b/ms2rescore/gui/widgets.py
@@ -268,6 +268,9 @@ def __init__(self, *args, label="Select file", file_option="openfile", **kwargs)
elif file_option == "openfile":
self._button_1 = ctk.CTkButton(self, text="Browse files", command=self._pick_file)
+ elif file_option == "openfiles":
+ self._button_1 = ctk.CTkButton(self, text="Browse files", command=self._pick_files)
+
elif file_option == "file/dir":
self._button_1 = ctk.CTkButton(self, text="Browse files", command=self._pick_file)
self._button_2 = ctk.CTkButton(self, text="Browse directories", command=self._pick_dir)
@@ -296,6 +299,10 @@ def _pick_file(self):
self._selected_filename = tk.filedialog.askopenfilename()
self._update_entry()
+ def _pick_files(self):
+ self._selected_filename = tk.filedialog.askopenfilenames()
+ self._update_entry()
+
def _pick_dir(self):
self._selected_filename = tk.filedialog.askdirectory()
self._update_entry()
diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json
index a215c2a3..a97c9a59 100644
--- a/ms2rescore/package_data/config_schema.json
+++ b/ms2rescore/package_data/config_schema.json
@@ -65,7 +65,7 @@
},
"psm_file": {
"description": "Path to file with peptide-spectrum matches.",
- "oneOf": [{ "type": "string" }, { "type": "null" }]
+ "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }]
},
"psm_file_type": {
"description": "PSM file type. By default inferred from file extension.",
diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py
deleted file mode 100644
index abc422e0..00000000
--- a/ms2rescore/parse_mgf.py
+++ /dev/null
@@ -1,50 +0,0 @@
-"""Parse MGF files."""
-
-import logging
-import mmap
-import os.path
-from typing import Union, Tuple, Dict
-
-from rich.progress import track
-from pyteomics.mgf import MGF
-
-from ms2rescore.exceptions import MS2RescoreError
-
-logger = logging.getLogger(__name__)
-
-
-class ParseMGFError(MS2RescoreError):
- """Error parsing MGF file."""
-
- pass
-
-
-def parse_mgf_title_rt(path_to_mgf: Union[str, os.PathLike]) -> Dict[str, float]:
- """Parse MGF file to extract title and retention time fields, by spectrum index."""
- logger.debug("Parsing MGF file to extract retention times.")
- mgf_reader = MGF(path_to_mgf, read_charges=False, read_ions=False)
- retention_times = {}
- for spectrum in mgf_reader:
- try:
- title = spectrum["params"]["title"]
- except KeyError:
- raise ParseMGFError("MGF file missing title field.")
- try:
- rt = float(spectrum["params"]["rtinseconds"])
- except KeyError:
- rt = None
- retention_times[title] = rt
-
- if any(list(retention_times.values())):
- return retention_times
- else:
- raise ParseMGFError("MGF file missing rtinseconds field.")
-
-
-def get_num_lines(file_path):
- fp = open(file_path, "r+")
- buf = mmap.mmap(fp.fileno(), 0)
- lines = 0
- while buf.readline():
- lines += 1
- return lines
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
index 3eb3d19f..e116c32f 100644
--- a/ms2rescore/parse_psms.py
+++ b/ms2rescore/parse_psms.py
@@ -1,16 +1,17 @@
import logging
import re
+from itertools import chain
from typing import Dict, Union
import psm_utils.io
from psm_utils import PSMList
-from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError
+from ms2rescore.exceptions import MS2RescoreConfigurationError
logger = logging.getLogger(__name__)
-def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList:
+def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
"""
Parse PSMs and prepare for rescoring.
@@ -21,8 +22,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s
top-level key).
psm_list
PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
- output_file_root
- Path to output file root (without file extension).
"""
# Read PSMs, find decoys, calculate q-values
@@ -60,24 +59,36 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s
def _read_psms(config, psm_list):
- logger.info("Reading PSMs...")
if isinstance(psm_list, PSMList):
return psm_list
else:
- try:
- return psm_utils.io.read_file(
- config["psm_file"],
- filetype=config["psm_file_type"],
- show_progressbar=True,
- **config["psm_reader_kwargs"],
- )
- except psm_utils.io.PSMUtilsIOException:
- raise MS2RescoreConfigurationError(
- "Error occurred while reading PSMs. Please check the `psm_file` and "
- "`psm_file_type` settings. See "
- "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
- " for more information."
+ logger.info("Reading PSMs from file...")
+ current_file = 1
+ total_files = len(config["psm_file"])
+ psm_list_list = []
+ for psm_file in config["psm_file"]:
+ logger.info(
+ f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..."
)
+ try:
+ id_file_psm_list = psm_utils.io.read_file(
+ psm_file,
+ filetype=config["psm_file_type"],
+ show_progressbar=True,
+ **config["psm_reader_kwargs"],
+ )
+ except psm_utils.io.PSMUtilsIOException:
+ raise MS2RescoreConfigurationError(
+ "Error occurred while reading PSMs. Please check the `psm_file` and "
+ "`psm_file_type` settings. See "
+ "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
+ " for more information."
+ )
+
+ psm_list_list.append(id_file_psm_list)
+ current_file += 1
+
+ return PSMList(psm_list=list(chain.from_iterable(p.psm_list for p in psm_list_list)))
def _find_decoys(config, psm_list):
@@ -113,7 +124,7 @@ def _match_psm_ids(old_id, regex_pattern):
try:
return match[1]
except (TypeError, IndexError):
- raise MS2RescoreError(
+ raise MS2RescoreConfigurationError(
"`psm_id_pattern` could not be matched to all PSM spectrum IDs."
" Ensure that the regex contains a capturing group?"
)
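`_read_psms` now reads each file separately and flattens the results into one `PSMList`. A minimal stand-alone sketch of that pattern, with hypothetical file names:

```python
# Merge PSMs from several identification files into a single PSMList.
from itertools import chain

import psm_utils.io
from psm_utils import PSMList

files = ["sample.pin", "sample.sage.tsv"]  # hypothetical input files
psm_lists = [psm_utils.io.read_file(f) for f in files]
merged = PSMList(psm_list=list(chain.from_iterable(p.psm_list for p in psm_lists)))
```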
diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py
new file mode 100644
index 00000000..9ed199b9
--- /dev/null
+++ b/ms2rescore/parse_spectra.py
@@ -0,0 +1,137 @@
+"""Parse MGF files."""
+
+import logging
+import re
+from itertools import chain
+from typing import Dict, Tuple
+
+from psm_utils import PSMList
+from pyteomics.mgf import MGF
+from pyteomics.mzml import MzML
+from rich.progress import track
+
+from ms2rescore.exceptions import MS2RescoreError
+from ms2rescore.utils import infer_spectrum_path
+
+logger = logging.getLogger(__name__)
+
+
+def get_missing_values(config, psm_list, missing_rt=False, missing_im=False):
+ """Get missing RT/IM features from spectrum file."""
+ logger.debug("Extracting missing RT/IM values from spectrum file(s).")
+
+ psm_dict = psm_list.get_psm_dict()
+ for runs in psm_dict.values():
+ for run, psms in track(runs.items(), description="Extracting RT/IM values..."):
+ psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+ spectrum_file = infer_spectrum_path(config["spectrum_path"], run)
+
+ if spectrum_file.suffix.lower() == ".mzml":
+ rt_dict, im_dict = _parse_values_from_mzml(
+ spectrum_file, config, run, missing_rt, missing_im
+ )
+ elif spectrum_file.suffix.lower() == ".mgf":
+ rt_dict, im_dict = _parse_values_from_mgf(
+ spectrum_file, config, run, missing_rt, missing_im
+ )
+
+ for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]):
+ if value_dict:
+ try:
+ psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run]
+ except KeyError:
+ raise ParsingError(
+ f"Could not parse {value} values from spectrum file for run {run}."
+ )
+
+
+def _parse_values_from_mgf(
+ spectrum_file, config, run, missing_rt, missing_im
+) -> Tuple[Dict, Dict]:
+ """
+ Parse retention time and/or ion mobility from an MGF file.
+
+ Notes
+ -----
+ - Extracting values (e.g., ion mobility) according to the Matrix Science documentation:
+ http://www.matrixscience.com/help/data_file_help.html
+
+ """
+ rt_dict = {}
+ im_dict = {}
+
+ spectrum_id_pattern = re.compile(
+ config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
+ )
+
+ for spectrum in MGF(str(spectrum_file)):
+ matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group()
+ if missing_rt:
+ try:
+ rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"])
+ except KeyError:
+ raise ParsingError(
+ "Could not parse retention time (`rtinseconds`) from spectrum file for "
+ f"run {run}. Please make sure that the retention time key is present in the "
+ "spectrum file or disable the relevant feature generator."
+ )
+ if missing_im:
+ try:
+ im_dict[matched_id] = float(spectrum["params"]["ion_mobility"])
+ except KeyError:
+ raise ParsingError(
+ "Could not parse ion mobility (`ion_mobility`) from spectrum file "
+ f"for run {run}. Please make sure that the ion mobility key is present in the "
+ "spectrum file or disable the relevant feature generator."
+ )
+
+ return rt_dict, im_dict
+
+
+def _parse_values_from_mzml(
+ spectrum_file, config, run, missing_rt, missing_im
+) -> Tuple[Dict, Dict]:
+ """Parse retention time and/or ion mobility from an mzML file."""
+ rt_dict = {}
+ im_dict = {}
+
+ spectrum_id_pattern = re.compile(
+ config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)"
+ )
+
+ for spectrum in MzML(str(spectrum_file)):
+ matched_id = spectrum_id_pattern.match(spectrum["id"]).group()
+ if missing_rt:
+ try:
+ rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"])
+ except KeyError:
+ raise ParsingError(
+ "Could not parse retention time (`scan start time`) from spectrum file for "
+ f"run {run}. Please make sure that the retention time key is present in the "
+ "spectrum file or disable the relevant feature generator."
+ )
+ if missing_im:
+ try:
+ im_dict[matched_id] = float(
+ spectrum["scanList"]["scan"][0]["reverse ion mobility"]
+ )
+ except KeyError:
+ raise ParsingError(
+ "Could not parse ion mobility (`reverse ion mobility`) from spectrum file "
+ f"for run {run}. Please make sure that the ion mobility key is present in the "
+ "spectrum file or disable the relevant feature generator."
+ )
+
+ return rt_dict, im_dict
+
+
+class ParseMGFError(MS2RescoreError):
+ """Error parsing MGF file."""
+
+ pass
+
+
+class ParsingError(MS2RescoreError):
+ """Error parsing retention time from spectrum file."""
+
+ pass
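For reference, a hedged sketch of the MGF branch implemented above: retention times are collected per spectrum title with pyteomics, following the Matrix Science MGF conventions (file name is hypothetical):

```python
# Pull retention times by spectrum title from an MGF file (sketch only).
from pyteomics.mgf import MGF

rt_by_title = {}
for spectrum in MGF("sample.mgf"):
    title = spectrum["params"]["title"]
    rt_by_title[title] = float(spectrum["params"]["rtinseconds"])
```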
diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py
index 64ab0e4b..090db873 100644
--- a/ms2rescore/report/generate.py
+++ b/ms2rescore/report/generate.py
@@ -96,7 +96,9 @@ def generate_report(
"metadata": {
"generated_on": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
"ms2rescore_version": ms2rescore.__version__, # TODO: Write during run?
- "psm_filename": Path(config["ms2rescore"]["psm_file"]).name,
+ "psm_filename": "\n".join(
+ [Path(id_file).name for id_file in config["ms2rescore"]["psm_file"]]
+ ),
},
"main_tabs": [
{
diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py
index 65a3d038..f3927f47 100644
--- a/ms2rescore/rescoring_engines/mokapot.py
+++ b/ms2rescore/rescoring_engines/mokapot.py
@@ -20,7 +20,7 @@
"""
import logging
-from typing import Any, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple, Dict
import mokapot
import numpy as np
@@ -40,6 +40,7 @@ def rescore(
write_weights: bool = False,
write_txt: bool = False,
write_flashlfq: bool = False,
+ protein_kwargs: Optional[Dict[str, Any]] = None,
**kwargs: Any,
) -> None:
"""
@@ -68,6 +69,9 @@ def rescore(
Write Mokapot results to a text file. Defaults to ``False``.
write_flashlfq
Write Mokapot results to a FlashLFQ-compatible file. Defaults to ``False``.
+ protein_kwargs
+ Keyword arguments to pass to the :py:meth:`~mokapot.dataset.LinearPsmDataset.add_proteins`
+ method.
**kwargs
Additional keyword arguments are passed to the Mokapot :py:func:`~mokapot.brew` function.
@@ -80,11 +84,11 @@ def rescore(
# Add proteins
if fasta_file:
- proteins = mokapot.read_fasta(fasta_file)
- lin_psm_data.add_proteins(proteins)
+ logger.debug(f"Adding protein info from {fasta_file} with options: `{protein_kwargs}`")
+ lin_psm_data.add_proteins(fasta_file, **protein_kwargs)
# Rescore
- logger.debug(f"Mokapot keyword arguments : {kwargs}")
+ logger.debug(f"Mokapot brew options: `{kwargs}`")
confidence_results, models = brew(lin_psm_data, **kwargs)
# Reshape confidence estimates to match PSMList
@@ -120,6 +124,8 @@ def rescore(
if write_txt:
confidence_results.to_txt(file_root=output_file_root, decoys=True)
if write_flashlfq:
+ # TODO: How do we validate that the RTs are in minutes?
+ confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60
confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt")
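The new `protein_kwargs` dictionary is popped from the Mokapot configuration in `core.py` and forwarded to `LinearPsmDataset.add_proteins` (and through it to `mokapot.read_fasta`). A hedged sketch of the configuration side; the option names shown are assumptions about `read_fasta` parameters:

```python
# Hypothetical Mokapot configuration excerpt; "protein_kwargs" is split off
# before the remaining options are passed on to mokapot.brew().
mokapot_config = {
    "write_weights": True,
    "write_txt": True,
    "protein_kwargs": {"decoy_prefix": "decoy_", "min_length": 6},
}
protein_kwargs = mokapot_config.pop("protein_kwargs", {})
```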
diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
index 26c88a7e..70417b56 100644
--- a/ms2rescore/utils.py
+++ b/ms2rescore/utils.py
@@ -75,4 +75,4 @@ def infer_spectrum_path(
"files."
)
- return resolved_path
+ return Path(resolved_path)
diff --git a/ms2rescore_innosetup.iss b/ms2rescore_innosetup.iss
index 8bbd2aaa..8e536444 100644
--- a/ms2rescore_innosetup.iss
+++ b/ms2rescore_innosetup.iss
@@ -2,6 +2,7 @@
#define MyAppPublisher "CompOmics"
#define MyAppURL "https://github.com/compomics/ms2rescore"
#define MyAppExeName "ms2rescore.exe"
+#define OutputFilename "{#MyAppName}-{#MyAppVersion}-Windows64bit"
[Setup]
AppId={{2D3D12BD-3AE2-426E-8DE8-092148C12071}
@@ -17,7 +18,7 @@ LicenseFile=.\LICENSE
PrivilegesRequired=lowest
PrivilegesRequiredOverridesAllowed=dialog
OutputDir="dist"
-OutputBaseFilename="{#MyAppName}-{#MyAppVersion}-Windows64bit"
+OutputBaseFilename={#OutputFilename}
Compression=lzma
SolidCompression=yes
WizardStyle=modern