diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ae565344..4820d14e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,7 +7,7 @@ on: pull_request: jobs: - test: + test-python-package: runs-on: ubuntu-latest strategy: matrix: @@ -42,3 +42,30 @@ jobs: - name: Test installation run: | ms2rescore --help + + test-windows-installer: + # Only run on push to main (e.g., after PR merge) + if: ${{ github.ref == 'refs/heads/main' }} + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + + - uses: actions/setup-python@v2 + with: + python-version: "3.11" + + - name: Install package and dependencies + run: | + python -m pip install --upgrade pip + pip install . pyinstaller + + - name: Install Inno Setup + uses: crazy-max/ghaction-chocolatey@v1 + with: + args: install innosetup -y --allow-unofficial --force + + - name: Run pyinstaller + run: pyinstaller ./ms2rescore.spec --clean --noconfirm + + - name: Test built exe + run: dist/ms2rescore/ms2rescore.exe diff --git a/README.md b/README.md index 2adfdb17..991bb436 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti files: - [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv` +- [Sage](https://github.com/lazear/sage) `.sage.tsv` - [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid` - [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid` +- [Mascot](https://www.matrixscience.com/) `.mzid` - [MaxQuant](https://www.maxquant.org/) `msms.txt` - [X!Tandem](https://www.thegpm.org/tandem/) `.xml` - [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid` @@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to > **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.** > Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels. -> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) > +> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) **Original publication describing the concept of rescoring with predicted spectra:** > **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.** > Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve. -> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) > +> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) To replicate the experiments described in this article, check out the [publication branch][publication-branch] of the repository. diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md index b7d12c32..2b523aa5 100644 --- a/docs/source/config_schema.md +++ b/docs/source/config_schema.md @@ -22,6 +22,8 @@ - **One of** - *string* - *null* + - *array* + - **Items** *(string)* - **`psm_file_type`** *(string)*: PSM file type. By default inferred from file extension. Default: `"infer"`. - **`psm_reader_kwargs`** *(object)*: Keyword arguments passed to the PSM reader. Default: `{}`. - **`spectrum_path`**: Path to spectrum file or directory with spectrum files. 
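The `psm_file` schema change above broadens the option from a single path to either a path or a list of paths. As an illustrative, hedged sketch only (not part of this diff): the snippet below shows how such a configuration could be passed to the Python API, assuming `parse_configurations` lives in `ms2rescore.config_parser` and `rescore` in `ms2rescore.core`, as used elsewhere in this diff. File names are placeholders and unspecified options are expected to fall back to the schema defaults.

```python
from ms2rescore.config_parser import parse_configurations  # assumed module path
from ms2rescore.core import rescore

# `psm_file` may now be a single path or a list of paths (see schema change above).
# File names are placeholders; `psm_file_type` is inferred from the extension.
user_config = {
    "ms2rescore": {
        "psm_file": ["sample_1.sage.tsv", "sample_2.sage.tsv"],
        "psm_file_type": "infer",
        "spectrum_path": "./mzml/",
    }
}

# `parse_configurations` accepts a list of configuration sources, as in the GUI code below.
config = parse_configurations([user_config])
rescore(configuration=config)
```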
diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py index 5665de1e..4cac9122 100644 --- a/ms2rescore/__main__.py +++ b/ms2rescore/__main__.py @@ -62,6 +62,7 @@ def _argument_parser() -> argparse.ArgumentParser: metavar="FILE", action="store", type=str, + nargs="*", dest="psm_file", help="path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML...)", ) diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py index bc86782e..ced80c28 100644 --- a/ms2rescore/config_parser.py +++ b/ms2rescore/config_parser.py @@ -44,11 +44,18 @@ def _validate_filenames(config: Dict) -> Dict: if not config["ms2rescore"]["psm_file"]: raise MS2RescoreConfigurationError("PSM file should be provided.") - # psm_file should exist - id_file = Path(config["ms2rescore"]["psm_file"]) - if not id_file.is_file(): - raise FileNotFoundError(id_file) - config["ms2rescore"]["psm_file"] = id_file.as_posix() + # if psm_file is a string turn into a list else leave as is + if isinstance(config["ms2rescore"]["psm_file"], str): + config["ms2rescore"]["psm_file"] = [config["ms2rescore"]["psm_file"]] + + # all provided psm_file(s) should exist + psm_files = [] + for psm_file in config["ms2rescore"]["psm_file"]: + id_file = Path(psm_file) + if not id_file.is_file(): + raise FileNotFoundError(id_file) + psm_files.append(id_file.as_posix()) + config["ms2rescore"]["psm_file"] = psm_files # spectrum_path should either be None, or existing path to file or dir if config["ms2rescore"]["spectrum_path"]: @@ -59,10 +66,10 @@ def _validate_filenames(config: Dict) -> Dict: # Parse output_path config["ms2rescore"]["output_path"] = _parse_output_path( - config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"] + config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0] ) - # Parse config_file as posix path + # Parse config_file as posix path to avoid combination of forward and backward slashes if config["ms2rescore"]["config_file"]: config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix() diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 0f1bd194..fffb1902 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -8,6 +8,7 @@ from ms2rescore.feature_generators import FEATURE_GENERATORS from ms2rescore.parse_psms import parse_psms +from ms2rescore.parse_spectra import get_missing_values from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator @@ -26,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``. """ - config = configuration["ms2rescore"] # TODO: Remove top-level key? 
+ config = configuration["ms2rescore"] output_file_root = config["output_path"] # Write full configuration including defaults to file @@ -36,7 +37,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: logger.debug("Using %i of %i available CPUs.", int(config["processes"]), int(cpu_count())) # Parse PSMs - psm_list = parse_psms(config, psm_list, output_file_root) + psm_list = parse_psms(config, psm_list) # Log #PSMs identified before rescoring id_psms_before = _log_id_psms_before(psm_list) @@ -53,6 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) + # TODO: avoid hard coding feature generators in some way + rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"] + im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"] + if rt_required or im_required: + logger.info("Parsing missing retention time and/or ion mobility values from spectra...") + get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required) + # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): # TODO: Handle this somewhere else, more generally? @@ -118,9 +126,15 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: elif "mokapot" in config["rescoring_engine"]: if "fasta_file" not in config["rescoring_engine"]["mokapot"]: config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"] + if "protein_kwargs" in config["rescoring_engine"]["mokapot"]: + protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs") + else: + protein_kwargs = dict() + mokapot.rescore( psm_list, output_file_root=output_file_root, + protein_kwargs=protein_kwargs, **config["rescoring_engine"]["mokapot"], ) else: diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index dcf84ea6..50b577ff 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -28,10 +28,7 @@ from psm_utils import PSMList from psm_utils.io import peptide_record -from ms2rescore.exceptions import MS2RescoreError from ms2rescore.feature_generators.base import FeatureGeneratorBase -from ms2rescore.parse_mgf import parse_mgf_title_rt -from ms2rescore.utils import infer_spectrum_path os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logger = logging.getLogger(__name__) @@ -146,35 +143,20 @@ def add_features(self, psm_list: PSMList) -> None: logger.info( f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." 
) - # Prepare PSM file + + # Disable wild logging to stdout by Tensorflow, unless in debug mode with contextlib.redirect_stdout( open(os.devnull, "w") ) if not self._verbose else contextlib.nullcontext(): + # Make new PSM list for this run (chain PSMs per spectrum to flat list) psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - if not all(psm_list["retention_time"]): - # Prepare spectrum filenames - spectrum_filename = infer_spectrum_path(self.spectrum_path, run) - retention_time_dict = parse_mgf_title_rt( - spectrum_filename - ) # TODO Add mzML support - try: - psm_list_run["retention_time"] = [ - retention_time_dict[psm_id] - for psm_id in psm_list_run["spectrum_id"] - ] - except KeyError: - raise MS2RescoreError( - "Could not map all spectrum ids to retention times" - ) - + logger.debug("Calibrating DeepLC...") psm_list_calibration = self._get_calibration_psms(psm_list_run) - - logger.debug("Calibrating DeepLC") self.deeplc_predictor = self.DeepLC( n_jobs=self.processes, verbose=self._verbose, - path_model=self.user_model or self.selected_model, + path_model=self.selected_model or self.user_model, **self.deeplc_kwargs, ) self.deeplc_predictor.calibrate_preds( @@ -182,14 +164,16 @@ def add_features(self, psm_list: PSMList) -> None: ) # Still calibrate for each run, but do not try out all model options. # Just use model that was selected based on first run - if not self.user_model and not self.selected_model: + if not self.selected_model: self.selected_model = list(self.deeplc_predictor.model.keys()) + self.deeplc_kwargs["deeplc_retrain"] = False logger.debug( f"Selected DeepLC model {self.selected_model} based on " "calibration of first run. Using this model (after new " "calibrations) for the remaining runs." ) + logger.debug("Predicting retention times...") predictions = np.array( self.deeplc_predictor.make_preds( seq_df=self._psm_list_to_deeplc_peprec(psm_list_run) @@ -198,6 +182,7 @@ def add_features(self, psm_list: PSMList) -> None: observations = psm_list_run["retention_time"] rt_diffs_run = np.abs(predictions - observations) + logger.debug("Adding features to PSMs...") for i, psm in enumerate(psm_list_run): psm["rescoring_features"].update( { diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 12f75f82..e55e6cc6 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -129,64 +129,61 @@ def add_features(self, psm_list: PSMList) -> None: logger.info( f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..." 
) - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - psm_list_run_df = psm_list_run.to_dataframe() - - # prepare data frames for CCS prediction - psm_list_run_df["charge"] = [ - peptidoform.precursor_charge - for peptidoform in psm_list_run_df["peptidoform"] - ] - psm_list_run_df = psm_list_run_df[ - psm_list_run_df["charge"] < 5 - ] # predictions do not go higher for ionmob - - psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply( - lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1 - ) - psm_list_run_df = psm_list_run_df[ - psm_list_run_df.apply( - lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]), - axis=1, - ) - ] - - psm_list_run_df["mz"] = psm_list_run_df.apply( - lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1 - ) # use precursor m/z from PSMs? - - psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( - lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]), + + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + psm_list_run_df = psm_list_run.to_dataframe() + + # prepare data frames for CCS prediction + psm_list_run_df["charge"] = [ + peptidoform.precursor_charge for peptidoform in psm_list_run_df["peptidoform"] + ] + psm_list_run_df = psm_list_run_df[ + psm_list_run_df["charge"] < 5 + ] # predictions do not go higher for ionmob + + psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply( + lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1 + ) + psm_list_run_df = psm_list_run_df[ + psm_list_run_df.apply( + lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]), axis=1, ) - # calibrate CCS values - shift_factor = self.calculate_ccs_shift(psm_list_run_df) - psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( - lambda x: x["ccs_observed"] + shift_factor, axis=1 - ) - # predict CCS values - tf_ds = to_tf_dataset_inference( - psm_list_run_df["mz"], - psm_list_run_df["charge"], - psm_list_run_df["sequence-tokenized"], - self.tokenizer, - ) + ] + + psm_list_run_df["mz"] = psm_list_run_df.apply( + lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1 + ) # use precursor m/z from PSMs? 
+ + psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( + lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]), + axis=1, + ) + # calibrate CCS values + shift_factor = self.calculate_ccs_shift(psm_list_run_df) + psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( + lambda x: x["ccs_observed"] + shift_factor, axis=1 + ) + # predict CCS values + tf_ds = to_tf_dataset_inference( + psm_list_run_df["mz"], + psm_list_run_df["charge"], + psm_list_run_df["sequence-tokenized"], + self.tokenizer, + ) - psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds) + psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds) - # calculate CCS features - ccs_features = self._calculate_features(psm_list_run_df) + # calculate CCS features + ccs_features = self._calculate_features(psm_list_run_df) - # add CCS features to PSMs - for psm in psm_list_run: - try: - psm["rescoring_features"].update(ccs_features[psm.spectrum_id]) - except KeyError: - psm["rescoring_features"].update({}) - current_run += 1 + # add CCS features to PSMs + for psm in psm_list_run: + try: + psm["rescoring_features"].update(ccs_features[psm.spectrum_id]) + except KeyError: + psm["rescoring_features"].update({}) + current_run += 1 def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]: """Get CCS features for PSMs.""" diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index b5b4a9eb..429e117f 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,6 +1,8 @@ """Entrypoint for MS²Rescore GUI.""" import multiprocessing +import os +import contextlib from ms2rescore.gui.app import app @@ -8,7 +10,9 @@ def main(): """Entrypoint for MS²Rescore GUI.""" multiprocessing.freeze_support() - app() + # Redirect stdout when running GUI (packaged app might not have console attached) + with contextlib.redirect_stdout(open(os.devnull, "w")): + app() if __name__ == "__main__": diff --git a/ms2rescore/gui/app.py b/ms2rescore/gui/app.py index 03e8dcfc..caafd537 100644 --- a/ms2rescore/gui/app.py +++ b/ms2rescore/gui/app.py @@ -10,6 +10,7 @@ from typing import Dict, List, Tuple import customtkinter as ctk +from joblib import parallel_backend from ms2pip.constants import MODELS as ms2pip_models from PIL import Image from psm_utils.io import FILETYPES @@ -41,6 +42,9 @@ ctk.set_default_color_theme(_THEME_FILE) +# TODO Does this disable multiprocessing everywhere? 
+parallel_backend("threading") + class SideBar(ctk.CTkFrame): def __init__(self, *args, **kwargs): @@ -161,18 +165,11 @@ def get(self): main_config = self.main_config.get() advanced_config = self.advanced_config.get() - # TODO Move to rescoring engine config - percolator_config = {"init-weights": advanced_config.pop("weightsfile")} - config = {"ms2rescore": main_config} config["ms2rescore"].update(advanced_config) config["ms2rescore"]["feature_generators"] = self.fgen_config.get() config["ms2rescore"]["rescoring_engine"] = self.rescoring_engine_config.get() - # TODO See above - if "percolator" in config["ms2rescore"]["rescoring_engine"]: - config["ms2rescore"]["rescoring_engine"]["percolator"] = percolator_config - args = (config,) # Comma required to wrap in tuple kwargs = {} @@ -245,7 +242,7 @@ def _parse_modification_mapping(table_output): for mod in table_output: if mod[0] and mod[1]: modification_map[mod[0].strip()] = mod[1].strip() - return modification_map + return modification_map or None @staticmethod def _parse_fixed_modifications(table_output): @@ -255,7 +252,7 @@ def _parse_fixed_modifications(table_output): if mod[0] and mod[1]: amino_acids = [aa.upper() for aa in mod[1].strip().split(",")] fixed_modifications[mod[0]] = amino_acids - return fixed_modifications + return fixed_modifications or None class PSMFileConfigFrame(ctk.CTkFrame): @@ -267,7 +264,7 @@ def __init__(self, *args, **kwargs): self.grid_columnconfigure(0, weight=1) self.psm_file = widgets.LabeledFileSelect( - self, label="Select identification file", file_option="openfile" + self, label="Select identification file", file_option="openfiles" ) self.psm_file.grid(row=0, column=0, pady=0, padx=(0, 5), sticky="nsew") @@ -281,8 +278,14 @@ def __init__(self, *args, **kwargs): def get(self) -> Dict: """Get the configured values as a dictionary.""" + try: + # there cannot be spaces in the file path + # TODO: Fix this in widgets.LabeledFileSelect + psm_files = self.psm_file.get().split(" ") + except AttributeError: + raise MS2RescoreConfigurationError("No PSM file provided. Please select a file.") return { - "psm_file": self.psm_file.get(), + "psm_file": psm_files, "psm_file_type": self.psm_file_type.get(), } @@ -315,11 +318,6 @@ def __init__(self, *args, **kwargs): self.spectrum_id_pattern = widgets.LabeledEntry(self, label="Spectrum ID regex pattern") self.spectrum_id_pattern.grid(row=5, column=0, pady=(0, 10), sticky="nsew") - self.weightsfile = widgets.LabeledFileSelect( - self, label="Pretrained Percolator weights", file_option="openfile" - ) - self.weightsfile.grid(row=6, column=0, columnspan=2, sticky="nsew") - self.file_prefix = widgets.LabeledFileSelect( self, label="Filename for output files", file_option="savefile" ) @@ -338,7 +336,6 @@ def get(self) -> Dict: "id_decoy_pattern": self.id_decoy_pattern.get(), "psm_id_pattern": self.psm_id_pattern.get(), "spectrum_id_pattern": self.spectrum_id_pattern.get(), - "weightsfile": self.weightsfile.get(), "output_path": self.file_prefix.get(), "config_file": self.config_file.get(), "write_report": self.generate_report.get(), @@ -458,12 +455,20 @@ def __init__(self, *args, **kwargs): self.transfer_learning = widgets.LabeledSwitch(self, label="Use transfer learning") self.transfer_learning.grid(row=2, column=0, pady=(0, 10), sticky="nsew") + self.num_epochs = widgets.LabeledFloatSpinbox( + self, + label="Number of transfer learning epochs", + step_size=5, + initial_value=20, + ) # way to remove float in spinbox label? 
+        self.num_epochs.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
+
         self.calibration_set_size = widgets.LabeledEntry(
             self,
             label="Set calibration set size (fraction or number of PSMs)",
             placeholder_text="0.15",
         )
-        self.calibration_set_size.grid(row=3, column=0, pady=(0, 10), sticky="nsew")
+        self.calibration_set_size.grid(row=4, column=0, pady=(0, 10), sticky="nsew")
@@ -482,6 +487,7 @@ def get(self) -> Dict:
         enabled = self.enabled.get()
         config = {
             "deeplc_retrain": self.transfer_learning.get(),
+            "n_epochs": int(self.num_epochs.get()),
             "calibration_set_size": calibration_set_size,
         }
         return enabled, config
@@ -498,7 +504,7 @@ def __init__(self, *args, **kwargs):
         self.title = widgets.Heading(self, text="Ionmob")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")

-        self.enabled = widgets.LabeledSwitch(self, label="Enable Ionmob", default=True)
+        self.enabled = widgets.LabeledSwitch(self, label="Enable Ionmob", default=False)
         self.enabled.grid(row=1, column=0, pady=(0, 10), sticky="nsew")

         self.model = widgets.LabeledEntry(
@@ -554,27 +560,49 @@ def __init__(self, *args, **kwargs):
         self.configure(fg_color="transparent")
         self.grid_columnconfigure(0, weight=1)

-        self.title = widgets.Heading(self, text="Mokapot cofiguration")
+        self.title = widgets.Heading(self, text="Mokapot configuration")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")

-        self.write_weights = widgets.LabeledSwitch(self, label="Write weightsfile", default=True)
+        self.write_weights = widgets.LabeledSwitch(
+            self, label="Write model weights to file", default=True
+        )
         self.write_weights.grid(row=1, column=0, pady=(0, 10), sticky="nsew")

-        self.write_txt = widgets.LabeledSwitch(self, label="Write txt output file", default=True)
+        self.write_txt = widgets.LabeledSwitch(self, label="Write TXT output files", default=True)
         self.write_txt.grid(row=2, column=0, pady=(0, 10), sticky="nsew")

-        self.write_flashlfq = widgets.LabeledSwitch(self, label="Write flashlfq", default=False)
+        self.write_flashlfq = widgets.LabeledSwitch(
+            self, label="Write file for FlashLFQ", default=False
+        )
         self.write_flashlfq.grid(row=3, column=0, pady=(0, 10), sticky="nsew")

+        self.protein_kwargs = widgets.TableInput(
+            self,
+            label="`mokapot.read_fasta` options (see Mokapot documentation)",
+            columns=2,
+            header_labels=["Parameter", "Value"],
+        )
+        self.protein_kwargs.grid(row=4, column=0, sticky="nsew")
+
     def get(self) -> Dict:
         """Return the configuration as a dictionary."""
         config = {
             "write_weights": self.write_weights.get(),
             "write_txt": self.write_txt.get(),
             "write_flashlfq": self.write_flashlfq.get(),
+            "protein_kwargs": self._parse_protein_kwargs(self.protein_kwargs.get()),
         }
         return config

+    @staticmethod
+    def _parse_protein_kwargs(table_output):
+        """Parse protein kwargs from table input."""
+        protein_kwargs = {}
+        for mod in table_output:
+            if mod[0] and mod[1]:
+                protein_kwargs[mod[0].strip()] = mod[1].strip()
+        return protein_kwargs
+

 class PercolatorRescoringConfiguration(ctk.CTkFrame):
     def __init__(self, *args, **kwargs):
@@ -584,19 +612,24 @@ def __init__(self, *args, **kwargs):
         self.configure(fg_color="transparent")
         self.grid_columnconfigure(0, weight=1)

-        self.title = widgets.Heading(self, text="Percolator cofiguration")
+        self.title = widgets.Heading(self, text="Percolator configuration")
         self.title.grid(row=0, column=0, columnspan=2, pady=(0, 5), sticky="ew")

+        self.weights_file = widgets.LabeledFileSelect(
+            self,
label="Pretrained Percolator model weights", file_option="openfile" + ) + self.weights_file.grid(row=1, column=0, columnspan=2, sticky="nsew") + def get(self) -> Dict: """Return the configuration as a dictionary.""" - config = {} + config = {"init-weights": self.weights_file.get()} return config def function(config): """Function to be executed in a separate process.""" config = config.copy() - config = parse_configurations(config) + config = parse_configurations([config["ms2rescore"]["config_file"], config]) rescore(configuration=config) @@ -608,8 +641,8 @@ def app(): function=function, ) root.protocol("WM_DELETE_WINDOW", sys.exit) - root.geometry(f"{1250}x{700}") - root.minsize(1000, 700) + dpi = root.winfo_fpixels("1i") + root.geometry(f"{int(15*dpi)}x{int(10*dpi)}") root.title("MS²Rescore") root.wm_iconbitmap(os.path.join(str(_IMG_DIR), "program_icon.ico")) diff --git a/ms2rescore/gui/function2ctk.py b/ms2rescore/gui/function2ctk.py index 9ffe38bd..60bad120 100644 --- a/ms2rescore/gui/function2ctk.py +++ b/ms2rescore/gui/function2ctk.py @@ -10,7 +10,6 @@ import customtkinter as ctk - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -51,13 +50,9 @@ def __init__( self.function = function - # # App config - self.geometry(f"{1250}x{700}") - self.minsize(1000, 700) - # 2x3 grid, only logging column expands with window - self.grid_columnconfigure(0, weight=0, minsize=500) # Left: Sidebar - self.grid_columnconfigure(1, weight=0, minsize=1000) # Middle: Configuration + self.grid_columnconfigure(0, weight=0) # Left: Sidebar + self.grid_columnconfigure(1, weight=2) # Middle: Configuration self.grid_columnconfigure(2, weight=1) # Right: Logging self.grid_rowconfigure(0, weight=1) diff --git a/ms2rescore/gui/widgets.py b/ms2rescore/gui/widgets.py index 06bcce9a..ca3d03f1 100644 --- a/ms2rescore/gui/widgets.py +++ b/ms2rescore/gui/widgets.py @@ -268,6 +268,9 @@ def __init__(self, *args, label="Select file", file_option="openfile", **kwargs) elif file_option == "openfile": self._button_1 = ctk.CTkButton(self, text="Browse files", command=self._pick_file) + elif file_option == "openfiles": + self._button_1 = ctk.CTkButton(self, text="Browse files", command=self._pick_files) + elif file_option == "file/dir": self._button_1 = ctk.CTkButton(self, text="Browse files", command=self._pick_file) self._button_2 = ctk.CTkButton(self, text="Browse directories", command=self._pick_dir) @@ -296,6 +299,10 @@ def _pick_file(self): self._selected_filename = tk.filedialog.askopenfilename() self._update_entry() + def _pick_files(self): + self._selected_filename = tk.filedialog.askopenfilenames() + self._update_entry() + def _pick_dir(self): self._selected_filename = tk.filedialog.askdirectory() self._update_entry() diff --git a/ms2rescore/package_data/config_schema.json b/ms2rescore/package_data/config_schema.json index a215c2a3..a97c9a59 100644 --- a/ms2rescore/package_data/config_schema.json +++ b/ms2rescore/package_data/config_schema.json @@ -65,7 +65,7 @@ }, "psm_file": { "description": "Path to file with peptide-spectrum matches.", - "oneOf": [{ "type": "string" }, { "type": "null" }] + "oneOf": [{ "type": "string" }, { "type": "null" }, { "type": "array", "items": { "type": "string" } }] }, "psm_file_type": { "description": "PSM file type. 
By default inferred from file extension.", diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py deleted file mode 100644 index abc422e0..00000000 --- a/ms2rescore/parse_mgf.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Parse MGF files.""" - -import logging -import mmap -import os.path -from typing import Union, Tuple, Dict - -from rich.progress import track -from pyteomics.mgf import MGF - -from ms2rescore.exceptions import MS2RescoreError - -logger = logging.getLogger(__name__) - - -class ParseMGFError(MS2RescoreError): - """Error parsing MGF file.""" - - pass - - -def parse_mgf_title_rt(path_to_mgf: Union[str, os.PathLike]) -> Dict[str, float]: - """Parse MGF file to extract title and retention time fields, by spectrum index.""" - logger.debug("Parsing MGF file to extract retention times.") - mgf_reader = MGF(path_to_mgf, read_charges=False, read_ions=False) - retention_times = {} - for spectrum in mgf_reader: - try: - title = spectrum["params"]["title"] - except KeyError: - raise ParseMGFError("MGF file missing title field.") - try: - rt = float(spectrum["params"]["rtinseconds"]) - except KeyError: - rt = None - retention_times[title] = rt - - if any(list(retention_times.values())): - return retention_times - else: - raise ParseMGFError("MGF file missing rtinseconds field.") - - -def get_num_lines(file_path): - fp = open(file_path, "r+") - buf = mmap.mmap(fp.fileno(), 0) - lines = 0 - while buf.readline(): - lines += 1 - return lines diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py index 3eb3d19f..e116c32f 100644 --- a/ms2rescore/parse_psms.py +++ b/ms2rescore/parse_psms.py @@ -1,16 +1,17 @@ import logging import re +from itertools import chain from typing import Dict, Union import psm_utils.io from psm_utils import PSMList -from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError +from ms2rescore.exceptions import MS2RescoreConfigurationError logger = logging.getLogger(__name__) -def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList: +def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList: """ Parse PSMs and prepare for rescoring. @@ -21,8 +22,6 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s top-level key). psm_list PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``. - output_file_root - Path to output file root (without file extension). """ # Read PSMs, find decoys, calculate q-values @@ -60,24 +59,36 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: s def _read_psms(config, psm_list): - logger.info("Reading PSMs...") if isinstance(psm_list, PSMList): return psm_list else: - try: - return psm_utils.io.read_file( - config["psm_file"], - filetype=config["psm_file_type"], - show_progressbar=True, - **config["psm_reader_kwargs"], - ) - except psm_utils.io.PSMUtilsIOException: - raise MS2RescoreConfigurationError( - "Error occurred while reading PSMs. Please check the `psm_file` and " - "`psm_file_type` settings. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" - " for more information." + logger.info("Reading PSMs from file...") + current_file = 1 + total_files = len(config["psm_file"]) + psm_list_list = [] + for psm_file in config["psm_file"]: + logger.info( + f"Reading PSMs from PSM file ({current_file}/{total_files}): `{psm_file}`..." 
) + try: + id_file_psm_list = psm_utils.io.read_file( + psm_file, + filetype=config["psm_file_type"], + show_progressbar=True, + **config["psm_reader_kwargs"], + ) + except psm_utils.io.PSMUtilsIOException: + raise MS2RescoreConfigurationError( + "Error occurred while reading PSMs. Please check the `psm_file` and " + "`psm_file_type` settings. See " + "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" + " for more information." + ) + + psm_list_list.append(id_file_psm_list) + current_file += 1 + + return PSMList(psm_list=list(chain.from_iterable(p.psm_list for p in psm_list_list))) def _find_decoys(config, psm_list): @@ -113,7 +124,7 @@ def _match_psm_ids(old_id, regex_pattern): try: return match[1] except (TypeError, IndexError): - raise MS2RescoreError( + raise MS2RescoreConfigurationError( "`psm_id_pattern` could not be matched to all PSM spectrum IDs." " Ensure that the regex contains a capturing group?" ) diff --git a/ms2rescore/parse_spectra.py b/ms2rescore/parse_spectra.py new file mode 100644 index 00000000..9ed199b9 --- /dev/null +++ b/ms2rescore/parse_spectra.py @@ -0,0 +1,137 @@ +"""Parse MGF files.""" + +import logging +import re +from itertools import chain +from typing import Dict, Tuple + +from psm_utils import PSMList +from pyteomics.mgf import MGF +from pyteomics.mzml import MzML +from rich.progress import track + +from ms2rescore.exceptions import MS2RescoreError +from ms2rescore.utils import infer_spectrum_path + +logger = logging.getLogger(__name__) + + +def get_missing_values(config, psm_list, missing_rt=False, missing_im=False): + """Get missing RT/IM features from spectrum file.""" + logger.debug("Extracting missing RT/IM values from spectrum file(s).") + + psm_dict = psm_list.get_psm_dict() + for runs in psm_dict.values(): + for run, psms in track(runs.items(), description="Extracting RT/IM values..."): + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + spectrum_file = infer_spectrum_path(config["spectrum_path"], run) + + if spectrum_file.suffix.lower() == ".mzml": + rt_dict, im_dict = _parse_values_from_mzml( + spectrum_file, config, run, missing_rt, missing_im + ) + elif spectrum_file.suffix.lower() == ".mgf": + rt_dict, im_dict = _parse_values_from_mgf( + spectrum_file, config, run, missing_rt, missing_im + ) + + for value_dict, value in zip([rt_dict, im_dict], ["retention_time", "ion_mobility"]): + if value_dict: + try: + psm_list_run[value] = [value_dict[psm.spectrum_id] for psm in psm_list_run] + except KeyError: + raise ParsingError( + f"Could not parse {value} values from spectrum file for run {run}." + ) + + +def _parse_values_from_mgf( + spectrum_file, config, run, missing_rt, missing_im +) -> Tuple[Dict, Dict]: + """ + Parse retention time and/or ion mobility from an MGF file. + + Notes + ----- + - Extracting values (e.g., ion mobility) according to the Matrix documentation: + http://www.matrixscience.com/help/data_file_help.html + + """ + rt_dict = {} + im_dict = {} + + spectrum_id_pattern = re.compile( + config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" + ) + + for spectrum in MGF(str(spectrum_file)): + matched_id = spectrum_id_pattern.match(spectrum["params"]["title"]).group() + if missing_rt: + try: + rt_dict[matched_id] = float(spectrum["params"]["rtinseconds"]) + except KeyError: + raise ParsingError( + "Could not parse retention time (`rtinseconds`) from spectrum file for " + f"run {run}. 
Please make sure that the retention time key is present in the " + "spectrum file or disable the relevant feature generator." + ) + if missing_im: + try: + im_dict[matched_id] = float(spectrum["params"]["ion_mobility"]) + except KeyError: + raise ParsingError( + "Could not parse ion mobility (`ion_mobility`) from spectrum file " + f"for run {run}. Please make sure that the ion mobility key is present in the " + "spectrum file or disable the relevant feature generator." + ) + + return rt_dict, im_dict + + +def _parse_values_from_mzml( + spectrum_file, config, run, missing_rt, missing_im +) -> Tuple[Dict, Dict]: + """Parse retention time and/or ion mobility from an mzML file.""" + rt_dict = {} + im_dict = {} + + spectrum_id_pattern = re.compile( + config["spectrum_id_pattern"] if config["spectrum_id_pattern"] else r"(.*)" + ) + + for spectrum in MzML(str(spectrum_file)): + matched_id = spectrum_id_pattern.match(spectrum["id"]).group() + if missing_rt: + try: + rt_dict[matched_id] = float(spectrum["scanList"]["scan"][0]["scan start time"]) + except KeyError: + raise ParsingError( + "Could not parse retention time (`scan start time`) from spectrum file for " + f"run {run}. Please make sure that the retention time key is present in the " + "spectrum file or disable the relevant feature generator." + ) + if missing_im: + try: + im_dict[matched_id] = float( + spectrum["scanList"]["scan"][0]["reverse ion mobility"] + ) + except KeyError: + raise ParsingError( + "Could not parse ion mobility (`reverse ion mobility`) from spectrum file " + f"for run {run}. Please make sure that the ion mobility key is present in the " + "spectrum file or disable the relevant feature generator." + ) + + return rt_dict, im_dict + + +class ParseMGFError(MS2RescoreError): + """Error parsing MGF file.""" + + pass + + +class ParsingError(MS2RescoreError): + """Error parsing retention time from spectrum file.""" + + pass diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py index 64ab0e4b..090db873 100644 --- a/ms2rescore/report/generate.py +++ b/ms2rescore/report/generate.py @@ -96,7 +96,9 @@ def generate_report( "metadata": { "generated_on": datetime.now().strftime("%d/%m/%Y %H:%M:%S"), "ms2rescore_version": ms2rescore.__version__, # TODO: Write during run? - "psm_filename": Path(config["ms2rescore"]["psm_file"]).name, + "psm_filename": "\n".join( + [Path(id_file).name for id_file in config["ms2rescore"]["psm_file"]] + ), }, "main_tabs": [ { diff --git a/ms2rescore/rescoring_engines/mokapot.py b/ms2rescore/rescoring_engines/mokapot.py index 65a3d038..f3927f47 100644 --- a/ms2rescore/rescoring_engines/mokapot.py +++ b/ms2rescore/rescoring_engines/mokapot.py @@ -20,7 +20,7 @@ """ import logging -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional, Tuple, Dict import mokapot import numpy as np @@ -40,6 +40,7 @@ def rescore( write_weights: bool = False, write_txt: bool = False, write_flashlfq: bool = False, + protein_kwargs: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> None: """ @@ -68,6 +69,9 @@ def rescore( Write Mokapot results to a text file. Defaults to ``False``. write_flashlfq Write Mokapot results to a FlashLFQ-compatible file. Defaults to ``False``. + protein_kwargs + Keyword arguments to pass to the :py:meth:`~mokapot.dataset.LinearPsmDataset.add_proteins` + method. **kwargs Additional keyword arguments are passed to the Mokapot :py:func:`~mokapot.brew` function. 
@@ -80,11 +84,11 @@ def rescore( # Add proteins if fasta_file: - proteins = mokapot.read_fasta(fasta_file) - lin_psm_data.add_proteins(proteins) + logger.debug(f"Adding protein info from {fasta_file} with options: `{protein_kwargs}`") + lin_psm_data.add_proteins(fasta_file, **protein_kwargs) # Rescore - logger.debug(f"Mokapot keyword arguments : {kwargs}") + logger.debug(f"Mokapot brew options: `{kwargs}`") confidence_results, models = brew(lin_psm_data, **kwargs) # Reshape confidence estimates to match PSMList @@ -120,6 +124,8 @@ def rescore( if write_txt: confidence_results.to_txt(file_root=output_file_root, decoys=True) if write_flashlfq: + # TODO: How do we validate that the RTs are in minutes? + confidence_results.psms["retention_time"] = confidence_results.psms["retention_time"] * 60 confidence_results.to_flashlfq(output_file_root + ".mokapot.flashlfq.txt") diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py index 26c88a7e..70417b56 100644 --- a/ms2rescore/utils.py +++ b/ms2rescore/utils.py @@ -75,4 +75,4 @@ def infer_spectrum_path( "files." ) - return resolved_path + return Path(resolved_path) diff --git a/ms2rescore_innosetup.iss b/ms2rescore_innosetup.iss index 8bbd2aaa..8e536444 100644 --- a/ms2rescore_innosetup.iss +++ b/ms2rescore_innosetup.iss @@ -2,6 +2,7 @@ #define MyAppPublisher "CompOmics" #define MyAppURL "https://github.com/compomics/ms2rescore" #define MyAppExeName "ms2rescore.exe" +#define OutputFilename "{#MyAppName}-{#MyAppVersion}-Windows64bit" [Setup] AppId={{2D3D12BD-3AE2-426E-8DE8-092148C12071} @@ -17,7 +18,7 @@ LicenseFile=.\LICENSE PrivilegesRequired=lowest PrivilegesRequiredOverridesAllowed=dialog OutputDir="dist" -OutputBaseFilename="{#MyAppName}-{#MyAppVersion}-Windows64bit" +OutputBaseFilename={#OutputFilename} Compression=lzma SolidCompression=yes WizardStyle=modern
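As a usage note on the new `protein_kwargs` pass-through above: options collected from the GUI table (or set in the configuration) end up as keyword arguments to `LinearPsmDataset.add_proteins`, i.e. to `mokapot.read_fasta`. A minimal sketch follows, not part of this diff: it assumes the PSM list already carries `rescoring_features` (e.g., produced by the feature generators above), the file paths are placeholders, and `decoy_prefix` is only an example `read_fasta` option.

```python
import psm_utils.io
from ms2rescore.rescoring_engines import mokapot

# Placeholder input; any psm_utils-supported identification file works here.
# The PSMs are assumed to already contain rescoring features.
psm_list = psm_utils.io.read_file("results.sage.tsv", filetype="infer")

mokapot.rescore(
    psm_list,
    output_file_root="results/experiment_1",
    fasta_file="proteome.fasta",
    # Forwarded to LinearPsmDataset.add_proteins / mokapot.read_fasta,
    # e.g. to declare the decoy prefix used in the FASTA file.
    protein_kwargs={"decoy_prefix": "rev_"},
    write_txt=True,
)
```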