Merge pull request #97 from compomics/multiple-id-input-files
Multiple id input files
RalfG authored Oct 12, 2023
2 parents 4832a91 + dee9349 commit 40e7ad3
Showing 20 changed files with 383 additions and 202 deletions.
29 changes: 28 additions & 1 deletion .github/workflows/test.yml
@@ -7,7 +7,7 @@ on:
pull_request:

jobs:
test:
test-python-package:
runs-on: ubuntu-latest
strategy:
matrix:
@@ -42,3 +42,30 @@ jobs:
- name: Test installation
run: |
ms2rescore --help
test-windows-installer:
# Only run on push to main (e.g., after PR merge)
if: ${{ github.ref == 'refs/heads/main' }}
runs-on: windows-latest
steps:
- uses: actions/checkout@v2

- uses: actions/setup-python@v2
with:
python-version: "3.11"

- name: Install package and dependencies
run: |
python -m pip install --upgrade pip
pip install . pyinstaller
- name: Install Inno Setup
uses: crazy-max/ghaction-chocolatey@v1
with:
args: install innosetup -y --allow-unofficial --force

- name: Run pyinstaller
run: pyinstaller ./ms2rescore.spec --clean --noconfirm

- name: Test built exe
run: dist/ms2rescore/ms2rescore.exe
6 changes: 4 additions & 2 deletions README.md
@@ -30,8 +30,10 @@ MS²Rescore can read peptide identifications in any format supported by [psm_uti
files:

- [MS Amanda](http://ms.imp.ac.at/?goto=msamanda) `.csv`
- [Sage](https://github.com/lazear/sage) `.sage.tsv`
- [PeptideShaker](https://compomics.github.io/projects/peptide-shaker.html) `.mzid`
- [MSGFPlus](https://omics.pnl.gov/software/ms-gf) `.mzid`
- [Mascot](https://www.matrixscience.com/) `.mzid`
- [MaxQuant](https://www.maxquant.org/) `msms.txt`
- [X!Tandem](https://www.thegpm.org/tandem/) `.xml`
- [PEAKS](https://www.bioinfor.com/peaksdb/) `.mzid`
@@ -45,13 +47,13 @@ MS²Rescore is available as a [desktop application][desktop], a [command line to

> **MS2Rescore: Data-driven rescoring dramatically boosts immunopeptide identification rates.**
> Arthur Declercq, Robbin Bouwmeester, Aurélie Hirschler, Christine Carapito, Sven Degroeve, Lennart Martens, and Ralf Gabriels.
> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) > <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
> _Molecular & Cellular Proteomics_ (2021) [doi:10.1016/j.mcpro.2022.100266](https://doi.org/10.1016/j.mcpro.2022.100266) <span class="__dimensions_badge_embed__" data-doi="10.1016/j.mcpro.2022.100266" data-hide-zero-citations="true" data-style="small_rectangle"></span>
**Original publication describing the concept of rescoring with predicted spectra:**

> **Accurate peptide fragmentation predictions allow data driven approaches to replace and improve upon proteomics search engine scoring functions.**
> Ana S C Silva, Robbin Bouwmeester, Lennart Martens, and Sven Degroeve.
> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) > <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
> _Bioinformatics_ (2019) [doi:10.1093/bioinformatics/btz383](https://doi.org/10.1093/bioinformatics/btz383) <span class="__dimensions_badge_embed__" data-doi="10.1093/bioinformatics/btz383" data-hide-zero-citations="true" data-style="small_rectangle"></span>
To replicate the experiments described in this article, check out the
[publication branch][publication-branch] of the repository.
2 changes: 2 additions & 0 deletions docs/source/config_schema.md
@@ -22,6 +22,8 @@
- **One of**
- *string*
- *null*
- *array*
- **Items** *(string)*
- **`psm_file_type`** *(string)*: PSM file type. By default inferred from file extension. Default: `"infer"`.
- **`psm_reader_kwargs`** *(object)*: Keyword arguments passed to the PSM reader. Default: `{}`.
- **`spectrum_path`**: Path to spectrum file or directory with spectrum files.
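The schema now allows `psm_file` to be a single string, null, or an array of strings. A minimal sketch of a configuration dict that the updated schema would accept, with hypothetical file names:

config = {
    "ms2rescore": {
        "psm_file": ["sample_1.mzid", "sample_2.mzid"],  # multiple PSM files (hypothetical names)
        "psm_file_type": "infer",  # inferred from the file extension
    }
}
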
1 change: 1 addition & 0 deletions ms2rescore/__main__.py
@@ -62,6 +62,7 @@ def _argument_parser() -> argparse.ArgumentParser:
metavar="FILE",
action="store",
type=str,
nargs="*",
dest="psm_file",
help="path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML...)",
)
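With `nargs="*"`, argparse collects any number of PSM file paths into a single list on `args.psm_file`. A self-contained sketch (the option string here is illustrative, not necessarily the exact CLI flag):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--psm-file",  # illustrative option string; see the actual parser for the real flags
    metavar="FILE",
    action="store",
    type=str,
    nargs="*",
    dest="psm_file",
)
args = parser.parse_args(["--psm-file", "run_1.mzid", "run_2.mzid"])
print(args.psm_file)  # ['run_1.mzid', 'run_2.mzid']
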
21 changes: 14 additions & 7 deletions ms2rescore/config_parser.py
@@ -44,11 +44,18 @@ def _validate_filenames(config: Dict) -> Dict:
if not config["ms2rescore"]["psm_file"]:
raise MS2RescoreConfigurationError("PSM file should be provided.")

# psm_file should exist
id_file = Path(config["ms2rescore"]["psm_file"])
if not id_file.is_file():
raise FileNotFoundError(id_file)
config["ms2rescore"]["psm_file"] = id_file.as_posix()
    # If psm_file is a string, turn it into a list; otherwise leave it as is
if isinstance(config["ms2rescore"]["psm_file"], str):
config["ms2rescore"]["psm_file"] = [config["ms2rescore"]["psm_file"]]

# all provided psm_file(s) should exist
psm_files = []
for psm_file in config["ms2rescore"]["psm_file"]:
id_file = Path(psm_file)
if not id_file.is_file():
raise FileNotFoundError(id_file)
psm_files.append(id_file.as_posix())
config["ms2rescore"]["psm_file"] = psm_files

# spectrum_path should either be None, or existing path to file or dir
if config["ms2rescore"]["spectrum_path"]:
Expand All @@ -59,10 +66,10 @@ def _validate_filenames(config: Dict) -> Dict:

# Parse output_path
config["ms2rescore"]["output_path"] = _parse_output_path(
config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"]
config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
)

# Parse config_file as posix path
# Parse config_file as posix path to avoid combination of forward and backward slashes
if config["ms2rescore"]["config_file"]:
config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()

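A standalone sketch of the new validation logic: a single path string is wrapped in a list, and every entry must point to an existing file. The helper name is hypothetical; the real code works directly on the nested configuration dict:

from pathlib import Path
from typing import List, Union


def normalize_psm_files(psm_file: Union[str, List[str]]) -> List[str]:
    # Wrap a single string in a list so downstream code always receives a list
    if isinstance(psm_file, str):
        psm_file = [psm_file]
    validated = []
    for entry in psm_file:
        path = Path(entry)
        if not path.is_file():
            raise FileNotFoundError(path)
        validated.append(path.as_posix())
    return validated
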
18 changes: 16 additions & 2 deletions ms2rescore/core.py
@@ -8,6 +8,7 @@

from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.parse_spectra import get_missing_values
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator

@@ -26,7 +27,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
"""
config = configuration["ms2rescore"] # TODO: Remove top-level key?
config = configuration["ms2rescore"]
output_file_root = config["output_path"]

# Write full configuration including defaults to file
@@ -36,7 +37,7 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
logger.debug("Using %i of %i available CPUs.", int(config["processes"]), int(cpu_count()))

# Parse PSMs
psm_list = parse_psms(config, psm_list, output_file_root)
psm_list = parse_psms(config, psm_list)

# Log #PSMs identified before rescoring
id_psms_before = _log_id_psms_before(psm_list)
@@ -53,6 +54,13 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)

# TODO: avoid hard coding feature generators in some way
rt_required = "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
im_required = "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
if rt_required or im_required:
logger.info("Parsing missing retention time and/or ion mobility values from spectra...")
get_missing_values(config, psm_list, missing_rt=rt_required, missing_im=im_required)

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally?
@@ -118,9 +126,15 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
elif "mokapot" in config["rescoring_engine"]:
if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
else:
protein_kwargs = dict()

mokapot.rescore(
psm_list,
output_file_root=output_file_root,
protein_kwargs=protein_kwargs,
**config["rescoring_engine"]["mokapot"],
)
else:
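The gating above means spectrum files are only re-read when a feature generator that needs the value is enabled and at least one PSM lacks it. A simplified illustration with stand-in data:

feature_generators = {"deeplc": {}, "ms2pip": {}}  # stand-in for config["feature_generators"]
retention_times = [12.3, None, 45.6]               # None marks a PSM without a retention time
ion_mobilities = [0.85, 0.91, 0.88]                # all ion mobility values present

rt_required = "deeplc" in feature_generators and None in retention_times
im_required = "ionmob" in feature_generators and None in ion_mobilities
print(rt_required, im_required)  # True False: only retention times would be parsed from spectra
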
33 changes: 9 additions & 24 deletions ms2rescore/feature_generators/deeplc.py
@@ -28,10 +28,7 @@
from psm_utils import PSMList
from psm_utils.io import peptide_record

from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.parse_mgf import parse_mgf_title_rt
from ms2rescore.utils import infer_spectrum_path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
logger = logging.getLogger(__name__)
@@ -146,50 +143,37 @@ def add_features(self, psm_list: PSMList) -> None:
logger.info(
f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
# Prepare PSM file

# Disable wild logging to stdout by Tensorflow, unless in debug mode
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

if not all(psm_list["retention_time"]):
# Prepare spectrum filenames
spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
retention_time_dict = parse_mgf_title_rt(
spectrum_filename
) # TODO Add mzML support
try:
psm_list_run["retention_time"] = [
retention_time_dict[psm_id]
for psm_id in psm_list_run["spectrum_id"]
]
except KeyError:
raise MS2RescoreError(
"Could not map all spectrum ids to retention times"
)

logger.debug("Calibrating DeepLC...")
psm_list_calibration = self._get_calibration_psms(psm_list_run)

logger.debug("Calibrating DeepLC")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.user_model or self.selected_model,
path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)
self.deeplc_predictor.calibrate_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.user_model and not self.selected_model:
if not self.selected_model:
self.selected_model = list(self.deeplc_predictor.model.keys())
self.deeplc_kwargs["deeplc_retrain"] = False
logger.debug(
f"Selected DeepLC model {self.selected_model} based on "
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
)

logger.debug("Predicting retention times...")
predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
Expand All @@ -198,6 +182,7 @@ def add_features(self, psm_list: PSMList) -> None:
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)

logger.debug("Adding features to PSMs...")
for i, psm in enumerate(psm_list_run):
psm["rescoring_features"].update(
{
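A schematic sketch (not DeepLC's actual API) of the select-once-then-reuse pattern above: on the first run `selected_model` is still None, so model selection runs in full (or a user-supplied model is used); later runs only recalibrate the model chosen on the first run.

selected_model = None
user_model = None  # a user-supplied model path would be used instead on the first run

for run in ["run_1", "run_2", "run_3"]:
    path_model = selected_model or user_model  # None on the first run triggers full model selection
    # ... calibrate and predict with path_model here ...
    if not selected_model:
        selected_model = ["model_a.hdf5"]  # stand-in for the model settled on after run 1
    print(run, path_model)
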
103 changes: 50 additions & 53 deletions ms2rescore/feature_generators/ionmob.py
@@ -129,64 +129,61 @@ def add_features(self, psm_list: PSMList) -> None:
logger.info(
f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
psm_list_run_df = psm_list_run.to_dataframe()

# prepare data frames for CCS prediction
psm_list_run_df["charge"] = [
peptidoform.precursor_charge
for peptidoform in psm_list_run_df["peptidoform"]
]
psm_list_run_df = psm_list_run_df[
psm_list_run_df["charge"] < 5
] # predictions do not go higher for ionmob

psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
)
psm_list_run_df = psm_list_run_df[
psm_list_run_df.apply(
lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
axis=1,
)
]

psm_list_run_df["mz"] = psm_list_run_df.apply(
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
) # use precursor m/z from PSMs?

psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),

psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
psm_list_run_df = psm_list_run.to_dataframe()

# prepare data frames for CCS prediction
psm_list_run_df["charge"] = [
peptidoform.precursor_charge for peptidoform in psm_list_run_df["peptidoform"]
]
psm_list_run_df = psm_list_run_df[
psm_list_run_df["charge"] < 5
] # predictions do not go higher for ionmob

psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
)
psm_list_run_df = psm_list_run_df[
psm_list_run_df.apply(
lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
axis=1,
)
# calibrate CCS values
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: x["ccs_observed"] + shift_factor, axis=1
)
# predict CCS values
tf_ds = to_tf_dataset_inference(
psm_list_run_df["mz"],
psm_list_run_df["charge"],
psm_list_run_df["sequence-tokenized"],
self.tokenizer,
)
]

psm_list_run_df["mz"] = psm_list_run_df.apply(
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
) # use precursor m/z from PSMs?

psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
axis=1,
)
# calibrate CCS values
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: x["ccs_observed"] + shift_factor, axis=1
)
# predict CCS values
tf_ds = to_tf_dataset_inference(
psm_list_run_df["mz"],
psm_list_run_df["charge"],
psm_list_run_df["sequence-tokenized"],
self.tokenizer,
)

psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)
psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)

# calculate CCS features
ccs_features = self._calculate_features(psm_list_run_df)
# calculate CCS features
ccs_features = self._calculate_features(psm_list_run_df)

# add CCS features to PSMs
for psm in psm_list_run:
try:
psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
except KeyError:
psm["rescoring_features"].update({})
current_run += 1
# add CCS features to PSMs
for psm in psm_list_run:
try:
psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
except KeyError:
psm["rescoring_features"].update({})
current_run += 1

def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
"""Get CCS features for PSMs."""
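The `shift_factor` used above applies a constant-offset CCS calibration. A generic numpy sketch of the idea, with made-up values (this is not the ionmob implementation):

import numpy as np

ccs_observed = np.array([302.1, 345.7, 410.2])   # CCS derived from measured ion mobility
ccs_predicted = np.array([305.0, 349.1, 413.8])  # CCS predicted by the model
shift_factor = float(np.mean(ccs_predicted - ccs_observed))
ccs_calibrated = ccs_observed + shift_factor     # observed values shifted onto the predicted scale
print(round(shift_factor, 2))
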
6 changes: 5 additions & 1 deletion ms2rescore/gui/__main__.py
@@ -1,14 +1,18 @@
"""Entrypoint for MS²Rescore GUI."""

import multiprocessing
import os
import contextlib

from ms2rescore.gui.app import app


def main():
"""Entrypoint for MS²Rescore GUI."""
multiprocessing.freeze_support()
app()
# Redirect stdout when running GUI (packaged app might not have console attached)
with contextlib.redirect_stdout(open(os.devnull, "w")):
app()


if __name__ == "__main__":
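A minimal demonstration of the redirect used above: anything written to stdout inside the block is discarded, which avoids errors when a PyInstaller-built, windowed app has no console attached.

import contextlib
import os

with contextlib.redirect_stdout(open(os.devnull, "w")):
    print("this goes nowhere")
print("stdout is restored here")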