compomics · RalfG · Oct 12, 2023 · Sep 25, 2023 · Sep 25, 2023 · Sep 26, 2023
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -7,7 +7,7 @@ on:
   pull_request:
 
 jobs:
-  test:
+  test-python-package:
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -42,3 +42,30 @@ jobs:
       - name: Test installation
         run: |
           ms2rescore --help
+
+  test-windows-installer:  # Builds the PyInstaller bundle on a Windows runner and launches the resulting exe
+    # Only run when the workflow ref is the main branch (e.g., a push to main after a PR merge)
+    if: ${{ github.ref == 'refs/heads/main' }}
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v2  # Check out the repository; the package is installed from this checkout below
+
+      - uses: actions/setup-python@v2
+        with:
+          python-version: "3.11"
+
+      - name: Install package and dependencies  # Installs the package itself plus pyinstaller for the build step
+        run: |
+          python -m pip install --upgrade pip
+          pip install . pyinstaller
+
+      - name: Install Inno Setup  # NOTE(review): no later step in this job invokes Inno Setup - confirm it is still needed
+        uses: crazy-max/ghaction-chocolatey@v1
+        with:
+          args: install innosetup -y --allow-unofficial --force
+
+      - name: Run pyinstaller  # Builds from the ms2rescore.spec file in the repository root
+        run: pyinstaller ./ms2rescore.spec --clean --noconfirm
+
+      - name: Test built exe  # Runs the exe without arguments; NOTE(review): presumably a start-up smoke test - confirm
+        run: dist/ms2rescore/ms2rescore.exe
diff --git a/docs/source/config_schema.md b/docs/source/config_schema.md
@@ -22,6 +22,8 @@
     - **One of**
       - *string*
       - *null*
+      - *array*
+        - **Items** *(string)*
   - **`psm_file_type`** *(string)*: PSM file type. By default inferred from file extension. Default: `"infer"`.
   - **`psm_reader_kwargs`** *(object)*: Keyword arguments passed to the PSM reader. Default: `{}`.
   - **`spectrum_path`**: Path to spectrum file or directory with spectrum files.

diff --git a/ms2rescore/__main__.py b/ms2rescore/__main__.py
@@ -62,6 +62,7 @@ def _argument_parser() -> argparse.ArgumentParser:
         metavar="FILE",
         action="store",
         type=str,
+        nargs="*",
         dest="psm_file",
         help="path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML...)",
     )

diff --git a/ms2rescore/config_parser.py b/ms2rescore/config_parser.py
@@ -44,11 +44,18 @@ def _validate_filenames(config: Dict) -> Dict:
     if not config["ms2rescore"]["psm_file"]:
         raise MS2RescoreConfigurationError("PSM file should be provided.")
 
-    # psm_file should exist
-    id_file = Path(config["ms2rescore"]["psm_file"])
-    if not id_file.is_file():
-        raise FileNotFoundError(id_file)
-    config["ms2rescore"]["psm_file"] = id_file.as_posix()
+    # If psm_file is a single string, wrap it in a list; otherwise leave it as is
+    if isinstance(config["ms2rescore"]["psm_file"], str):
+        config["ms2rescore"]["psm_file"] = [config["ms2rescore"]["psm_file"]]
+
+    # all provided psm_file(s) should exist
+    psm_files = []
+    for psm_file in config["ms2rescore"]["psm_file"]:
+        id_file = Path(psm_file)
+        if not id_file.is_file():
+            raise FileNotFoundError(id_file)
+        psm_files.append(id_file.as_posix())
+    config["ms2rescore"]["psm_file"] = psm_files
 
     # spectrum_path should either be None, or existing path to file or dir
     if config["ms2rescore"]["spectrum_path"]:
@@ -59,10 +66,10 @@ def _validate_filenames(config: Dict) -> Dict:
 
     # Parse output_path
     config["ms2rescore"]["output_path"] = _parse_output_path(
-        config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"]
+        config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
     )
 
-    # Parse config_file as posix path
+    # Parse config_file as posix path  # TODO: Is this necessary?
     if config["ms2rescore"]["config_file"]:
         config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()
 

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
@@ -10,6 +10,7 @@
 from ms2rescore.parse_psms import parse_psms
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
+from ms2rescore.parse_spectra import get_missing_values
 
 logger = logging.getLogger(__name__)
 
@@ -53,6 +54,23 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
 
+    if ("deeplc" in config["feature_generators"] and None in psm_list["retention_time"]) or (
+        "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
+    ):
+        logger.warning(
+            "One or more PSMs are missing retention time and/or ion mobility values. These will be "
+            "parsed from the spectrum file."
+        )
+        get_missing_values(
+            config,
+            psm_list,
+            missing_rt_values=(
+                "deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
+            ),
+            missing_im_values=(
+                "ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
+            ),
+        )
     # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
         # TODO: Handle this somewhere else, more generally?
@@ -118,9 +136,15 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     elif "mokapot" in config["rescoring_engine"]:
         if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
             config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
+        if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
+            protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
+        else:
+            protein_kwargs = dict()
+
         mokapot.rescore(
             psm_list,
             output_file_root=output_file_root,
+            protein_kwargs=protein_kwargs,
             **config["rescoring_engine"]["mokapot"],
         )
     else:

diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py
@@ -30,7 +30,6 @@
 
 from ms2rescore.exceptions import MS2RescoreError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
-from ms2rescore.parse_mgf import parse_mgf_title_rt
 from ms2rescore.utils import infer_spectrum_path
 
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
@@ -146,78 +145,60 @@ def add_features(self, psm_list: PSMList) -> None:
                 logger.info(
                     f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
                 )
-                # Prepare PSM file
-                with contextlib.redirect_stdout(
-                    open(os.devnull, "w")
-                ) if not self._verbose else contextlib.nullcontext():
-                    psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-
-                    if not all(psm_list["retention_time"]):
-                        # Prepare spectrum filenames
-                        spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
-                        retention_time_dict = parse_mgf_title_rt(
-                            spectrum_filename
-                        )  # TODO Add mzML support
-                        try:
-                            psm_list_run["retention_time"] = [
-                                retention_time_dict[psm_id]
-                                for psm_id in psm_list_run["spectrum_id"]
-                            ]
-                        except KeyError:
-                            raise MS2RescoreError(
-                                "Could not map all spectrum ids to retention times"
-                            )
-
-                    psm_list_calibration = self._get_calibration_psms(psm_list_run)
-
-                    logger.debug("Calibrating DeepLC")
-                    self.deeplc_predictor = self.DeepLC(
-                        n_jobs=self.processes,
-                        verbose=self._verbose,
-                        path_model=self.user_model or self.selected_model,
-                        **self.deeplc_kwargs,
+
+                psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+
+                psm_list_calibration = self._get_calibration_psms(psm_list_run)
+
+                logger.debug("Calibrating DeepLC")
+                self.deeplc_predictor = self.DeepLC(
+                    n_jobs=self.processes,
+                    verbose=self._verbose,
+                    path_model=self.selected_model or self.user_model,
+                    **self.deeplc_kwargs,
+                )
+                self.deeplc_predictor.calibrate_preds(
+                    seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
+                )
+                # Still calibrate for each run, but do not try out all model options again;
+                # just reuse the model that was selected based on the first run.
+                if not self.selected_model:
+                    self.selected_model = list(self.deeplc_predictor.model.keys())
+                    self.deeplc_kwargs["deeplc_retrain"] = False
+                    logger.debug(
+                        f"Selected DeepLC model {self.selected_model} based on "
+                        "calibration of first run. Using this model (after new "
+                        "calibrations) for the remaining runs."
+                    )
+
+                predictions = np.array(
+                    self.deeplc_predictor.make_preds(
+                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
                     )
-                    self.deeplc_predictor.calibrate_preds(
-                        seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
+                )
+                observations = psm_list_run["retention_time"]
+                rt_diffs_run = np.abs(predictions - observations)
+
+                for i, psm in enumerate(psm_list_run):
+                    psm["rescoring_features"].update(
+                        {
+                            "observed_retention_time": observations[i],
+                            "predicted_retention_time": predictions[i],
+                            "rt_diff": rt_diffs_run[i],
+                        }
                     )
-                    # Still calibrate for each run, but do not try out all model options.
-                    # Just use model that was selected based on first run
-                    if not self.user_model and not self.selected_model:
-                        self.selected_model = list(self.deeplc_predictor.model.keys())
-                        logger.debug(
-                            f"Selected DeepLC model {self.selected_model} based on "
-                            "calibration of first run. Using this model (after new "
-                            "calibrations) for the remaining runs."
-                        )
-
-                    predictions = np.array(
-                        self.deeplc_predictor.make_preds(
-                            seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
-                        )
+                    peptide = psm.peptidoform.proforma.split("\\")[0]  # remove charge
+                    if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
+                        peptide_rt_diff_dict[peptide] = {
+                            "observed_retention_time_best": observations[i],
+                            "predicted_retention_time_best": predictions[i],
+                            "rt_diff_best": rt_diffs_run[i],
+                        }
+                for psm in psm_list_run:
+                    psm["rescoring_features"].update(
+                        peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
                     )
-                    observations = psm_list_run["retention_time"]
-                    rt_diffs_run = np.abs(predictions - observations)
-
-                    for i, psm in enumerate(psm_list_run):
-                        psm["rescoring_features"].update(
-                            {
-                                "observed_retention_time": observations[i],
-                                "predicted_retention_time": predictions[i],
-                                "rt_diff": rt_diffs_run[i],
-                            }
-                        )
-                        peptide = psm.peptidoform.proforma.split("\\")[0]  # remove charge
-                        if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
-                            peptide_rt_diff_dict[peptide] = {
-                                "observed_retention_time_best": observations[i],
-                                "predicted_retention_time_best": predictions[i],
-                                "rt_diff_best": rt_diffs_run[i],
-                            }
-                    for psm in psm_list_run:
-                        psm["rescoring_features"].update(
-                            peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
-                        )
-                current_run += 1
+            current_run += 1
 
     # TODO: Remove when DeepLC supports PSMList directly
     @staticmethod

diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py
@@ -129,64 +129,61 @@ def add_features(self, psm_list: PSMList) -> None:
                 logger.info(
                     f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..."
                 )
-                with contextlib.redirect_stdout(
-                    open(os.devnull, "w")
-                ) if not self._verbose else contextlib.nullcontext():
-                    psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
-                    psm_list_run_df = psm_list_run.to_dataframe()
-
-                    # prepare data frames for CCS prediction
-                    psm_list_run_df["charge"] = [
-                        peptidoform.precursor_charge
-                        for peptidoform in psm_list_run_df["peptidoform"]
-                    ]
-                    psm_list_run_df = psm_list_run_df[
-                        psm_list_run_df["charge"] < 5
-                    ]  # predictions do not go higher for ionmob
-
-                    psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
-                        lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
-                    )
-                    psm_list_run_df = psm_list_run_df[
-                        psm_list_run_df.apply(
-                            lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
-                            axis=1,
-                        )
-                    ]
-
-                    psm_list_run_df["mz"] = psm_list_run_df.apply(
-                        lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
-                    )  # use precursor m/z from PSMs?
-
-                    psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
-                        lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
+
+                psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
+                psm_list_run_df = psm_list_run.to_dataframe()
+
+                # prepare data frames for CCS prediction
+                psm_list_run_df["charge"] = [
+                    peptidoform.precursor_charge for peptidoform in psm_list_run_df["peptidoform"]
+                ]
+                psm_list_run_df = psm_list_run_df[
+                    psm_list_run_df["charge"] < 5
+                ]  # predictions do not go higher for ionmob
+
+                psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
+                    lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
+                )
+                psm_list_run_df = psm_list_run_df[
+                    psm_list_run_df.apply(
+                        lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
                         axis=1,
                     )
-                    # calibrate CCS values
-                    shift_factor = self.calculate_ccs_shift(psm_list_run_df)
-                    psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
-                        lambda x: x["ccs_observed"] + shift_factor, axis=1
-                    )
-                    # predict CCS values
-                    tf_ds = to_tf_dataset_inference(
-                        psm_list_run_df["mz"],
-                        psm_list_run_df["charge"],
-                        psm_list_run_df["sequence-tokenized"],
-                        self.tokenizer,
-                    )
+                ]
+
+                psm_list_run_df["mz"] = psm_list_run_df.apply(
+                    lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
+                )  # use precursor m/z from PSMs?
+
+                psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
+                    lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
+                    axis=1,
+                )
+                # calibrate CCS values
+                shift_factor = self.calculate_ccs_shift(psm_list_run_df)
+                psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
+                    lambda x: x["ccs_observed"] + shift_factor, axis=1
+                )
+                # predict CCS values
+                tf_ds = to_tf_dataset_inference(
+                    psm_list_run_df["mz"],
+                    psm_list_run_df["charge"],
+                    psm_list_run_df["sequence-tokenized"],
+                    self.tokenizer,
+                )
 
-                    psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)
+                psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)
 
-                    # calculate CCS features
-                    ccs_features = self._calculate_features(psm_list_run_df)
+                # calculate CCS features
+                ccs_features = self._calculate_features(psm_list_run_df)
 
-                    # add CCS features to PSMs
-                    for psm in psm_list_run:
-                        try:
-                            psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
-                        except KeyError:
-                            psm["rescoring_features"].update({})
-                    current_run += 1
+                # add CCS features to PSMs
+                for psm in psm_list_run:
+                    try:
+                        psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
+                    except KeyError:
+                        psm["rescoring_features"].update({})
+                current_run += 1
 
     def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
         """Get CCS features for PSMs."""