diff --git a/ms2rescore/feature_generators/deeplc.py b/ms2rescore/feature_generators/deeplc.py index 19ea6700..cb6f6a40 100644 --- a/ms2rescore/feature_generators/deeplc.py +++ b/ms2rescore/feature_generators/deeplc.py @@ -145,63 +145,60 @@ def add_features(self, psm_list: PSMList) -> None: logger.info( f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..." ) - # Prepare PSM file - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - - psm_list_calibration = self._get_calibration_psms(psm_list_run) - - logger.debug("Calibrating DeepLC") - self.deeplc_predictor = self.DeepLC( - n_jobs=self.processes, - verbose=self._verbose, - path_model=self.selected_model or self.user_model, - **self.deeplc_kwargs, + + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + + psm_list_calibration = self._get_calibration_psms(psm_list_run) + + logger.debug("Calibrating DeepLC") + self.deeplc_predictor = self.DeepLC( + n_jobs=self.processes, + verbose=self._verbose, + path_model=self.selected_model or self.user_model, + **self.deeplc_kwargs, + ) + self.deeplc_predictor.calibrate_preds( + seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration) + ) + # Still calibrate for each run, but do not try out all model options. + # Just use model that was selected based on first run + if not self.selected_model: + self.selected_model = list(self.deeplc_predictor.model.keys()) + self.deeplc_kwargs["deeplc_retrain"] = False + logger.debug( + f"Selected DeepLC model {self.selected_model} based on " + "calibration of first run. Using this model (after new " + "calibrations) for the remaining runs." 
+ ) + + predictions = np.array( + self.deeplc_predictor.make_preds( + seq_df=self._psm_list_to_deeplc_peprec(psm_list_run) ) - self.deeplc_predictor.calibrate_preds( - seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration) + ) + observations = psm_list_run["retention_time"] + rt_diffs_run = np.abs(predictions - observations) + + for i, psm in enumerate(psm_list_run): + psm["rescoring_features"].update( + { + "observed_retention_time": observations[i], + "predicted_retention_time": predictions[i], + "rt_diff": rt_diffs_run[i], + } ) - # Still calibrate for each run, but do not try out all model options. - # Just use model that was selected based on first run - if not self.selected_model: - self.selected_model = list(self.deeplc_predictor.model.keys()) - self.deeplc_kwargs["deeplc_retrain"] = False - logger.debug( - f"Selected DeepLC model {self.selected_model} based on " - "calibration of first run. Using this model (after new " - "calibrations) for the remaining runs." - ) - - predictions = np.array( - self.deeplc_predictor.make_preds( - seq_df=self._psm_list_to_deeplc_peprec(psm_list_run) - ) + peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge + if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]: + peptide_rt_diff_dict[peptide] = { + "observed_retention_time_best": observations[i], + "predicted_retention_time_best": predictions[i], + "rt_diff_best": rt_diffs_run[i], + } + for psm in psm_list_run: + psm["rescoring_features"].update( + peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] ) - observations = psm_list_run["retention_time"] - rt_diffs_run = np.abs(predictions - observations) - - for i, psm in enumerate(psm_list_run): - psm["rescoring_features"].update( - { - "observed_retention_time": observations[i], - "predicted_retention_time": predictions[i], - "rt_diff": rt_diffs_run[i], - } - ) - peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge - if peptide_rt_diff_dict[peptide]["rt_diff_best"] > 
rt_diffs_run[i]: - peptide_rt_diff_dict[peptide] = { - "observed_retention_time_best": observations[i], - "predicted_retention_time_best": predictions[i], - "rt_diff_best": rt_diffs_run[i], - } - for psm in psm_list_run: - psm["rescoring_features"].update( - peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]] - ) - current_run += 1 + current_run += 1 # TODO: Remove when DeepLC supports PSMList directly @staticmethod diff --git a/ms2rescore/feature_generators/ionmob.py b/ms2rescore/feature_generators/ionmob.py index 12f75f82..e55e6cc6 100644 --- a/ms2rescore/feature_generators/ionmob.py +++ b/ms2rescore/feature_generators/ionmob.py @@ -129,64 +129,61 @@ def add_features(self, psm_list: PSMList) -> None: logger.info( f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..." ) - with contextlib.redirect_stdout( - open(os.devnull, "w") - ) if not self._verbose else contextlib.nullcontext(): - psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) - psm_list_run_df = psm_list_run.to_dataframe() - - # prepare data frames for CCS prediction - psm_list_run_df["charge"] = [ - peptidoform.precursor_charge - for peptidoform in psm_list_run_df["peptidoform"] - ] - psm_list_run_df = psm_list_run_df[ - psm_list_run_df["charge"] < 5 - ] # predictions do not go higher for ionmob - - psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply( - lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1 - ) - psm_list_run_df = psm_list_run_df[ - psm_list_run_df.apply( - lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]), - axis=1, - ) - ] - - psm_list_run_df["mz"] = psm_list_run_df.apply( - lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1 - ) # use precursor m/z from PSMs? 
- - psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( - lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]), + + psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values()))) + psm_list_run_df = psm_list_run.to_dataframe() + + # prepare data frames for CCS prediction + psm_list_run_df["charge"] = [ + peptidoform.precursor_charge for peptidoform in psm_list_run_df["peptidoform"] + ] + psm_list_run_df = psm_list_run_df[ + psm_list_run_df["charge"] < 5 + ] # predictions do not go higher for ionmob + + psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply( + lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1 + ) + psm_list_run_df = psm_list_run_df[ + psm_list_run_df.apply( + lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]), axis=1, ) - # calibrate CCS values - shift_factor = self.calculate_ccs_shift(psm_list_run_df) - psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( - lambda x: x["ccs_observed"] + shift_factor, axis=1 - ) - # predict CCS values - tf_ds = to_tf_dataset_inference( - psm_list_run_df["mz"], - psm_list_run_df["charge"], - psm_list_run_df["sequence-tokenized"], - self.tokenizer, - ) + ] + + psm_list_run_df["mz"] = psm_list_run_df.apply( + lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1 + ) # use precursor m/z from PSMs? 
+ + psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( + lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]), + axis=1, + ) + # calibrate CCS values + shift_factor = self.calculate_ccs_shift(psm_list_run_df) + psm_list_run_df["ccs_observed"] = psm_list_run_df.apply( + lambda x: x["ccs_observed"] + shift_factor, axis=1 + ) + # predict CCS values + tf_ds = to_tf_dataset_inference( + psm_list_run_df["mz"], + psm_list_run_df["charge"], + psm_list_run_df["sequence-tokenized"], + self.tokenizer, + ) - psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds) + psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds) - # calculate CCS features - ccs_features = self._calculate_features(psm_list_run_df) + # calculate CCS features + ccs_features = self._calculate_features(psm_list_run_df) - # add CCS features to PSMs - for psm in psm_list_run: - try: - psm["rescoring_features"].update(ccs_features[psm.spectrum_id]) - except KeyError: - psm["rescoring_features"].update({}) - current_run += 1 + # add CCS features to PSMs + for psm in psm_list_run: + try: + psm["rescoring_features"].update(ccs_features[psm.spectrum_id]) + except KeyError: + psm["rescoring_features"].update({}) + current_run += 1 def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]: """Get CCS features for PSMs.""" diff --git a/ms2rescore/gui/__main__.py b/ms2rescore/gui/__main__.py index b5b4a9eb..03ccd0e8 100644 --- a/ms2rescore/gui/__main__.py +++ b/ms2rescore/gui/__main__.py @@ -1,6 +1,8 @@ """Entrypoint for MS²Rescore GUI.""" import multiprocessing +import os +import contextlib from ms2rescore.gui.app import app @@ -8,7 +10,9 @@ def main(): """Entrypoint for MS²Rescore GUI.""" multiprocessing.freeze_support() - app() + # Redirect stdout to os.devnull while the GUI runs; the handle is closed when the block exits. + with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull): + app() if __name__ == "__main__":