Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple id input files #97

Merged
merged 28 commits into from
Oct 12, 2023
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
b1cee20
changes for multiple id input
ArthurDeclercq Sep 25, 2023
6ba96f2
Merge branch 'main' of https://github.com/compomics/ms2rescore into m…
ArthurDeclercq Sep 25, 2023
00dde87
change report
ArthurDeclercq Sep 26, 2023
f4a4692
add exe build to test workflow
ArthurDeclercq Sep 28, 2023
1c54e54
test fixes
ArthurDeclercq Sep 28, 2023
51c54eb
moved installer to different github workflow
ArthurDeclercq Sep 28, 2023
2c81485
fixes installer
ArthurDeclercq Sep 28, 2023
ed097a1
add name to github action
ArthurDeclercq Sep 28, 2023
2489656
changes innosetup
ArthurDeclercq Sep 28, 2023
8d6bd51
mokapot fix set rt in seconds
ArthurDeclercq Sep 29, 2023
7216076
parse values from spec file
ArthurDeclercq Sep 29, 2023
4b97e2e
changes for rt parsing from spectra
ArthurDeclercq Sep 29, 2023
9f2c6ae
change to parse rt & im from mzml/mgf
ArthurDeclercq Oct 2, 2023
b7e2f69
removed test setup code
ArthurDeclercq Oct 2, 2023
b8d41bc
added protein kwargs
ArthurDeclercq Oct 2, 2023
8069034
change name of ion mobility param mgf reader
ArthurDeclercq Oct 2, 2023
df35f18
typing fix for python 3.8
ArthurDeclercq Oct 2, 2023
4b6f35d
multiple file support gui
ArthurDeclercq Oct 3, 2023
fad0a73
add protein kwargs to mokapot tab
ArthurDeclercq Oct 3, 2023
32712cb
add n_epocs to gui
ArthurDeclercq Oct 3, 2023
c87d454
remove print statements (again)
ArthurDeclercq Oct 3, 2023
1751fd6
question ?
ArthurDeclercq Oct 3, 2023
a47fe1a
deeplc retrain fix
ArthurDeclercq Oct 9, 2023
5db71e2
parallel backend threading
ArthurDeclercq Oct 11, 2023
1249cdf
changes stdout rewriting to devnull
ArthurDeclercq Oct 11, 2023
28b4847
Move pyinstaller test to test.yml
RalfG Oct 11, 2023
e16a3b1
PR review
RalfG Oct 12, 2023
dee9349
Fix usage of config file in GUI (was not parsed at all); fix bugs fro…
RalfG Oct 12, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
pull_request:

jobs:
test:
test-python-package:
runs-on: ubuntu-latest
strategy:
matrix:
Expand Down Expand Up @@ -42,3 +42,30 @@ jobs:
- name: Test installation
run: |
ms2rescore --help

test-windows-installer:
# Only run on push to main (e.g., after PR merge)
if: ${{ github.ref == 'refs/heads/main' }}
runs-on: windows-latest
steps:
- uses: actions/checkout@v2

- uses: actions/setup-python@v2
with:
python-version: "3.11"

- name: Install package and dependencies
run: |
python -m pip install --upgrade pip
pip install . pyinstaller

- name: Install Inno Setup
uses: crazy-max/ghaction-chocolatey@v1
with:
args: install innosetup -y --allow-unofficial --force

- name: Run pyinstaller
run: pyinstaller ./ms2rescore.spec --clean --noconfirm

- name: Test built exe
run: dist/ms2rescore/ms2rescore.exe
2 changes: 2 additions & 0 deletions docs/source/config_schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
- **One of**
- *string*
- *null*
- *array*
- **Items** *(string)*
- **`psm_file_type`** *(string)*: PSM file type. By default inferred from file extension. Default: `"infer"`.
- **`psm_reader_kwargs`** *(object)*: Keyword arguments passed to the PSM reader. Default: `{}`.
- **`spectrum_path`**: Path to spectrum file or directory with spectrum files.
Expand Down
1 change: 1 addition & 0 deletions ms2rescore/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def _argument_parser() -> argparse.ArgumentParser:
metavar="FILE",
action="store",
type=str,
nargs="*",
dest="psm_file",
help="path to PSM file (PIN, mzIdentML, MaxQuant msms, X!Tandem XML...)",
)
Expand Down
21 changes: 14 additions & 7 deletions ms2rescore/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,18 @@ def _validate_filenames(config: Dict) -> Dict:
if not config["ms2rescore"]["psm_file"]:
raise MS2RescoreConfigurationError("PSM file should be provided.")

# psm_file should exist
id_file = Path(config["ms2rescore"]["psm_file"])
if not id_file.is_file():
raise FileNotFoundError(id_file)
config["ms2rescore"]["psm_file"] = id_file.as_posix()
# If psm_file is a single string, wrap it in a list; otherwise leave it as is
if isinstance(config["ms2rescore"]["psm_file"], str):
config["ms2rescore"]["psm_file"] = [config["ms2rescore"]["psm_file"]]

# all provided psm_file(s) should exist
psm_files = []
for psm_file in config["ms2rescore"]["psm_file"]:
id_file = Path(psm_file)
if not id_file.is_file():
raise FileNotFoundError(id_file)
psm_files.append(id_file.as_posix())
config["ms2rescore"]["psm_file"] = psm_files

# spectrum_path should either be None, or existing path to file or dir
if config["ms2rescore"]["spectrum_path"]:
Expand All @@ -59,10 +66,10 @@ def _validate_filenames(config: Dict) -> Dict:

# Parse output_path
config["ms2rescore"]["output_path"] = _parse_output_path(
config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"]
config["ms2rescore"]["output_path"], config["ms2rescore"]["psm_file"][0]
)

# Parse config_file as posix path
# Parse config_file as posix path  # TODO: Is this necessary?
if config["ms2rescore"]["config_file"]:
config["ms2rescore"]["config_file"] = Path(config["ms2rescore"]["config_file"]).as_posix()

Expand Down
24 changes: 24 additions & 0 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ms2rescore.parse_psms import parse_psms
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator
from ms2rescore.parse_spectra import get_missing_values

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -53,6 +54,23 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)

if ("deeplc" in config["feature_generators"] and None in psm_list["retention_time"]) or (
"ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
):
logger.warning(
"One or more PSMs are missing retention time and/or ion mobility values. These will be "
"parsed from the spectrum file."
)
get_missing_values(
config,
psm_list,
missing_rt_values=(
"deeplc" in config["feature_generators"] and None in psm_list["retention_time"]
),
missing_im_values=(
"ionmob" in config["feature_generators"] and None in psm_list["ion_mobility"]
),
)
# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally?
Expand Down Expand Up @@ -118,9 +136,15 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
elif "mokapot" in config["rescoring_engine"]:
if "fasta_file" not in config["rescoring_engine"]["mokapot"]:
config["rescoring_engine"]["mokapot"]["fasta_file"] = config["fasta_file"]
if "protein_kwargs" in config["rescoring_engine"]["mokapot"]:
protein_kwargs = config["rescoring_engine"]["mokapot"].pop("protein_kwargs")
else:
protein_kwargs = dict()

mokapot.rescore(
psm_list,
output_file_root=output_file_root,
protein_kwargs=protein_kwargs,
**config["rescoring_engine"]["mokapot"],
)
else:
Expand Down
121 changes: 51 additions & 70 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@

from ms2rescore.exceptions import MS2RescoreError
from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.parse_mgf import parse_mgf_title_rt
from ms2rescore.utils import infer_spectrum_path

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
Expand Down Expand Up @@ -146,78 +145,60 @@ def add_features(self, psm_list: PSMList) -> None:
logger.info(
f"Running DeepLC for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
# Prepare PSM file
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

if not all(psm_list["retention_time"]):
# Prepare spectrum filenames
spectrum_filename = infer_spectrum_path(self.spectrum_path, run)
retention_time_dict = parse_mgf_title_rt(
spectrum_filename
) # TODO Add mzML support
try:
psm_list_run["retention_time"] = [
retention_time_dict[psm_id]
for psm_id in psm_list_run["spectrum_id"]
]
except KeyError:
raise MS2RescoreError(
"Could not map all spectrum ids to retention times"
)

psm_list_calibration = self._get_calibration_psms(psm_list_run)

logger.debug("Calibrating DeepLC")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.user_model or self.selected_model,
**self.deeplc_kwargs,

psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

psm_list_calibration = self._get_calibration_psms(psm_list_run)

logger.debug("Calibrating DeepLC")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
verbose=self._verbose,
path_model=self.selected_model or self.user_model,
**self.deeplc_kwargs,
)
self.deeplc_predictor.calibrate_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.selected_model:
self.selected_model = list(self.deeplc_predictor.model.keys())
self.deeplc_kwargs["deeplc_retrain"] = False
logger.debug(
f"Selected DeepLC model {self.selected_model} based on "
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
)

predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
)
self.deeplc_predictor.calibrate_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_calibration)
)
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)

for i, psm in enumerate(psm_list_run):
psm["rescoring_features"].update(
{
"observed_retention_time": observations[i],
"predicted_retention_time": predictions[i],
"rt_diff": rt_diffs_run[i],
}
)
# Still calibrate for each run, but do not try out all model options.
# Just use model that was selected based on first run
if not self.user_model and not self.selected_model:
self.selected_model = list(self.deeplc_predictor.model.keys())
logger.debug(
f"Selected DeepLC model {self.selected_model} based on "
"calibration of first run. Using this model (after new "
"calibrations) for the remaining runs."
)

predictions = np.array(
self.deeplc_predictor.make_preds(
seq_df=self._psm_list_to_deeplc_peprec(psm_list_run)
)
peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge
if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
peptide_rt_diff_dict[peptide] = {
"observed_retention_time_best": observations[i],
"predicted_retention_time_best": predictions[i],
"rt_diff_best": rt_diffs_run[i],
}
for psm in psm_list_run:
psm["rescoring_features"].update(
peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
)
observations = psm_list_run["retention_time"]
rt_diffs_run = np.abs(predictions - observations)

for i, psm in enumerate(psm_list_run):
psm["rescoring_features"].update(
{
"observed_retention_time": observations[i],
"predicted_retention_time": predictions[i],
"rt_diff": rt_diffs_run[i],
}
)
peptide = psm.peptidoform.proforma.split("\\")[0] # remove charge
if peptide_rt_diff_dict[peptide]["rt_diff_best"] > rt_diffs_run[i]:
peptide_rt_diff_dict[peptide] = {
"observed_retention_time_best": observations[i],
"predicted_retention_time_best": predictions[i],
"rt_diff_best": rt_diffs_run[i],
}
for psm in psm_list_run:
psm["rescoring_features"].update(
peptide_rt_diff_dict[psm.peptidoform.proforma.split("\\")[0]]
)
current_run += 1
current_run += 1

# TODO: Remove when DeepLC supports PSMList directly
@staticmethod
Expand Down
103 changes: 50 additions & 53 deletions ms2rescore/feature_generators/ionmob.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,64 +129,61 @@ def add_features(self, psm_list: PSMList) -> None:
logger.info(
f"Running Ionmob for PSMs from run ({current_run}/{total_runs}): `{run}`..."
)
with contextlib.redirect_stdout(
open(os.devnull, "w")
) if not self._verbose else contextlib.nullcontext():
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
psm_list_run_df = psm_list_run.to_dataframe()

# prepare data frames for CCS prediction
psm_list_run_df["charge"] = [
peptidoform.precursor_charge
for peptidoform in psm_list_run_df["peptidoform"]
]
psm_list_run_df = psm_list_run_df[
psm_list_run_df["charge"] < 5
] # predictions do not go higher for ionmob

psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
)
psm_list_run_df = psm_list_run_df[
psm_list_run_df.apply(
lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
axis=1,
)
]

psm_list_run_df["mz"] = psm_list_run_df.apply(
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
) # use precursor m/z from PSMs?

psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),

psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
psm_list_run_df = psm_list_run.to_dataframe()

# prepare data frames for CCS prediction
psm_list_run_df["charge"] = [
peptidoform.precursor_charge for peptidoform in psm_list_run_df["peptidoform"]
]
psm_list_run_df = psm_list_run_df[
psm_list_run_df["charge"] < 5
] # predictions do not go higher for ionmob

psm_list_run_df["sequence-tokenized"] = psm_list_run_df.apply(
lambda x: self.tokenize_peptidoform(x["peptidoform"]), axis=1
)
psm_list_run_df = psm_list_run_df[
psm_list_run_df.apply(
lambda x: self._is_valid_tokenized_sequence(x["sequence-tokenized"]),
axis=1,
)
# calibrate CCS values
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: x["ccs_observed"] + shift_factor, axis=1
)
# predict CCS values
tf_ds = to_tf_dataset_inference(
psm_list_run_df["mz"],
psm_list_run_df["charge"],
psm_list_run_df["sequence-tokenized"],
self.tokenizer,
)
]

psm_list_run_df["mz"] = psm_list_run_df.apply(
lambda x: calculate_mz(x["sequence-tokenized"], x["charge"]), axis=1
) # use precursor m/z from PSMs?

psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: reduced_mobility_to_ccs(x["ion_mobility"], x["mz"], x["charge"]),
axis=1,
)
# calibrate CCS values
shift_factor = self.calculate_ccs_shift(psm_list_run_df)
psm_list_run_df["ccs_observed"] = psm_list_run_df.apply(
lambda x: x["ccs_observed"] + shift_factor, axis=1
)
# predict CCS values
tf_ds = to_tf_dataset_inference(
psm_list_run_df["mz"],
psm_list_run_df["charge"],
psm_list_run_df["sequence-tokenized"],
self.tokenizer,
)

psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)
psm_list_run_df["ccs_predicted"], _ = self.ionmob_model.predict(tf_ds)

# calculate CCS features
ccs_features = self._calculate_features(psm_list_run_df)
# calculate CCS features
ccs_features = self._calculate_features(psm_list_run_df)

# add CCS features to PSMs
for psm in psm_list_run:
try:
psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
except KeyError:
psm["rescoring_features"].update({})
current_run += 1
# add CCS features to PSMs
for psm in psm_list_run:
try:
psm["rescoring_features"].update(ccs_features[psm.spectrum_id])
except KeyError:
psm["rescoring_features"].update({})
current_run += 1

def _calculate_features(self, feature_df: pd.DataFrame) -> Dict[str, Dict[str, float]]:
"""Get CCS features for PSMs."""
Expand Down
Loading