Skip to content

Commit

Permalink
Merge pull request #30 from compomics/feature/beta5
Browse files Browse the repository at this point in the history
v2.0.0-beta.5
  • Loading branch information
RalfG authored Mar 4, 2021
2 parents c27db1d + 4bfd2d4 commit 006834a
Show file tree
Hide file tree
Showing 11 changed files with 131 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8]
steps:
- uses: actions/checkout@v2

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ To replicate the experiments described in this article, check out the
[![install pip](https://flat.badgen.net/badge/install%20with/pip/green)](https://pypi.org/project/ms2rescore/)

MS²ReScore requires:
- Python 3.6 or higher on Linux, macOS, or
- Python 3.7 or higher on Linux, macOS, or
[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl)
- If the option `run_percolator` is set to `True`,
[Percolator](https://github.com/percolator/percolator/) needs to be callable with the
Expand Down
2 changes: 1 addition & 1 deletion ms2rescore/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Single source of ms2rescore version number."""

__version__ = "2.0.0-beta.4"
__version__ = "2.0.0-beta.5"
32 changes: 21 additions & 11 deletions ms2rescore/id_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pandas as pd
from pyteomics import tandem

from ms2rescore._exceptions import MS2ReScoreError
from ms2rescore.maxquant import MSMSAccessor
from ms2rescore.parse_mgf import parse_mgf
from ms2rescore.peptide_record import PeptideRecord
Expand All @@ -20,6 +21,12 @@
logger = logging.getLogger(__name__)


class IDFileParserError(MS2ReScoreError):
"""Error parsing ID file."""

pass


def parse_mgf_title_rt(
path_to_mgf: Union[str, os.PathLike]
) -> Tuple[Dict[int, str], Dict[int, float]]:
Expand Down Expand Up @@ -123,7 +130,7 @@ def _validate_mgf_path(
path_to_mgf_file = passed_path

else:
raise ValueError(
raise IDFileParserError(
"Configured `mgf_path` must be None or a path to an existing file or "
"directory."
)
Expand Down Expand Up @@ -176,12 +183,12 @@ def peprec_from_pin(self) -> PeptideRecord:
titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file)
peprec.df["observed_retention_time"] = peprec.df["spec_id"].map(retention_times)
peprec.df["spec_id"] = peprec.df["spec_id"].map(titles)
assert (
~peprec.df["observed_retention_time"].isna().any()
), "Could not map all MGF retention times to spectrum indices."
assert (
~peprec.df["spec_id"].isna().any()
), "Could not map all MGF titles to spectrum indices."
if not ~peprec.df["observed_retention_time"].isna().any():
raise IDFileParserError(
"Could not map all MGF retention times to spectrum indices."
)
if not ~peprec.df["spec_id"].isna().any():
raise IDFileParserError("Could not map all MGF titles to spectrum indices.")

return peprec

Expand Down Expand Up @@ -297,10 +304,12 @@ def get_peprec(self) -> PeptideRecord:
on="tandem_id"
)
# Validate merge by comparing the hyperscore columns
assert (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all()
if not (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all():
raise IDFileParserError(
"Could not merge tandem xml and generated pin files."
)
peprec_df.drop(
columns=["tandem_id", "hyperscore_tandem"],
axis="columns",
inplace=True
)

Expand Down Expand Up @@ -365,8 +374,9 @@ def parse_mgf_files(self, peprec):
peprec.df,
self.passed_mgf_path,
outname=path_to_new_mgf,
filename_col='Raw file', spec_title_col='spec_id',
title_parsing_method='TRFP_MQ',
filename_col='Raw file',
spec_title_col='spec_id',
title_parsing_method='run.scan.scan',
)
self._path_to_new_mgf = path_to_new_mgf

Expand Down
27 changes: 23 additions & 4 deletions ms2rescore/maxquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,24 @@ def __init__(self, pandas_obj) -> None:
"""Pandas extension for MaxQuant msms.txt files."""
self._obj = pandas_obj
self._set_mass_error_unit()
self.invalid_amino_acids = r"[BJOUXZ]"

@classmethod
def _evaluate_columns(cls, column: str) -> bool:
"""Case insensitive column evaluation for Pandas.read_csv usecols argument."""
return column.lower() in [col.lower() for col in cls.default_columns]

@classmethod
def _fix_column_case(cls, columns: List[str]) -> Dict[str, str]:
"""
Create mapping for column names with the correct case.
Using `_evaluate_columns`, we can load required columns in a case-insensitive
manner. As a result, the column name case must be fixed for downstream usage.
"""
case_mapping = {col.lower(): col for col in cls.default_columns}
rename_mapping = {col: case_mapping[col.lower()] for col in columns}
return rename_mapping

@classmethod
def from_file(
Expand All @@ -66,16 +84,17 @@ def from_file(
filter_rank1_psms : bool, optional
filter for rank 1 PSMs
validate_amino_acids : bool, optional
remove PSMs where the sequence includes an invalid amino acid
(B, J, O, U, X, Z); required for MS2PIP compatibility
remove PSMs where the sequence includes an invalid amino acid; required for
MS2PIP compatibility
Returns
-------
msms : ms2rescore.maxquant.MSMS
MSMS object (pandas.DataFrame with additional methods)
"""

msms_df = pd.read_csv(path_to_msms, sep="\t", usecols=cls.default_columns)
msms_df = pd.read_csv(path_to_msms, sep="\t", usecols=cls._evaluate_columns)
msms_df.rename(columns=cls._fix_column_case(msms_df.columns), inplace=True)
if filter_rank1_psms:
msms_df = msms_df.msms.filter_rank1_psms()
if validate_amino_acids:
Expand Down Expand Up @@ -114,7 +133,7 @@ def filter_rank1_psms(self) -> pd.DataFrame:
def remove_invalid_amino_acids(self) -> pd.DataFrame:
"""Remove invalid amino acids from MSMS."""
invalid_indices = self._obj[self._obj["Sequence"].str.contains(
r"[BJOUXZ]", regex=True
self.invalid_amino_acids, regex=True
)].index
self._obj = self._obj.drop(index=invalid_indices).reset_index(drop=True)

Expand Down
41 changes: 27 additions & 14 deletions ms2rescore/parse_mgf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,21 @@
import logging
import mmap
import os.path
import re

from tqdm import tqdm

from ms2rescore._exceptions import MS2ReScoreError

logger = logging.getLogger(__name__)


class ParseMGFError(MS2ReScoreError):
"""Error parsing MGF file."""

pass


def get_num_lines(file_path):
fp = open(file_path, "r+")
buf = mmap.mmap(fp.fileno(), 0)
Expand All @@ -18,7 +27,7 @@ def get_num_lines(file_path):
return lines


def title_parser(line, method='full'):
def title_parser(line, method='full', run=None):
"""
Take an MGF TITLE line and return the spectrum title.
Expand All @@ -32,8 +41,7 @@ def title_parser(line, method='full'):
- 'first_space': take everything between 'TITLE=' and first space.
- 'first_space_no_charge': take everything between 'TITLE=' and first space,
but leave out everything after last dot. (required for MaxQuant pipeline).
- 'TRFP_MQ': For MGF parsed with ThermoRawFileParser and spec_ids from
MaxQuant msms.txt.
- 'run.scan.scan': Extract scan number and merge with run name (for MaxQuant IDs).
"""

if method == 'full':
Expand All @@ -42,11 +50,17 @@ def title_parser(line, method='full'):
title = line[6:].split(' ')[0].strip()
elif method == 'first_space_no_charge':
title = '.'.join(line[6:].split(' ')[0].split('.')[:-1]).strip()
elif method == 'TRFP_MQ':
line = line.strip().split('mzspec=')[1].split(' ')
filename = line[0].replace('.raw:', '')
scan = line[3].replace('scan=', '')
title = '.'.join([filename, scan, scan])
elif method == 'run.scan.scan':
if not run:
raise TypeError("If `method` is `run.scan.scan`, `run` cannot be None.")
scan_m = re.match(r"TITLE=.*scan=([0-9]+).*$", line)
if scan_m:
scan = scan_m.group(1)
else:
raise ParseMGFError(
f"Could not extract scan number from TITLE field: `{line.strip()}`"
)
title = '.'.join([run, scan, scan])
else:
raise ValueError("method '{}' is not a valid title parsing method".format(
method
Expand All @@ -65,7 +79,7 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf',
if df_in[spec_title_col].duplicated().any():
logger.warning("Duplicate spec_id's found in PeptideRecord.")

if df_in[filename_col].iloc[0][-4:] in ['.mgf', '.MGF']:
if df_in[filename_col].iloc[0][-4:].lower() == '.mgf':
file_suffix = ''
else:
file_suffix = '.mgf'
Expand All @@ -76,19 +90,19 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf',
with open(outname, 'w') as out:
count = 0
for run in runs:
found = False
current_mgf_file = os.path.join(mgf_folder, run + file_suffix)
spec_set = set(df_in[(df_in[filename_col] == run)][spec_title_col].values)

# Temporary fix: replace charges in MGF with ID'ed charges
# Until MS2PIP uses ID'ed charge instead of MGF charge
id_charges = df_in[(df_in[filename_col] == run)].set_index('spec_id')['charge'].to_dict()

found = False
with open(current_mgf_file, 'r') as f:
iterator = tqdm(f, total=get_num_lines(current_mgf_file)) if show_progress_bar else f
for line in iterator:
if 'TITLE=' in line:
title = title_parser(line, method=title_parsing_method)
title = title_parser(line, method=title_parsing_method, run=run)
if title in spec_set:
found = True
line = "TITLE=" + title + "\n"
Expand All @@ -115,6 +129,5 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf',
logger.debug(
"%i/%i spectra found and written to new MGF file.", count, num_expected
)
assert (
count == num_expected
), "Not all PSMs could be found in the provided MGF files"
if not count == num_expected:
raise ParseMGFError("Not all PSMs could be found in the provided MGF files.")
17 changes: 16 additions & 1 deletion ms2rescore/peptideshaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,33 @@ def __init__(self, pandas_obj: pd.DataFrame) -> None:
def _validate(self):
"""Validate Pandas DataFrame as Extended PSM Report."""
# TODO: Implement validation of PSM report DataFrame
pass
self.drop_invalid_amino_acids()

def drop_invalid_amino_acids(self, invalid_amino_acids=r"[BJOUXZ]"):
"""Drop all PSMs (rows) with peptides containing invalid amino acids."""
to_drop = self._obj[
self._obj['Sequence'].str.contains(invalid_amino_acids, regex=True)
].index
if len(to_drop) > 0:
logger.warning(
"Dropping %i PSMs from report due to invalid amino acids (%s)",
len(to_drop),
invalid_amino_acids
)
self._obj = self._obj.drop(index=to_drop)

@staticmethod
def from_tsv(path: Union[str, os.PathLike]) -> pd.DataFrame:
"""Read Extended PSM Report from TSV file."""
ext_psm_report = pd.read_csv(path, sep="\t", index_col=0)
ext_psm_report.ext_psm_report._validate()
return ext_psm_report

@staticmethod
def from_xls(path: Union[str, os.PathLike]) -> pd.DataFrame:
"""Read Extended PSM Report from XLS file."""
ext_psm_report = pd.read_excel(path, sheet_name=0, index_col=0)
pd.ext_psm_report._validate(ext_psm_report)
return ext_psm_report

@staticmethod
Expand Down
Loading

0 comments on commit 006834a

Please sign in to comment.