diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 37b4e984..942277be 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index 0dbe7b92..3fb9452f 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ To replicate the experiments described in this article, check out the [![install pip](https://flat.badgen.net/badge/install%20with/pip/green)](https://pypi.org/project/ms2rescore/) MSĀ²ReScore requires: -- Python 3.6 or higher on Linux, macOS, or +- Python 3.7 or higher on Linux, macOS, or [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl) - If the option `run_percolator` is set to `True`, [Percolator](https://github.com/percolator/percolator/) needs to be callable with the diff --git a/ms2rescore/_version.py b/ms2rescore/_version.py index debd8a3f..a27b5998 100644 --- a/ms2rescore/_version.py +++ b/ms2rescore/_version.py @@ -1,3 +1,3 @@ """Single source of ms2rescore version number.""" -__version__ = "2.0.0-beta.4" +__version__ = "2.0.0-beta.5" diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py index 45fe92c2..5cc7d6ca 100644 --- a/ms2rescore/id_file_parser.py +++ b/ms2rescore/id_file_parser.py @@ -10,6 +10,7 @@ import pandas as pd from pyteomics import tandem +from ms2rescore._exceptions import MS2ReScoreError from ms2rescore.maxquant import MSMSAccessor from ms2rescore.parse_mgf import parse_mgf from ms2rescore.peptide_record import PeptideRecord @@ -20,6 +21,12 @@ logger = logging.getLogger(__name__) +class IDFileParserError(MS2ReScoreError): + """Error parsing ID file.""" + + pass + + def parse_mgf_title_rt( path_to_mgf: Union[str, os.PathLike] ) -> Tuple[Dict[int, str], Dict[int, float]]: @@ -123,7 +130,7 @@ def _validate_mgf_path( path_to_mgf_file = passed_path else: - raise ValueError( + raise IDFileParserError( "Configured `mgf_path` must be None or a path to an existing file or " "directory." ) @@ -176,12 +183,12 @@ def peprec_from_pin(self) -> PeptideRecord: titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file) peprec.df["observed_retention_time"] = peprec.df["spec_id"].map(retention_times) peprec.df["spec_id"] = peprec.df["spec_id"].map(titles) - assert ( - ~peprec.df["observed_retention_time"].isna().any() - ), "Could not map all MGF retention times to spectrum indices." - assert ( - ~peprec.df["spec_id"].isna().any() - ), "Could not map all MGF titles to spectrum indices." + if not ~peprec.df["observed_retention_time"].isna().any(): + raise IDFileParserError( + "Could not map all MGF retention times to spectrum indices." + ) + if not ~peprec.df["spec_id"].isna().any(): + raise IDFileParserError("Could not map all MGF titles to spectrum indices.") return peprec @@ -297,10 +304,12 @@ def get_peprec(self) -> PeptideRecord: on="tandem_id" ) # Validate merge by comparing the hyperscore columns - assert (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all() + if not (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all(): + raise IDFileParserError( + "Could not merge tandem xml and generated pin files." + ) peprec_df.drop( columns=["tandem_id", "hyperscore_tandem"], - axis="columns", inplace=True ) @@ -365,8 +374,9 @@ def parse_mgf_files(self, peprec): peprec.df, self.passed_mgf_path, outname=path_to_new_mgf, - filename_col='Raw file', spec_title_col='spec_id', - title_parsing_method='TRFP_MQ', + filename_col='Raw file', + spec_title_col='spec_id', + title_parsing_method='run.scan.scan', ) self._path_to_new_mgf = path_to_new_mgf diff --git a/ms2rescore/maxquant.py b/ms2rescore/maxquant.py index 0e2646e3..c77761d4 100644 --- a/ms2rescore/maxquant.py +++ b/ms2rescore/maxquant.py @@ -48,6 +48,24 @@ def __init__(self, pandas_obj) -> None: """Pandas extension for MaxQuant msms.txt files.""" self._obj = pandas_obj self._set_mass_error_unit() + self.invalid_amino_acids = r"[BJOUXZ]" + + @classmethod + def _evaluate_columns(cls, column: str) -> bool: + """Case insensitive column evaluation for Pandas.read_csv usecols argument.""" + return column.lower() in [col.lower() for col in cls.default_columns] + + @classmethod + def _fix_column_case(cls, columns: List[str]) -> Dict[str, str]: + """ + Create mapping for column names with the correct case. + + Using `_evaluate_columns`, we can load required columns in a case-insensitive + manner. As a result, the column name case must be fixed for downstream usage. + """ + case_mapping = {col.lower(): col for col in cls.default_columns} + rename_mapping = {col: case_mapping[col.lower()] for col in columns} + return rename_mapping @classmethod def from_file( @@ -66,8 +84,8 @@ def from_file( filter_rank1_psms : bool, optional filter for rank 1 PSMs validate_amino_acids : bool, optional - remove PSMs where the sequence includes an invalid amino acid - (B, J, O, U, X, Z); required for MS2PIP compatibility + remove PSMs where the sequence includes an invalid amino acid; required for + MS2PIP compatibility Returns ------- @@ -75,7 +93,8 @@ def from_file( MSMS object (pandas.DataFrame with additional methods) """ - msms_df = pd.read_csv(path_to_msms, sep="\t", usecols=cls.default_columns) + msms_df = pd.read_csv(path_to_msms, sep="\t", usecols=cls._evaluate_columns) + msms_df.rename(columns=cls._fix_column_case(msms_df.columns), inplace=True) if filter_rank1_psms: msms_df = msms_df.msms.filter_rank1_psms() if validate_amino_acids: @@ -114,7 +133,7 @@ def filter_rank1_psms(self) -> pd.DataFrame: def remove_invalid_amino_acids(self) -> pd.DataFrame: """Remove invalid amino acids from MSMS.""" invalid_indices = self._obj[self._obj["Sequence"].str.contains( - r"[BJOUXZ]", regex=True + self.invalid_amino_acids, regex=True )].index self._obj = self._obj.drop(index=invalid_indices).reset_index(drop=True) diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py index b5bfa0f1..57110831 100644 --- a/ms2rescore/parse_mgf.py +++ b/ms2rescore/parse_mgf.py @@ -3,12 +3,21 @@ import logging import mmap import os.path +import re from tqdm import tqdm +from ms2rescore._exceptions import MS2ReScoreError + logger = logging.getLogger(__name__) +class ParseMGFError(MS2ReScoreError): + """Error parsing MGF file.""" + + pass + + def get_num_lines(file_path): fp = open(file_path, "r+") buf = mmap.mmap(fp.fileno(), 0) @@ -18,7 +27,7 @@ def get_num_lines(file_path): return lines -def title_parser(line, method='full'): +def title_parser(line, method='full', run=None): """ Take an MGF TITLE line and return the spectrum title. @@ -32,8 +41,7 @@ def title_parser(line, method='full'): - 'first_space': take everything between 'TITLE=' and first space. - 'first_space_no_charge': take everything between 'TITLE=' and first space, but leave out everything after last dot. (required for MaxQuant pipeline). - - 'TRFP_MQ': For MGF parsed with ThermoRawFileParser and spec_ids from - MaxQuant msms.txt. + - 'run.scan.scan': Extract scan number and merge with run name (for MaxQuant IDs). """ if method == 'full': @@ -42,11 +50,17 @@ def title_parser(line, method='full'): title = line[6:].split(' ')[0].strip() elif method == 'first_space_no_charge': title = '.'.join(line[6:].split(' ')[0].split('.')[:-1]).strip() - elif method == 'TRFP_MQ': - line = line.strip().split('mzspec=')[1].split(' ') - filename = line[0].replace('.raw:', '') - scan = line[3].replace('scan=', '') - title = '.'.join([filename, scan, scan]) + elif method == 'run.scan.scan': + if not run: + raise TypeError("If `method` is `run.scan.scan`, `run` cannot be None.") + scan_m = re.match(r"TITLE=.*scan=([0-9]+).*$", line) + if scan_m: + scan = scan_m.group(1) + else: + raise ParseMGFError( + f"Could not extract scan number from TITLE field: `{line.strip()}`" + ) + title = '.'.join([run, scan, scan]) else: raise ValueError("method '{}' is not a valid title parsing method".format( method @@ -65,7 +79,7 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', if df_in[spec_title_col].duplicated().any(): logger.warning("Duplicate spec_id's found in PeptideRecord.") - if df_in[filename_col].iloc[0][-4:] in ['.mgf', '.MGF']: + if df_in[filename_col].iloc[0][-4:].lower() == '.mgf': file_suffix = '' else: file_suffix = '.mgf' @@ -76,7 +90,6 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', with open(outname, 'w') as out: count = 0 for run in runs: - found = False current_mgf_file = os.path.join(mgf_folder, run + file_suffix) spec_set = set(df_in[(df_in[filename_col] == run)][spec_title_col].values) @@ -84,11 +97,12 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', # Until MS2PIP uses ID'ed charge instead of MGF charge id_charges = df_in[(df_in[filename_col] == run)].set_index('spec_id')['charge'].to_dict() + found = False with open(current_mgf_file, 'r') as f: iterator = tqdm(f, total=get_num_lines(current_mgf_file)) if show_progress_bar else f for line in iterator: if 'TITLE=' in line: - title = title_parser(line, method=title_parsing_method) + title = title_parser(line, method=title_parsing_method, run=run) if title in spec_set: found = True line = "TITLE=" + title + "\n" @@ -115,6 +129,5 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf', logger.debug( "%i/%i spectra found and written to new MGF file.", count, num_expected ) - assert ( - count == num_expected - ), "Not all PSMs could be found in the provided MGF files" + if not count == num_expected: + raise ParseMGFError("Not all PSMs could be found in the provided MGF files.") diff --git a/ms2rescore/peptideshaker.py b/ms2rescore/peptideshaker.py index 5e81ab1f..1ad83fa8 100644 --- a/ms2rescore/peptideshaker.py +++ b/ms2rescore/peptideshaker.py @@ -34,18 +34,33 @@ def __init__(self, pandas_obj: pd.DataFrame) -> None: def _validate(self): """Validate Pandas DataFrame as Extended PSM Report.""" # TODO: Implement validation of PSM report DataFrame - pass + self.drop_invalid_amino_acids() + + def drop_invalid_amino_acids(self, invalid_amino_acids=r"[BJOUXZ]"): + """Drop all PSMs (rows) with peptides containing invalid amino acids.""" + to_drop = self._obj[ + self._obj['Sequence'].str.contains(invalid_amino_acids, regex=True) + ].index + if len(to_drop) > 0: + logger.warning( + "Dropping %i PSMs from report due to invalid amino acids (%s)", + len(to_drop), + invalid_amino_acids + ) + self._obj = self._obj.drop(index=to_drop) @staticmethod def from_tsv(path: Union[str, os.PathLike]) -> pd.DataFrame: """Read Extended PSM Report from TSV file.""" ext_psm_report = pd.read_csv(path, sep="\t", index_col=0) + ext_psm_report.ext_psm_report._validate() return ext_psm_report @staticmethod def from_xls(path: Union[str, os.PathLike]) -> pd.DataFrame: """Read Extended PSM Report from XLS file.""" ext_psm_report = pd.read_excel(path, sheet_name=0, index_col=0) + pd.ext_psm_report._validate(ext_psm_report) return ext_psm_report @staticmethod diff --git a/ms2rescore/percolator.py b/ms2rescore/percolator.py index 6c6231fe..f9341cdb 100644 --- a/ms2rescore/percolator.py +++ b/ms2rescore/percolator.py @@ -11,19 +11,24 @@ import pandas as pd +from ms2rescore._exceptions import MS2ReScoreError from ms2rescore.peptide_record import PeptideRecord logger = logging.getLogger(__name__) -class UnknownModificationLabelStyleError(Exception): +class PercolatorInError(MS2ReScoreError): + """Error while processing Percolator In file.""" + + +class UnknownModificationLabelStyleError(PercolatorInError): """Could not infer modification label style.""" pass -class UnknownModificationError(Exception): +class UnknownModificationError(PercolatorInError): """Modification not found in `modification_mapping`.""" pass @@ -38,6 +43,7 @@ def __init__( modification_mapping: Optional[ Dict[Tuple[Union[str, None], Union[float, str]], str] ] = None, + invalid_amino_acids: Optional[str] = r"[BJOUXZ]" ): """ Percolator In (PIN). @@ -54,10 +60,14 @@ def __init__( (e.g. `{("Q", -17.02655): "Gln->pyro-Glu"}). If the keys are floats, they are rounded up to three decimals to avoid rounding issues while matching. If None, the original modification labels from the PIN file will be used. + invalid_amino_acids: str, optional + regex pattern of invalid amino acids. PSMs containing these amino acids will + be dropped. (default: `r"[BJOUXZ]"`) """ # Attributes self.modification_pattern = r"\[([^\[^\]]*)\]" + self.invalid_amino_acids = invalid_amino_acids # Parameters self.path = path @@ -241,9 +251,8 @@ def _get_sequence_column(self) -> pd.Series: def _get_charge_column(self) -> pd.Series: """Get charge column from one-hot encoded `ChargeX` columns.""" charge_cols = [col for col in self.df.columns if col.startswith("Charge")] - assert (self.df[charge_cols] == 1).any(axis=1).all(), ( - "Not all PSMs have" " an assigned charge state." - ) + if not (self.df[charge_cols] == 1).any(axis=1).all(): + raise PercolatorInError("Not all PSMs have an assigned charge state.") return ( self.df[charge_cols] .rename( @@ -257,10 +266,10 @@ def _get_spectrum_index_column(self, pattern: Optional[str] = None) -> pd.Series if not pattern: pattern = r".+_([0-9]+)_[0-9]+_[0-9]+" id_col = self.df["SpecId"].str.extract(pattern, expand=False).astype(int) - assert ( - ~id_col.duplicated().any() - ), "Issue in matching spectrum IDs, duplicates found." - assert ~id_col.isna().any(), "Issue in matching spectrum IDs, NaN found." + if id_col.duplicated().any(): + raise PercolatorInError("Issue in matching spectrum IDs, duplicates found.") + if id_col.isna().any(): + raise PercolatorInError("Issue in matching spectrum IDs, NaN found.") return id_col def add_peprec_modifications_column(self): @@ -301,6 +310,23 @@ def get_spectrum_filename( else: raise ValueError("Multiple spectrum filenames found in single PIN file.") + def drop_invalid_amino_acids(self): + """Drop all PSMs (rows) with peptides containing invalid amino acids.""" + if "sequence" in self.df.columns: + sequences = self.df['sequence'] + else: + sequences = self._get_sequence_column() + to_drop = sequences[ + sequences.str.contains(self.invalid_amino_acids, regex=True) + ].index + if len(to_drop) > 0: + logger.warning( + "Dropping %i PSMs from PIN due to invalid amino acids (%s)", + len(to_drop), + self.invalid_amino_acids + ) + self.df = self.df.drop(index=to_drop) + @staticmethod def fix_tabs( path: str, id_column: str = "SpecId", prot_sep: Optional[str] = "|||" @@ -376,6 +402,8 @@ def read(self, path: Optional[str] = None): if not self.path: raise ValueError("No path for PIN file defined.") self.df = pd.read_csv(self.fix_tabs(self.path), sep="\t") + if self.invalid_amino_acids: + self.drop_invalid_amino_acids() def write(self, path: Optional[str] = None): """Write PIN to file.""" diff --git a/ms2rescore/rescore_core.py b/ms2rescore/rescore_core.py index b1d4f293..07c64ac4 100644 --- a/ms2rescore/rescore_core.py +++ b/ms2rescore/rescore_core.py @@ -15,6 +15,7 @@ from sklearn.metrics import mean_squared_error as mse from tqdm import tqdm +from ms2rescore._exceptions import MS2ReScoreError logger = logging.getLogger(__name__) @@ -549,9 +550,8 @@ def write_pin_files( ) # Else, just use concat else: - assert len(complete_df) == len(feature_set), ( - "Feature sets do not match." - ) + if not len(complete_df) == len(feature_set): + raise MS2ReScoreError("Feature sets do not match.") complete_df = pd.concat( [ complete_df.reset_index(drop=True), diff --git a/ms2rescore/runner.py b/ms2rescore/runner.py deleted file mode 100644 index e69de29b..00000000 diff --git a/setup.py b/setup.py index 0159d0fe..f294a3d7 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ def get_readme(): packages=["ms2rescore"], include_package_data=True, entry_points={"console_scripts": ["ms2rescore=ms2rescore.__main__:main"]}, - python_requires=">=3.6", + python_requires=">=3.7", install_requires=[ "importlib-resources;python_version<'3.7'", "numpy>=1.16.0,<2",