Merge pull request #30 from compomics/feature/beta5

v2.0.0-beta.5
compomics · Mar 4, 2021 · 006834a · 006834a
2 parents c27db1d + 4bfd2d4
commit 006834a
Show file tree

Hide file tree

Showing 11 changed files with 131 additions and 46 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -8,7 +8,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest]
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.7, 3.8]
     steps:
     - uses: actions/checkout@v2
 

diff --git a/README.md b/README.md
@@ -70,7 +70,7 @@ To replicate the experiments described in this article, check out the
 [![install pip](https://flat.badgen.net/badge/install%20with/pip/green)](https://pypi.org/project/ms2rescore/)
 
 MS²ReScore requires:
-- Python 3.6 or higher on Linux, macOS, or
+- Python 3.7 or higher on Linux, macOS, or
 [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl)
 - If the option `run_percolator` is set to `True`,
 [Percolator](https://github.com/percolator/percolator/) needs to be callable with the

diff --git a/ms2rescore/_version.py b/ms2rescore/_version.py
@@ -1,3 +1,3 @@
 """Single source of ms2rescore version number."""
 
-__version__ = "2.0.0-beta.4"
+__version__ = "2.0.0-beta.5"
diff --git a/ms2rescore/id_file_parser.py b/ms2rescore/id_file_parser.py
@@ -10,6 +10,7 @@
 import pandas as pd
 from pyteomics import tandem
 
+from ms2rescore._exceptions import MS2ReScoreError
 from ms2rescore.maxquant import MSMSAccessor
 from ms2rescore.parse_mgf import parse_mgf
 from ms2rescore.peptide_record import PeptideRecord
@@ -20,6 +21,12 @@
 logger = logging.getLogger(__name__)
 
 
+class IDFileParserError(MS2ReScoreError):
+    """Error parsing ID file."""
+
+    pass
+
+
 def parse_mgf_title_rt(
     path_to_mgf: Union[str, os.PathLike]
 ) -> Tuple[Dict[int, str], Dict[int, float]]:
@@ -123,7 +130,7 @@ def _validate_mgf_path(
             path_to_mgf_file = passed_path
 
         else:
-            raise ValueError(
+            raise IDFileParserError(
                 "Configured `mgf_path` must be None or a path to an existing file or "
                 "directory."
             )
@@ -176,12 +183,12 @@ def peprec_from_pin(self) -> PeptideRecord:
         titles, retention_times = parse_mgf_title_rt(self.path_to_mgf_file)
         peprec.df["observed_retention_time"] = peprec.df["spec_id"].map(retention_times)
         peprec.df["spec_id"] = peprec.df["spec_id"].map(titles)
-        assert (
-            ~peprec.df["observed_retention_time"].isna().any()
-        ), "Could not map all MGF retention times to spectrum indices."
-        assert (
-            ~peprec.df["spec_id"].isna().any()
-        ), "Could not map all MGF titles to spectrum indices."
+        if peprec.df["observed_retention_time"].isna().any():
+            raise IDFileParserError(
+                "Could not map all MGF retention times to spectrum indices."
+            )
+        if peprec.df["spec_id"].isna().any():
+            raise IDFileParserError("Could not map all MGF titles to spectrum indices.")
 
         return peprec
 
@@ -297,10 +304,12 @@ def get_peprec(self) -> PeptideRecord:
             on="tandem_id"
         )
         # Validate merge by comparing the hyperscore columns
-        assert (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all()
+        if not (peprec_df["hyperscore_tandem"] == peprec_df["hyperscore"]).all():
+            raise IDFileParserError(
+                "Could not merge tandem xml and generated pin files."
+            )
         peprec_df.drop(
             columns=["tandem_id", "hyperscore_tandem"],
-            axis="columns",
             inplace=True
         )
 
@@ -365,8 +374,9 @@ def parse_mgf_files(self, peprec):
             peprec.df,
             self.passed_mgf_path,
             outname=path_to_new_mgf,
-            filename_col='Raw file', spec_title_col='spec_id',
-            title_parsing_method='TRFP_MQ',
+            filename_col='Raw file',
+            spec_title_col='spec_id',
+            title_parsing_method='run.scan.scan',
         )
         self._path_to_new_mgf = path_to_new_mgf
 

diff --git a/ms2rescore/maxquant.py b/ms2rescore/maxquant.py
@@ -48,6 +48,24 @@ def __init__(self, pandas_obj) -> None:
         """Pandas extension for MaxQuant msms.txt files."""
         self._obj = pandas_obj
         self._set_mass_error_unit()
+        self.invalid_amino_acids = r"[BJOUXZ]"
+
+    @classmethod
+    def _evaluate_columns(cls, column: str) -> bool:
+        """Case insensitive column evaluation for Pandas.read_csv usecols argument."""
+        return column.lower() in [col.lower() for col in cls.default_columns]
+
+    @classmethod
+    def _fix_column_case(cls, columns: List[str]) -> Dict[str, str]:
+        """
+        Create mapping for column names with the correct case.
+
+        Using `_evaluate_columns`, we can load required columns in a case-insensitive
+        manner. As a result, the column name case must be fixed for downstream usage.
+        """
+        case_mapping = {col.lower(): col for col in cls.default_columns}
+        rename_mapping = {col: case_mapping[col.lower()] for col in columns}
+        return rename_mapping
 
     @classmethod
     def from_file(
@@ -66,16 +84,17 @@ def from_file(
         filter_rank1_psms : bool, optional
             filter for rank 1 PSMs
         validate_amino_acids : bool, optional
-            remove PSMs where the sequence includes an invalid amino acid
-            (B, J, O, U, X, Z); required for MS2PIP compatibility
+            remove PSMs where the sequence includes an invalid amino acid; required for
+            MS2PIP compatibility
 
         Returns
         -------
         msms : ms2rescore.maxquant.MSMS
             MSMS object (pandas.DataFrame with additional methods)
         """
 
-        msms_df = pd.read_csv(path_to_msms, sep="\t", usecols=cls.default_columns)
+        msms_df = pd.read_csv(path_to_msms, sep="\t", usecols=cls._evaluate_columns)
+        msms_df.rename(columns=cls._fix_column_case(msms_df.columns), inplace=True)
         if filter_rank1_psms:
             msms_df = msms_df.msms.filter_rank1_psms()
         if validate_amino_acids:
@@ -114,7 +133,7 @@ def filter_rank1_psms(self) -> pd.DataFrame:
     def remove_invalid_amino_acids(self) -> pd.DataFrame:
         """Remove invalid amino acids from MSMS."""
         invalid_indices = self._obj[self._obj["Sequence"].str.contains(
-            r"[BJOUXZ]", regex=True
+            self.invalid_amino_acids, regex=True
         )].index
         self._obj = self._obj.drop(index=invalid_indices).reset_index(drop=True)
 

diff --git a/ms2rescore/parse_mgf.py b/ms2rescore/parse_mgf.py
@@ -3,12 +3,21 @@
 import logging
 import mmap
 import os.path
+import re
 
 from tqdm import tqdm
 
+from ms2rescore._exceptions import MS2ReScoreError
+
 logger = logging.getLogger(__name__)
 
 
+class ParseMGFError(MS2ReScoreError):
+    """Error parsing MGF file."""
+
+    pass
+
+
 def get_num_lines(file_path):
     fp = open(file_path, "r+")
     buf = mmap.mmap(fp.fileno(), 0)
@@ -18,7 +27,7 @@ def get_num_lines(file_path):
     return lines
 
 
-def title_parser(line, method='full'):
+def title_parser(line, method='full', run=None):
     """
     Take an MGF TITLE line and return the spectrum title.
 
@@ -32,8 +41,7 @@ def title_parser(line, method='full'):
     - 'first_space': take everything between 'TITLE=' and first space.
     - 'first_space_no_charge': take everything between 'TITLE=' and first space,
       but leave out everything after last dot. (required for MaxQuant pipeline).
-    - 'TRFP_MQ': For MGF parsed with ThermoRawFileParser and spec_ids from
-      MaxQuant msms.txt.
+    - 'run.scan.scan': Extract scan number and merge with run name (for MaxQuant IDs).
     """
 
     if method == 'full':
@@ -42,11 +50,17 @@ def title_parser(line, method='full'):
         title = line[6:].split(' ')[0].strip()
     elif method == 'first_space_no_charge':
         title = '.'.join(line[6:].split(' ')[0].split('.')[:-1]).strip()
-    elif method == 'TRFP_MQ':
-        line = line.strip().split('mzspec=')[1].split(' ')
-        filename = line[0].replace('.raw:', '')
-        scan = line[3].replace('scan=', '')
-        title = '.'.join([filename, scan, scan])
+    elif method == 'run.scan.scan':
+        if not run:
+            raise TypeError("If `method` is `run.scan.scan`, `run` cannot be None.")
+        scan_m = re.match(r"TITLE=.*scan=([0-9]+).*$", line)
+        if scan_m:
+            scan = scan_m.group(1)
+        else:
+            raise ParseMGFError(
+                f"Could not extract scan number from TITLE field: `{line.strip()}`"
+            )
+        title = '.'.join([run, scan, scan])
     else:
         raise ValueError("method '{}' is not a valid title parsing method".format(
             method
@@ -65,7 +79,7 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf',
     if df_in[spec_title_col].duplicated().any():
         logger.warning("Duplicate spec_id's found in PeptideRecord.")
 
-    if df_in[filename_col].iloc[0][-4:] in ['.mgf', '.MGF']:
+    if df_in[filename_col].iloc[0][-4:].lower() == '.mgf':
         file_suffix = ''
     else:
         file_suffix = '.mgf'
@@ -76,19 +90,19 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf',
     with open(outname, 'w') as out:
         count = 0
         for run in runs:
-            found = False
             current_mgf_file = os.path.join(mgf_folder, run + file_suffix)
             spec_set = set(df_in[(df_in[filename_col] == run)][spec_title_col].values)
 
             # Temporary fix: replace charges in MGF with ID'ed charges
             # Until MS2PIP uses ID'ed charge instead of MGF charge
             id_charges = df_in[(df_in[filename_col] == run)].set_index('spec_id')['charge'].to_dict()
 
+            found = False
             with open(current_mgf_file, 'r') as f:
                 iterator = tqdm(f, total=get_num_lines(current_mgf_file)) if show_progress_bar else f
                 for line in iterator:
                     if 'TITLE=' in line:
-                        title = title_parser(line, method=title_parsing_method)
+                        title = title_parser(line, method=title_parsing_method, run=run)
                         if title in spec_set:
                             found = True
                             line = "TITLE=" + title + "\n"
@@ -115,6 +129,5 @@ def parse_mgf(df_in, mgf_folder, outname='scan_mgf_result.mgf',
     logger.debug(
         "%i/%i spectra found and written to new MGF file.", count, num_expected
     )
-    assert (
-        count == num_expected
-    ), "Not all PSMs could be found in the provided MGF files"
+    if count != num_expected:
+        raise ParseMGFError("Not all PSMs could be found in the provided MGF files.")
diff --git a/ms2rescore/peptideshaker.py b/ms2rescore/peptideshaker.py
@@ -34,18 +34,33 @@ def __init__(self, pandas_obj: pd.DataFrame) -> None:
     def _validate(self):
         """Validate Pandas DataFrame as Extended PSM Report."""
         # TODO: Implement validation of PSM report DataFrame
-        pass
+        self.drop_invalid_amino_acids()
+
+    def drop_invalid_amino_acids(self, invalid_amino_acids=r"[BJOUXZ]"):
+        """Drop all PSMs (rows) with peptides containing invalid amino acids."""
+        to_drop = self._obj[
+            self._obj['Sequence'].str.contains(invalid_amino_acids, regex=True)
+        ].index
+        if len(to_drop) > 0:
+            logger.warning(
+                "Dropping %i PSMs from report due to invalid amino acids (%s)",
+                len(to_drop),
+                invalid_amino_acids
+            )
+            self._obj = self._obj.drop(index=to_drop)
 
     @staticmethod
     def from_tsv(path: Union[str, os.PathLike]) -> pd.DataFrame:
         """Read Extended PSM Report from TSV file."""
         ext_psm_report = pd.read_csv(path, sep="\t", index_col=0)
+        ext_psm_report.ext_psm_report._validate()
         return ext_psm_report
 
     @staticmethod
     def from_xls(path: Union[str, os.PathLike]) -> pd.DataFrame:
         """Read Extended PSM Report from XLS file."""
         ext_psm_report = pd.read_excel(path, sheet_name=0, index_col=0)
+        ext_psm_report.ext_psm_report._validate()
         return ext_psm_report
 
     @staticmethod