From a30dcefd17edfed7898eb9bdf7b0df4d1491967e Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Mon, 27 Nov 2023 14:30:15 +0100 Subject: [PATCH 01/14] refactored _generate_preds_dict and _normalize_spectra for compatibilty of new constructor --- ms2pip/spectrum_output.py | 141 +++++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 63 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index a8db9168..8351ff74 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -124,69 +124,84 @@ def __init__( # self.peprec_dict = peprec_tmp.to_dict(orient="index") - # def _generate_preds_dict(self): - # """ - # Create easy to access dict from peprec dataframes - # """ - # self.preds_dict = {} - # preds_list = self.all_preds[ - # ["spec_id", "charge", "ion", "ionnumber", "mz", "prediction"] - # ].values.tolist() - - # for row in preds_list: - # spec_id = row[0] - # if spec_id in self.preds_dict.keys(): - # if row[2] in self.preds_dict[spec_id]["peaks"]: - # self.preds_dict[spec_id]["peaks"][row[2]].append(tuple(row[3:])) - # else: - # self.preds_dict[spec_id]["peaks"][row[2]] = [tuple(row[3:])] - # else: - # self.preds_dict[spec_id] = { - # "charge": row[1], - # "peaks": {row[2]: [tuple(row[3:])]}, - # } - - # def _normalize_spectra(self, method="basepeak_10000"): - # """ - # Normalize spectra - # """ - # if self.is_log_space: - # self.all_preds["prediction"] = ((2 ** self.all_preds["prediction"]) - 0.001).clip( - # lower=0 - # ) - # self.is_log_space = False - - # if method == "basepeak_10000": - # if self.normalization == "basepeak_10000": - # pass - # elif self.normalization == "basepeak_1": - # self.all_preds["prediction"] *= 10000 - # else: - # self.all_preds["prediction"] = self.all_preds.groupby( - # ["spec_id"], group_keys=False - # )["prediction"].apply(lambda x: (x / x.max()) * 10000) - # self.normalization = "basepeak_10000" - - # elif method == "basepeak_1": - # if self.normalization == "basepeak_1": - # pass - # elif self.normalization == "basepeak_10000": - # self.all_preds["prediction"] /= 10000 - # else: - # self.all_preds["prediction"] = self.all_preds.groupby( - # ["spec_id"], group_keys=False - # )["prediction"].apply(lambda x: (x / x.max())) - # self.normalization = "basepeak_1" - - # elif method == "tic": - # if self.normalization != "tic": - # self.all_preds["prediction"] = self.all_preds.groupby( - # ["spec_id"], group_keys=False - # )["prediction"].apply(lambda x: x / x.sum()) - # self.normalization = "tic" - - # else: - # raise NotImplementedError + def _generate_preds_dict(self): + """ + Create easy to access dict from results data. + """ + self.preds_dict = {} + + for result in self.results: + spec_id = result.psm_index + if spec_id in self.preds_dict.keys(): + for ion, ion_numbers in result.theoretical_mz.items(): + for i, ion_number in enumerate(ion_numbers): + mz = result.theoretical_mz[ion][i] + prediction = result.predicted_intensity[ion][i] if result.predicted_intensity else None + + if ion in self.preds_dict[spec_id]["peaks"]: + self.preds_dict[spec_id]["peaks"][ion].append((ion_number, mz, prediction)) + else: + self.preds_dict[spec_id]["peaks"][ion] = [(ion_number, mz, prediction)] + else: + self.preds_dict[spec_id] = { + "peaks": {} + } + + if result.psm and result.psm.peptidoform: + self.preds_dict[spec_id]["charge"] = result.psm.peptidoform.precursor_charge + + + def _normalize_spectra(self, method="basepeak_10000"): + """ + Normalize spectra + """ + if self.is_log_space: + for result in self.results: + if result.predicted_intensity: + for ion, intensities in result.predicted_intensity.items(): + result.predicted_intensity[ion] = ((2 ** intensities) - 0.001).clip(lower=0) + self.is_log_space = False + + if method == "basepeak_10000": + if self.normalization == "basepeak_10000": + pass + elif self.normalization == "basepeak_1": + for result in self.results: + if result.predicted_intensity: + for ion, intensities in result.predicted_intensity.items(): + result.predicted_intensity[ion] *= 10000 + else: + for result in self.results: + if result.predicted_intensity: + for ion, intensities in result.predicted_intensity.items(): + result.predicted_intensity[ion] = (intensities / intensities.max()) * 10000 + self.normalization = "basepeak_10000" + + elif method == "basepeak_1": + if self.normalization == "basepeak_1": + pass + elif self.normalization == "basepeak_10000": + for result in self.results: + if result.predicted_intensity: + for ion, intensities in result.predicted_intensity.items(): + result.predicted_intensity[ion] /= 10000 + else: + for result in self.results: + if result.predicted_intensity: + for ion, intensities in result.predicted_intensity.items(): + result.predicted_intensity[ion] = (intensities / intensities.max()) + self.normalization = "basepeak_1" + + elif method == "tic": + if self.normalization != "tic": + for result in self.results: + if result.predicted_intensity: + for ion, intensities in result.predicted_intensity.items(): + result.predicted_intensity[ion] = (intensities / intensities.sum()) + self.normalization = "tic" + + else: + raise NotImplementedError def _get_msp_peak_annotation( self, From e4ff67f37dd5462e334a0c335e28c9e4b3f68dd7 Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Fri, 8 Dec 2023 10:45:04 +0100 Subject: [PATCH 02/14] refactored write_msp and added basic normalization methods --- ms2pip/spectrum_output.py | 129 +++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 58 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index 8351ff74..243c5b3d 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -124,31 +124,31 @@ def __init__( # self.peprec_dict = peprec_tmp.to_dict(orient="index") - def _generate_preds_dict(self): - """ - Create easy to access dict from results data. - """ - self.preds_dict = {} + # def _generate_preds_dict(self): + # """ + # Create easy to access dict from results data. + # """ + # self.preds_dict = {} - for result in self.results: - spec_id = result.psm_index - if spec_id in self.preds_dict.keys(): - for ion, ion_numbers in result.theoretical_mz.items(): - for i, ion_number in enumerate(ion_numbers): - mz = result.theoretical_mz[ion][i] - prediction = result.predicted_intensity[ion][i] if result.predicted_intensity else None - - if ion in self.preds_dict[spec_id]["peaks"]: - self.preds_dict[spec_id]["peaks"][ion].append((ion_number, mz, prediction)) - else: - self.preds_dict[spec_id]["peaks"][ion] = [(ion_number, mz, prediction)] - else: - self.preds_dict[spec_id] = { - "peaks": {} - } + # for result in self.results: + # spec_id = result.psm_index + # if spec_id in self.preds_dict.keys(): + # for ion, ion_numbers in result.theoretical_mz.items(): + # for i, ion_number in enumerate(ion_numbers): + # mz = result.theoretical_mz[ion][i] + # prediction = result.predicted_intensity[ion][i] if result.predicted_intensity else None + + # if ion in self.preds_dict[spec_id]["peaks"]: + # self.preds_dict[spec_id]["peaks"][ion].append((ion_number, mz, prediction)) + # else: + # self.preds_dict[spec_id]["peaks"][ion] = [(ion_number, mz, prediction)] + # else: + # self.preds_dict[spec_id] = { + # "peaks": {} + # } - if result.psm and result.psm.peptidoform: - self.preds_dict[spec_id]["charge"] = result.psm.peptidoform.precursor_charge + # if result.psm and result.psm.peptidoform: + # self.preds_dict[spec_id]["charge"] = result.psm.peptidoform.precursor_charge def _normalize_spectra(self, method="basepeak_10000"): @@ -202,6 +202,15 @@ def _normalize_spectra(self, method="basepeak_10000"): else: raise NotImplementedError + + + def _tic_normalize(intensities): + """Normalize intensities to total ion current (TIC).""" + return intensities / intensities.sum() + + def _basepeak_normalize(intensities): + """Normalize intensities to most intense peak.""" + return intensities / intensities.max() def _get_msp_peak_annotation( self, @@ -320,52 +329,56 @@ def write_results(self, output_formats: List[str]) -> Dict[str, Any]: results[output_format] = writer(self) return results - @output_format("msp") - @writer( - file_suffix="_predictions.msp", - normalization_method="basepeak_10000", - requires_dicts=True, - requires_diff_modifications=False, - ) - def write_msp(self, file_object): + # @output_format("msp") + # @writer( + # file_suffix="_predictions.msp", + # normalization_method="basepeak_10000", + # ) + def write_msp(self, file_object, normalization_method="basepeak_10000"): """ Construct MSP string and write to file_object. """ - for spec_id in sorted(self.peprec_dict.keys()): - seq = self.peprec_dict[spec_id]["peptide"] - mods = self.peprec_dict[spec_id]["modifications"] - charge = self.peprec_dict[spec_id]["charge"] - prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - msp_modifications = self._get_msp_modifications(seq, mods) - num_peaks = sum( - [len(peaklist) for _, peaklist in self.preds_dict[spec_id]["peaks"].items()] - ) + for result in self.results: - comment_line = f" Mods={msp_modifications} Parent={prec_mz}" + sequence = result.psm.peptidoform.sequence + charge = result.psm.peptidoform.precursor_charge + mw = result.psm.peptidoform.theoretical_mass - if self.has_protein_list: - protein_list = self.peprec_dict[spec_id]["protein_list"] - protein_string = self._parse_protein_string(protein_list) - comment_line += f' Protein="{protein_string}"' + result_spectrum = result.as_spectra()[0] - if self.has_rt: - rt = self.peprec_dict[spec_id]["rt"] - comment_line += f" RetentionTime={rt}" + num_peaks = len(result_spectrum.mz) - comment_line += f' MS2PIP_ID="{spec_id}"' + if normalization_method == "basepeak_10000": + intensity_normalized = self._basepeak_normalize(result_spectrum.intensity) * 10000 + elif normalization_method == "tic": + intensity_normalized = self._tic_normalize(result_spectrum.intensity) + else: + intensity_normalized = result_spectrum.intensity + + peaks = zip(result_spectrum.mz, intensity_normalized, result_spectrum.annotations) + + # TODO: change modification info for comment line to use result + + # comment_line = f" Mods={msp_modifications} Parent={prec_mz}" + + # if self.has_protein_list: + # protein_list = self.peprec_dict[spec_id]["protein_list"] + # protein_string = self._parse_protein_string(protein_list) + # comment_line += f' Protein="{protein_string}"' + + # if self.has_rt: + # rt = self.peprec_dict[spec_id]["rt"] + # comment_line += f" RetentionTime={rt}" + + # comment_line += f' MS2PIP_ID="{spec_id}"' out = [ - f"Name: {seq}/{charge}", - f"MW: {prec_mass}", - f"Comment:{comment_line}", + f"Name: {sequence}/{charge}", + f"MW: {mw}", + # f"Comment:{comment_line}", f"Num peaks: {num_peaks}", - self._get_msp_peak_annotation( - self.preds_dict[spec_id]["peaks"], - sep="\t", - include_annotations=True, - intensity_type=int, - ), + f"{mz}\t{intensity}\t{annotation}/0.0ppm\n" for mz, intensity, annotation in peaks ] file_object.writelines([line + "\n" for line in out] + ["\n"]) From c39a69fe788bd152c038a41d10a57681709f2fd7 Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Mon, 11 Dec 2023 09:54:27 +0100 Subject: [PATCH 03/14] adjust: fix format issue in write_msp output --- ms2pip/spectrum_output.py | 1301 ++++++++++++++++--------------------- 1 file changed, 562 insertions(+), 739 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index 243c5b3d..dbd0b4b5 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -15,6 +15,7 @@ from time import localtime, strftime from typing import Any, Dict, List +import numpy as np from ms2pip.result import ProcessingResult @@ -25,91 +26,6 @@ class InvalidWriteModeError(ValueError): pass -# Writer decorator -def writer(**kwargs): - def deco(write_function): - @wraps(write_function) - def wrapper(self): - return self._write_general(write_function, **kwargs) - - return wrapper - - return deco - - -def output_format(output_format): - class OutputFormat: - def __init__(self, fn): - self.fn = fn - self.output_format = output_format - - def __set_name__(self, owner, name): - owner.OUTPUT_FORMATS[self.output_format] = self.fn - setattr(owner, name, self.fn) - - return OutputFormat - - -class SpectrumOutput: - """Write MS2PIP predictions to various output formats.""" - - OUTPUT_FORMATS = {} - - def __init__( - self, - results: List["ProcessingResult"], - output_filename="ms2pip_predictions", - write_mode="wt+", - return_stringbuffer=False, - is_log_space=True, - ): - """ - Write MS2PIP predictions to various output formats. - - Parameters - ---------- - results: - List of ProcessingResult objects - output_filename: str, optional - path and name for output files, will be suffexed with `_predictions` and the - relevant file extension (default: ms2pip_predictions) - write_mode: str, optional - write mode to use: "wt+" to append to start a new file, "at" to append to an - existing file (default: "wt+") - return_stringbuffer: bool, optional - If True, files are written to a StringIO object, which the write function - returns. If False, files are written to a file on disk. - is_log_space: bool, optional - Set to true if predicted intensities in `all_preds` are in log-space. In that - case, intensities will first be transformed to "normal"-space. - - Example - ------- - >>> so = ms2pip.spectrum_tools.spectrum_output.SpectrumOutput( - results - ) - >>> so.write_msp() - >>> so.write_spectronaut() - - """ - - self.results = results - self.output_filename = output_filename - self.write_mode = write_mode - self.return_stringbuffer = return_stringbuffer - self.is_log_space = is_log_space - - self.diff_modification_mapping = {} - - self.has_rt = "rt" in self.peprec.columns - self.has_protein_list = "protein_list" in self.peprec.columns - - if self.write_mode not in ["wt+", "wt", "at", "w", "a"]: - raise InvalidWriteModeError(self.write_mode) - - if "a" in self.write_mode and self.return_stringbuffer: - raise InvalidWriteModeError(self.write_mode) - # def _generate_peprec_dict(self, rt_to_seconds=True): # """ # Create easy to access dict from all_preds and peprec dataframes @@ -149,702 +65,609 @@ def __init__( # if result.psm and result.psm.peptidoform: # self.preds_dict[spec_id]["charge"] = result.psm.peptidoform.precursor_charge - - - def _normalize_spectra(self, method="basepeak_10000"): - """ - Normalize spectra - """ - if self.is_log_space: - for result in self.results: - if result.predicted_intensity: - for ion, intensities in result.predicted_intensity.items(): - result.predicted_intensity[ion] = ((2 ** intensities) - 0.001).clip(lower=0) - self.is_log_space = False - - if method == "basepeak_10000": - if self.normalization == "basepeak_10000": - pass - elif self.normalization == "basepeak_1": - for result in self.results: - if result.predicted_intensity: - for ion, intensities in result.predicted_intensity.items(): - result.predicted_intensity[ion] *= 10000 - else: - for result in self.results: - if result.predicted_intensity: - for ion, intensities in result.predicted_intensity.items(): - result.predicted_intensity[ion] = (intensities / intensities.max()) * 10000 - self.normalization = "basepeak_10000" - - elif method == "basepeak_1": - if self.normalization == "basepeak_1": - pass - elif self.normalization == "basepeak_10000": - for result in self.results: - if result.predicted_intensity: - for ion, intensities in result.predicted_intensity.items(): - result.predicted_intensity[ion] /= 10000 - else: - for result in self.results: - if result.predicted_intensity: - for ion, intensities in result.predicted_intensity.items(): - result.predicted_intensity[ion] = (intensities / intensities.max()) - self.normalization = "basepeak_1" - - elif method == "tic": - if self.normalization != "tic": - for result in self.results: - if result.predicted_intensity: - for ion, intensities in result.predicted_intensity.items(): - result.predicted_intensity[ion] = (intensities / intensities.sum()) - self.normalization = "tic" - - else: - raise NotImplementedError - - - def _tic_normalize(intensities): - """Normalize intensities to total ion current (TIC).""" - return intensities / intensities.sum() - - def _basepeak_normalize(intensities): - """Normalize intensities to most intense peak.""" - return intensities / intensities.max() - - def _get_msp_peak_annotation( - self, - peak_dict, - sep="\t", - include_zero=False, - include_annotations=True, - intensity_type=float, - ): - """ - Get MGF/MSP-like peaklist string - """ - all_peaks = [] - for ion_type, peaks in peak_dict.items(): - for peak in peaks: - if not include_zero and peak[2] == 0: - continue - if include_annotations: - all_peaks.append( - ( - peak[1], - f'{peak[1]:.6f}{sep}{intensity_type(peak[2])}{sep}"{ion_type.lower()}{peak[0]}/0.0"', - ) - ) - else: - all_peaks.append((peak[1], f"{peak[1]:.6f}{sep}{peak[2]}")) - - all_peaks = sorted(all_peaks, key=itemgetter(0)) - peak_string = "\n".join([peak[1] for peak in all_peaks]) - - return peak_string - - def _get_msp_modifications(self, sequence, modifications): - """ - Format modifications in MSP-style, e.g. "1/0,E,Glu->pyro-Glu" - """ - - if isinstance(modifications, str): - if not modifications or modifications == "-": - msp_modifications = "0" - else: - mods = modifications.split("|") - mods = [(int(mods[i]), mods[i + 1]) for i in range(0, len(mods), 2)] - mods = [(x, y) if x == 0 else (x - 1, y) for (x, y) in mods] - mods = sorted(mods) - mods = [(str(x), sequence[x], y) for (x, y) in mods] - msp_modifications = "/".join([",".join(list(x)) for x in mods]) - msp_modifications = f"{len(mods)}/{msp_modifications}" - else: + +def _tic_normalize(intensities: np.array): + """Normalize intensities to total ion current (TIC).""" + return intensities / intensities.sum() + +def _basepeak_normalize(intensities: np.array): + """Normalize intensities to most intense peak.""" + return intensities / intensities.max() + +def _get_msp_modifications(sequence, modifications): + """ + Format modifications in MSP-style, e.g. "1/0,E,Glu->pyro-Glu" + """ + + if isinstance(modifications, str): + if not modifications or modifications == "-": msp_modifications = "0" - - return msp_modifications - - def _parse_protein_string(self, protein_list): - """ - Parse protein string from list, list string literal, or string. - """ - if isinstance(protein_list, list): - protein_string = "/".join(protein_list) - elif isinstance(protein_list, str): - try: - protein_string = "/".join(literal_eval(protein_list)) - except ValueError: - protein_string = protein_list else: - protein_string = "" - return protein_string - - def _get_last_ssl_scannr(self): - """ - Return scan number of last line in a Bibliospec SSL file. - """ - ssl_filename = "{}_predictions.ssl".format(self.output_filename) - with open(ssl_filename, "rt") as ssl: - for line in ssl: - last_line = line - last_scannr = int(last_line.split("\t")[1]) - return last_scannr - - def _generate_diff_modification_mapping(self, precision): - """ - Make modification name -> ssl modification name mapping. - """ - self.diff_modification_mapping[precision] = { - ptm.split(",")[0]: "{0:+.{1}f}".format(float(ptm.split(",")[1]), precision) - for ptm in self.params["ptm"] - } - - def _get_diff_modified_sequence(self, sequence, modifications, precision=1): - """ - Build BiblioSpec SSL modified sequence string. - """ - pep = list(sequence) - mapping = self.diff_modification_mapping[precision] - - for loc, name in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): - # C-term mod - if loc == "-1": - pep[-1] = pep[-1] + "[{}]".format(mapping[name]) - # N-term mod - elif loc == "0": - pep[0] = pep[0] + "[{}]".format(mapping[name]) - # Normal mod - else: - pep[int(loc) - 1] = pep[int(loc) - 1] + "[{}]".format(mapping[name]) - return "".join(pep) - - def write_results(self, output_formats: List[str]) -> Dict[str, Any]: - """ - Write MS2PIP predictions in output formats defined by output_formats. - """ - results = {} - for output_format in output_formats: - output_format = output_format.lower() - writer = self.OUTPUT_FORMATS[output_format] - results[output_format] = writer(self) - return results + mods = modifications.split("|") + mods = [(int(mods[i]), mods[i + 1]) for i in range(0, len(mods), 2)] + mods = [(x, y) if x == 0 else (x - 1, y) for (x, y) in mods] + mods = sorted(mods) + mods = [(str(x), sequence[x], y) for (x, y) in mods] + msp_modifications = "/".join([",".join(list(x)) for x in mods]) + msp_modifications = f"{len(mods)}/{msp_modifications}" + else: + msp_modifications = "0" + + return msp_modifications + +def _parse_protein_string(protein_list): + """ + Parse protein string from list, list string literal, or string. + """ + if isinstance(protein_list, list): + protein_string = "/".join(protein_list) + elif isinstance(protein_list, str): + try: + protein_string = "/".join(literal_eval(protein_list)) + except ValueError: + protein_string = protein_list + else: + protein_string = "" + return protein_string + + # def _get_last_ssl_scannr(self): + # """ + # Return scan number of last line in a Bibliospec SSL file. + # """ + # ssl_filename = "{}_predictions.ssl".format(self.output_filename) + # with open(ssl_filename, "rt") as ssl: + # for line in ssl: + # last_line = line + # last_scannr = int(last_line.split("\t")[1]) + # return last_scannr + + # def _generate_diff_modification_mapping(self, precision): + # """ + # Make modification name -> ssl modification name mapping. + # """ + # self.diff_modification_mapping[precision] = { + # ptm.split(",")[0]: "{0:+.{1}f}".format(float(ptm.split(",")[1]), precision) + # for ptm in self.params["ptm"] + # } + + # def _get_diff_modified_sequence(self, sequence, modifications, precision=1): + # """ + # Build BiblioSpec SSL modified sequence string. + # """ + # pep = list(sequence) + # mapping = self.diff_modification_mapping[precision] + + # for loc, name in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): + # # C-term mod + # if loc == "-1": + # pep[-1] = pep[-1] + "[{}]".format(mapping[name]) + # # N-term mod + # elif loc == "0": + # pep[0] = pep[0] + "[{}]".format(mapping[name]) + # # Normal mod + # else: + # pep[int(loc) - 1] = pep[int(loc) - 1] + "[{}]".format(mapping[name]) + # return "".join(pep) # @output_format("msp") # @writer( # file_suffix="_predictions.msp", # normalization_method="basepeak_10000", + # requires_dicts=True, + # requires_diff_modifications=False, # ) - def write_msp(self, file_object, normalization_method="basepeak_10000"): - """ - Construct MSP string and write to file_object. - """ - - for result in self.results: - - sequence = result.psm.peptidoform.sequence - charge = result.psm.peptidoform.precursor_charge - mw = result.psm.peptidoform.theoretical_mass - - result_spectrum = result.as_spectra()[0] - - num_peaks = len(result_spectrum.mz) - - if normalization_method == "basepeak_10000": - intensity_normalized = self._basepeak_normalize(result_spectrum.intensity) * 10000 - elif normalization_method == "tic": - intensity_normalized = self._tic_normalize(result_spectrum.intensity) - else: - intensity_normalized = result_spectrum.intensity - - peaks = zip(result_spectrum.mz, intensity_normalized, result_spectrum.annotations) - - # TODO: change modification info for comment line to use result - - # comment_line = f" Mods={msp_modifications} Parent={prec_mz}" - - # if self.has_protein_list: - # protein_list = self.peprec_dict[spec_id]["protein_list"] - # protein_string = self._parse_protein_string(protein_list) - # comment_line += f' Protein="{protein_string}"' - - # if self.has_rt: - # rt = self.peprec_dict[spec_id]["rt"] - # comment_line += f" RetentionTime={rt}" - - # comment_line += f' MS2PIP_ID="{spec_id}"' - - out = [ - f"Name: {sequence}/{charge}", - f"MW: {mw}", - # f"Comment:{comment_line}", - f"Num peaks: {num_peaks}", - f"{mz}\t{intensity}\t{annotation}/0.0ppm\n" for mz, intensity, annotation in peaks - ] - - file_object.writelines([line + "\n" for line in out] + ["\n"]) - - @output_format("mgf") - @writer( - file_suffix="_predictions.mgf", - normalization_method="basepeak_10000", - requires_dicts=True, - requires_diff_modifications=False, - ) - def write_mgf(self, file_object): - """ - Construct MGF string and write to file_object - """ - for spec_id in sorted(self.peprec_dict.keys()): - seq = self.peprec_dict[spec_id]["peptide"] - mods = self.peprec_dict[spec_id]["modifications"] - charge = self.peprec_dict[spec_id]["charge"] - _, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - msp_modifications = self._get_msp_modifications(seq, mods) - if self.has_protein_list: - protein_list = self.peprec_dict[spec_id]["protein_list"] - protein_string = self._parse_protein_string(protein_list) - else: - protein_string = "" - - out = [ - "BEGIN IONS", - f"TITLE={spec_id} {seq}/{charge} {msp_modifications} {protein_string}", - f"PEPMASS={prec_mz}", - f"CHARGE={charge}+", - ] - - if self.has_rt: - rt = self.peprec_dict[spec_id]["rt"] - out.append(f"RTINSECONDS={rt}") - - out.append( - self._get_msp_peak_annotation( - self.preds_dict[spec_id]["peaks"], - sep=" ", - include_annotations=False, - ) - ) - out.append("END IONS\n") - file_object.writelines([line + "\n" for line in out]) - - @output_format("spectronaut") - @writer( - file_suffix="_predictions_spectronaut.csv", - normalization_method="tic", - requires_dicts=False, - requires_diff_modifications=True, - ) - def write_spectronaut(self, file_obj): - """ - Construct spectronaut DataFrame and write to file_object. - """ - if "w" in self.write_mode: - header = True - elif "a" in self.write_mode: - header = False - else: - raise InvalidWriteModeError(self.write_mode) - spectronaut_peprec = self.peprec.copy() +def write_msp(processing_results: List[ProcessingResult], file_object): + """Construct MSP string and write to file_object.""" + for result in processing_results: + sequence = result.psm.peptidoform.sequence + charge = result.psm.peptidoform.precursor_charge + mw = result.psm.peptidoform.theoretical_mass + result_spectrum = result.as_spectra()[0] + num_peaks = len(result_spectrum.mz) + intensity_normalized = _basepeak_normalize(result_spectrum.intensity) * 10000 - # ModifiedPeptide and PrecursorMz columns - spectronaut_peprec["ModifiedPeptide"] = spectronaut_peprec.apply( - lambda row: self._get_diff_modified_sequence(row["peptide"], row["modifications"]), - axis=1, - ) - spectronaut_peprec["PrecursorMz"] = spectronaut_peprec.apply( - lambda row: self.mods.calc_precursor_mz( - row["peptide"], row["modifications"], row["charge"] - )[1], - axis=1, - ) - spectronaut_peprec["ModifiedPeptide"] = "_" + spectronaut_peprec["ModifiedPeptide"] + "_" + peaks = zip(result_spectrum.mz, intensity_normalized, result_spectrum.annotations) - # Additional columns - spectronaut_peprec["FragmentLossType"] = "noloss" + # TODO: change modification info for comment line to use result + # comment_line = f" Mods={msp_modifications} Parent={prec_mz}" - # Retention time - if "rt" in spectronaut_peprec.columns: - rt_cols = ["iRT"] - spectronaut_peprec["iRT"] = spectronaut_peprec["rt"] - else: - rt_cols = [] + # if self.has_protein_list: + # protein_list = self.peprec_dict[spec_id]["protein_list"] + # protein_string = self._parse_protein_string(protein_list) + # comment_line += f' Protein="{protein_string}"' - # ProteinId - if self.has_protein_list: - spectronaut_peprec["ProteinId"] = spectronaut_peprec["protein_list"].apply( - self._parse_protein_string - ) - else: - spectronaut_peprec["ProteinId"] = spectronaut_peprec["spec_id"] + # if self.has_rt: + # rt = self.peprec_dict[spec_id]["rt"] + # comment_line += f" RetentionTime={rt}" - # Rename columns and merge with predictions - spectronaut_peprec = spectronaut_peprec.rename( - columns={"charge": "PrecursorCharge", "peptide": "StrippedPeptide"} - ) - peptide_cols = ( - [ - "ModifiedPeptide", - "StrippedPeptide", - "PrecursorCharge", - "PrecursorMz", - "ProteinId", - ] - + rt_cols - + ["FragmentLossType"] - ) - spectronaut_df = spectronaut_peprec[peptide_cols + ["spec_id"]] - spectronaut_df = self.all_preds.merge(spectronaut_df, on="spec_id") + # comment_line += f' MS2PIP_ID="{spec_id}"' - # Fragment columns - spectronaut_df["FragmentCharge"] = ( - spectronaut_df["ion"].str.contains("2").map({True: 2, False: 1}) - ) - spectronaut_df["FragmentType"] = spectronaut_df["ion"].str[0].str.lower() - - # Rename and sort columns - spectronaut_df = spectronaut_df.rename( - columns={ - "mz": "FragmentMz", - "prediction": "RelativeIntensity", - "ionnumber": "FragmentNumber", - } - ) - fragment_cols = [ - "FragmentCharge", - "FragmentMz", - "RelativeIntensity", - "FragmentType", - "FragmentNumber", + out = [ + f"Name: {sequence}/{charge}", + f"MW: {mw}", + # f"Comment:{comment_line}", + f"Num peaks: {num_peaks}", + [f"{mz}\t{intensity}\t{annotation}/0.0\n" for mz, intensity, annotation in peaks] + ] + file_object.writelines(''.join(line) if isinstance(line, list) else line + "\n" for line in out) + +# @output_format("mgf") +# @writer( +# file_suffix="_predictions.mgf", +# normalization_method="basepeak_10000", +# requires_dicts=True, +# requires_diff_modifications=False, +# ) +def write_mgf(self, file_object, normalization_method="basepeak_10000"): + """ + Construct MGF string and write to file_object + """ + + for result in self.results: + + sequence = result.psm.peptidoform.sequence + charge = result.psm.peptidoform.precursor_charge + mw = result.psm.peptidoform.theoretical_mass + + out = [ + "BEGIN IONs", + f"TITLE={result.psm_index} {sequence}/{charge}" + f"CHARGE={charge}+" ] - spectronaut_df = spectronaut_df[peptide_cols + fragment_cols] - try: - spectronaut_df.to_csv( - file_obj, index=False, header=header, sep=";", lineterminator="\n" - ) - except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) - spectronaut_df.to_csv( - file_obj, index=False, header=header, sep=";", line_terminator="\n" - ) - return file_obj + for spec_id in sorted(self.peprec_dict.keys()): + seq = self.peprec_dict[spec_id]["peptide"] + mods = self.peprec_dict[spec_id]["modifications"] + charge = self.peprec_dict[spec_id]["charge"] + _, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) + msp_modifications = self._get_msp_modifications(seq, mods) + + if self.has_protein_list: + protein_list = self.peprec_dict[spec_id]["protein_list"] + protein_string = self._parse_protein_string(protein_list) + else: + protein_string = "" - def _write_bibliospec_core(self, file_obj_ssl, file_obj_ms2, start_scannr=0): - """Construct Bibliospec SSL/MS2 strings and write to file_objects.""" + out = [ + "BEGIN IONS", + f"TITLE={spec_id} {seq}/{charge} {msp_modifications} {protein_string}", + f"PEPMASS={prec_mz}", + f"CHARGE={charge}+", + ] - for i, spec_id in enumerate(sorted(self.preds_dict.keys())): - scannr = i + start_scannr - seq = self.peprec_dict[spec_id]["peptide"] - mods = self.peprec_dict[spec_id]["modifications"] - charge = self.peprec_dict[spec_id]["charge"] - prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - ms2_filename = os.path.basename(self.output_filename) + "_predictions.ms2" + if self.has_rt: + rt = self.peprec_dict[spec_id]["rt"] + out.append(f"RTINSECONDS={rt}") - peaks = self._get_msp_peak_annotation( + out.append( + self._get_msp_peak_annotation( self.preds_dict[spec_id]["peaks"], - sep="\t", + sep=" ", include_annotations=False, ) + ) + out.append("END IONS\n") + file_object.writelines([line for line in out] + ["\n"]) + +# @output_format("spectronaut") +# @writer( +# file_suffix="_predictions_spectronaut.csv", +# normalization_method="tic", +# requires_dicts=False, +# requires_diff_modifications=True, +# ) +# def write_spectronaut(self, file_obj): +# """ +# Construct spectronaut DataFrame and write to file_object. +# """ +# if "w" in self.write_mode: +# header = True +# elif "a" in self.write_mode: +# header = False +# else: +# raise InvalidWriteModeError(self.write_mode) + +# spectronaut_peprec = self.peprec.copy() + +# # ModifiedPeptide and PrecursorMz columns +# spectronaut_peprec["ModifiedPeptide"] = spectronaut_peprec.apply( +# lambda row: self._get_diff_modified_sequence(row["peptide"], row["modifications"]), +# axis=1, +# ) +# spectronaut_peprec["PrecursorMz"] = spectronaut_peprec.apply( +# lambda row: self.mods.calc_precursor_mz( +# row["peptide"], row["modifications"], row["charge"] +# )[1], +# axis=1, +# ) +# spectronaut_peprec["ModifiedPeptide"] = "_" + spectronaut_peprec["ModifiedPeptide"] + "_" + +# # Additional columns +# spectronaut_peprec["FragmentLossType"] = "noloss" + +# # Retention time +# if "rt" in spectronaut_peprec.columns: +# rt_cols = ["iRT"] +# spectronaut_peprec["iRT"] = spectronaut_peprec["rt"] +# else: +# rt_cols = [] + +# # ProteinId +# if self.has_protein_list: +# spectronaut_peprec["ProteinId"] = spectronaut_peprec["protein_list"].apply( +# self._parse_protein_string +# ) +# else: +# spectronaut_peprec["ProteinId"] = spectronaut_peprec["spec_id"] + +# # Rename columns and merge with predictions +# spectronaut_peprec = spectronaut_peprec.rename( +# columns={"charge": "PrecursorCharge", "peptide": "StrippedPeptide"} +# ) +# peptide_cols = ( +# [ +# "ModifiedPeptide", +# "StrippedPeptide", +# "PrecursorCharge", +# "PrecursorMz", +# "ProteinId", +# ] +# + rt_cols +# + ["FragmentLossType"] +# ) +# spectronaut_df = spectronaut_peprec[peptide_cols + ["spec_id"]] +# spectronaut_df = self.all_preds.merge(spectronaut_df, on="spec_id") + +# # Fragment columns +# spectronaut_df["FragmentCharge"] = ( +# spectronaut_df["ion"].str.contains("2").map({True: 2, False: 1}) +# ) +# spectronaut_df["FragmentType"] = spectronaut_df["ion"].str[0].str.lower() + +# # Rename and sort columns +# spectronaut_df = spectronaut_df.rename( +# columns={ +# "mz": "FragmentMz", +# "prediction": "RelativeIntensity", +# "ionnumber": "FragmentNumber", +# } +# ) +# fragment_cols = [ +# "FragmentCharge", +# "FragmentMz", +# "RelativeIntensity", +# "FragmentType", +# "FragmentNumber", +# ] +# spectronaut_df = spectronaut_df[peptide_cols + fragment_cols] +# try: +# spectronaut_df.to_csv( +# file_obj, index=False, header=header, sep=";", lineterminator="\n" +# ) +# except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) +# spectronaut_df.to_csv( +# file_obj, index=False, header=header, sep=";", line_terminator="\n" +# ) + +# return file_obj + +def _write_bibliospec_core(self, file_obj_ssl, file_obj_ms2, start_scannr=0): + """Construct Bibliospec SSL/MS2 strings and write to file_objects.""" + + for i, spec_id in enumerate(sorted(self.preds_dict.keys())): + scannr = i + start_scannr + seq = self.peprec_dict[spec_id]["peptide"] + mods = self.peprec_dict[spec_id]["modifications"] + charge = self.peprec_dict[spec_id]["charge"] + prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) + ms2_filename = os.path.basename(self.output_filename) + "_predictions.ms2" + + peaks = self._get_msp_peak_annotation( + self.preds_dict[spec_id]["peaks"], + sep="\t", + include_annotations=False, + ) - if isinstance(mods, str) and mods != "-" and mods != "": - mod_seq = self._get_diff_modified_sequence(seq, mods) - else: - mod_seq = seq + if isinstance(mods, str) and mods != "-" and mods != "": + mod_seq = self._get_diff_modified_sequence(seq, mods) + else: + mod_seq = seq - rt = self.peprec_dict[spec_id]["rt"] if self.has_rt else "" + rt = self.peprec_dict[spec_id]["rt"] if self.has_rt else "" - # TODO: implement csv instead of manual writing - file_obj_ssl.write( - "\t".join([ms2_filename, str(scannr), str(charge), mod_seq, "", "", str(rt)]) - + "\n" - ) - file_obj_ms2.write( - "\n".join( - [ - f"S\t{scannr}\t{prec_mz}", - f"Z\t{charge}\t{prec_mass}", - f"D\tseq\t{seq}", - f"D\tmodified seq\t{mod_seq}", - peaks, - ] - ) - + "\n" + # TODO: implement csv instead of manual writing + file_obj_ssl.write( + "\t".join([ms2_filename, str(scannr), str(charge), mod_seq, "", "", str(rt)]) + + "\n" + ) + file_obj_ms2.write( + "\n".join( + [ + f"S\t{scannr}\t{prec_mz}", + f"Z\t{charge}\t{prec_mass}", + f"D\tseq\t{seq}", + f"D\tmodified seq\t{mod_seq}", + peaks, + ] ) + + "\n" + ) - def _write_general( - self, - write_function, - file_suffix, - normalization_method, - requires_dicts, - requires_diff_modifications, - diff_modification_precision=1, - ): - """ - General write function to call core write functions. - - Note: Does not work for write_bibliospec and write_dlib functions. - """ - - # Normalize if necessary and make dicts - if not self.normalization == normalization_method: - self._normalize_spectra(method=normalization_method) - if requires_dicts: - self._generate_preds_dict() - elif requires_dicts and not self.preds_dict: +def _write_general( + self, + write_function, + file_suffix, + normalization_method, + requires_dicts, + requires_diff_modifications, + diff_modification_precision=1, +): + """ + General write function to call core write functions. + + Note: Does not work for write_bibliospec and write_dlib functions. + """ + + # Normalize if necessary and make dicts + if not self.normalization == normalization_method: + self._normalize_spectra(method=normalization_method) + if requires_dicts: self._generate_preds_dict() - if requires_dicts and not self.peprec_dict: - self._generate_peprec_dict() - - if ( - requires_diff_modifications - and diff_modification_precision not in self.diff_modification_mapping - ): - self._generate_diff_modification_mapping(diff_modification_precision) - - # Write to file or stringbuffer - if self.return_stringbuffer: - file_object = StringIO() - logger.info("Writing results to StringIO using %s", write_function.__name__) - else: - f_name = self.output_filename + file_suffix - file_object = open(f_name, self.write_mode) - logger.info("Writing results to %s", f_name) - - write_function(self, file_object) - - return file_object - - @output_format("bibliospec") - def write_bibliospec(self): - """Write MS2PIP predictions to BiblioSpec/Skyline SSL and MS2 spectral library files.""" - precision = 1 - if precision not in self.diff_modification_mapping: - self._generate_diff_modification_mapping(precision) + elif requires_dicts and not self.preds_dict: + self._generate_preds_dict() + if requires_dicts and not self.peprec_dict: + self._generate_peprec_dict() + + if ( + requires_diff_modifications + and diff_modification_precision not in self.diff_modification_mapping + ): + self._generate_diff_modification_mapping(diff_modification_precision) + + # Write to file or stringbuffer + if self.return_stringbuffer: + file_object = StringIO() + logger.info("Writing results to StringIO using %s", write_function.__name__) + else: + f_name = self.output_filename + file_suffix + file_object = open(f_name, self.write_mode) + logger.info("Writing results to %s", f_name) + + write_function(self, file_object) + + return file_object + +# @output_format("bibliospec") +# def write_bibliospec(self): +# """Write MS2PIP predictions to BiblioSpec/Skyline SSL and MS2 spectral library files.""" +# precision = 1 +# if precision not in self.diff_modification_mapping: +# self._generate_diff_modification_mapping(precision) + +# # Normalize if necessary and make dicts +# if not self.normalization == "basepeak_10000": +# self._normalize_spectra(method="basepeak_10000") +# self._generate_preds_dict() +# elif not self.preds_dict: +# self._generate_preds_dict() +# if not self.peprec_dict: +# self._generate_peprec_dict() + +# if self.return_stringbuffer: +# file_obj_ssl = StringIO() +# file_obj_ms2 = StringIO() +# else: +# file_obj_ssl = open("{}_predictions.ssl".format(self.output_filename), self.write_mode) +# file_obj_ms2 = open("{}_predictions.ms2".format(self.output_filename), self.write_mode) + +# # If a new file is written, write headers +# if "w" in self.write_mode: +# start_scannr = 0 +# ssl_header = [ +# "file", +# "scan", +# "charge", +# "sequence", +# "score-type", +# "score", +# "retention-time", +# "\n", +# ] +# file_obj_ssl.write("\t".join(ssl_header)) +# file_obj_ms2.write( +# "H\tCreationDate\t{}\n".format(strftime("%Y-%m-%d %H:%M:%S", localtime())) +# ) +# file_obj_ms2.write("H\tExtractor\tMS2PIP predictions\n") +# else: +# # Get last scan number of ssl file, to continue indexing from there +# # because Bibliospec speclib scan numbers can only be integers +# start_scannr = self._get_last_ssl_scannr() + 1 + +# self._write_bibliospec_core(file_obj_ssl, file_obj_ms2, start_scannr=start_scannr) + +# return file_obj_ssl, file_obj_ms2 + +# def _write_dlib_metadata(self, connection): +# from sqlalchemy import select + +# from ms2pip._utils.dlib import DLIB_VERSION, Metadata + +# with connection.begin(): +# version = connection.execute( +# select([Metadata.c.Value]).where(Metadata.c.Key == "version") +# ).scalar() +# if version is None: +# connection.execute( +# Metadata.insert().values( +# Key="version", +# Value=DLIB_VERSION, +# ) +# ) + +def _write_dlib_entries(self, connection, precision): + from ms2pip._utils.dlib import Entry + + peptide_to_proteins = set() + + with connection.begin(): + for spec_id, peprec in self.peprec_dict.items(): + seq = peprec["peptide"] + mods = peprec["modifications"] + charge = peprec["charge"] - # Normalize if necessary and make dicts - if not self.normalization == "basepeak_10000": - self._normalize_spectra(method="basepeak_10000") - self._generate_preds_dict() - elif not self.preds_dict: - self._generate_preds_dict() - if not self.peprec_dict: - self._generate_peprec_dict() + prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) + mod_seq = self._get_diff_modified_sequence(seq, mods, precision=precision) - if self.return_stringbuffer: - file_obj_ssl = StringIO() - file_obj_ms2 = StringIO() - else: - file_obj_ssl = open("{}_predictions.ssl".format(self.output_filename), self.write_mode) - file_obj_ms2 = open("{}_predictions.ms2".format(self.output_filename), self.write_mode) - - # If a new file is written, write headers - if "w" in self.write_mode: - start_scannr = 0 - ssl_header = [ - "file", - "scan", - "charge", - "sequence", - "score-type", - "score", - "retention-time", - "\n", - ] - file_obj_ssl.write("\t".join(ssl_header)) - file_obj_ms2.write( - "H\tCreationDate\t{}\n".format(strftime("%Y-%m-%d %H:%M:%S", localtime())) + all_peaks = sorted( + itertools.chain.from_iterable(self.preds_dict[spec_id]["peaks"].values()), + key=itemgetter(1), ) - file_obj_ms2.write("H\tExtractor\tMS2PIP predictions\n") - else: - # Get last scan number of ssl file, to continue indexing from there - # because Bibliospec speclib scan numbers can only be integers - start_scannr = self._get_last_ssl_scannr() + 1 - - self._write_bibliospec_core(file_obj_ssl, file_obj_ms2, start_scannr=start_scannr) - - return file_obj_ssl, file_obj_ms2 - - def _write_dlib_metadata(self, connection): - from sqlalchemy import select - - from ms2pip._utils.dlib import DLIB_VERSION, Metadata - - with connection.begin(): - version = connection.execute( - select([Metadata.c.Value]).where(Metadata.c.Key == "version") - ).scalar() - if version is None: - connection.execute( - Metadata.insert().values( - Key="version", - Value=DLIB_VERSION, - ) + mzs = [peak[1] for peak in all_peaks] + intensities = [peak[2] for peak in all_peaks] + + connection.execute( + Entry.insert().values( + PrecursorMz=prec_mz, + PrecursorCharge=charge, + PeptideModSeq=mod_seq, + PeptideSeq=seq, + Copies=1, + RTInSeconds=peprec["rt"], + Score=0, + MassEncodedLength=len(mzs), + MassArray=mzs, + IntensityEncodedLength=len(intensities), + IntensityArray=intensities, + SourceFile=self.output_filename, ) + ) - def _write_dlib_entries(self, connection, precision): - from ms2pip._utils.dlib import Entry + if self.has_protein_list: + protein_list = peprec["protein_list"] + if isinstance(protein_list, str): + protein_list = literal_eval(protein_list) - peptide_to_proteins = set() + for protein in protein_list: + peptide_to_proteins.add((seq, protein)) - with connection.begin(): - for spec_id, peprec in self.peprec_dict.items(): - seq = peprec["peptide"] - mods = peprec["modifications"] - charge = peprec["charge"] + return peptide_to_proteins - prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - mod_seq = self._get_diff_modified_sequence(seq, mods, precision=precision) +def _write_dlib_peptide_to_protein(self, connection, peptide_to_proteins): + from ms2pip._utils.dlib import PeptideToProtein - all_peaks = sorted( - itertools.chain.from_iterable(self.preds_dict[spec_id]["peaks"].values()), - key=itemgetter(1), - ) - mzs = [peak[1] for peak in all_peaks] - intensities = [peak[2] for peak in all_peaks] - - connection.execute( - Entry.insert().values( - PrecursorMz=prec_mz, - PrecursorCharge=charge, - PeptideModSeq=mod_seq, - PeptideSeq=seq, - Copies=1, - RTInSeconds=peprec["rt"], - Score=0, - MassEncodedLength=len(mzs), - MassArray=mzs, - IntensityEncodedLength=len(intensities), - IntensityArray=intensities, - SourceFile=self.output_filename, - ) - ) + if not self.has_protein_list: + return - if self.has_protein_list: - protein_list = peprec["protein_list"] - if isinstance(protein_list, str): - protein_list = literal_eval(protein_list) - - for protein in protein_list: - peptide_to_proteins.add((seq, protein)) - - return peptide_to_proteins - - def _write_dlib_peptide_to_protein(self, connection, peptide_to_proteins): - from ms2pip._utils.dlib import PeptideToProtein - - if not self.has_protein_list: - return - - with connection.begin(): - sql_peptide_to_proteins = set() - proteins = {protein for _, protein in peptide_to_proteins} - for peptide_to_protein in connection.execute( - PeptideToProtein.select().where(PeptideToProtein.c.ProteinAccession.in_(proteins)) - ): - sql_peptide_to_proteins.add( - ( - peptide_to_protein.PeptideSeq, - peptide_to_protein.ProteinAccession, - ) + with connection.begin(): + sql_peptide_to_proteins = set() + proteins = {protein for _, protein in peptide_to_proteins} + for peptide_to_protein in connection.execute( + PeptideToProtein.select().where(PeptideToProtein.c.ProteinAccession.in_(proteins)) + ): + sql_peptide_to_proteins.add( + ( + peptide_to_protein.PeptideSeq, + peptide_to_protein.ProteinAccession, ) + ) - peptide_to_proteins.difference_update(sql_peptide_to_proteins) - for seq, protein in peptide_to_proteins: - connection.execute( - PeptideToProtein.insert().values( - PeptideSeq=seq, isDecoy=False, ProteinAccession=protein - ) + peptide_to_proteins.difference_update(sql_peptide_to_proteins) + for seq, protein in peptide_to_proteins: + connection.execute( + PeptideToProtein.insert().values( + PeptideSeq=seq, isDecoy=False, ProteinAccession=protein ) - - @output_format("dlib") - def write_dlib(self): - """Write MS2PIP predictions to a DLIB SQLite file.""" - from ms2pip._utils.dlib import metadata, open_sqlite - - normalization = "basepeak_10000" - precision = 5 - if not self.normalization == normalization: - self._normalize_spectra(method=normalization) - self._generate_preds_dict() - if not self.peprec_dict: - self._generate_peprec_dict() - if precision not in self.diff_modification_mapping: - self._generate_diff_modification_mapping(precision) - - filename = "{}.dlib".format(self.output_filename) - logger.info("Writing results to %s", filename) - - logger.debug( - "write mode is ignored for DLIB at the file mode, although append or not is respected" - ) - if "a" not in self.write_mode and os.path.exists(filename): - os.remove(filename) - - if self.return_stringbuffer: - raise NotImplementedError("`return_stringbuffer` not implemented for DLIB output.") - - if not self.has_rt: - raise NotImplementedError("Retention times required to write DLIB file.") - - with open_sqlite(filename) as connection: - metadata.create_all() - self._write_dlib_metadata(connection) - peptide_to_proteins = self._write_dlib_entries(connection, precision) - self._write_dlib_peptide_to_protein(connection, peptide_to_proteins) - - def get_normalized_predictions(self, normalization_method="tic"): - """Return normalized copy of predictions.""" - self._normalize_spectra(method=normalization_method) - return self.all_preds.copy() - - @output_format("csv") - def write_csv(self): - """Write MS2PIP predictions to CSV.""" - - self._normalize_spectra(method="tic") - - # Write to file or stringbuffer - if self.return_stringbuffer: - file_object = StringIO() - logger.info("Writing results to StringIO using %s", "write_csv") - else: - f_name = "{}_predictions.csv".format(self.output_filename) - file_object = open(f_name, self.write_mode) - logger.info("Writing results to %s", f_name) - - try: - self.all_preds.to_csv( - file_object, float_format="%.6g", index=False, lineterminator="\n" - ) - except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) - self.all_preds.to_csv( - file_object, float_format="%.6g", index=False, line_terminator="\n" ) - return file_object - - -def write_single_spectrum_csv(spectrum, filepath): - """Write a single spectrum to a CSV file.""" - with open(filepath, "wt") as f: - writer = csv.writer(f, delimiter=",", lineterminator="\n") - writer.writerow(["mz", "intensity", "annotation"]) - for mz, intensity, annotation in zip( - spectrum.mz, - spectrum.intensity, - spectrum.annotations, - ): - writer.writerow([mz, intensity, annotation]) - - -def write_single_spectrum_png(spectrum, filepath): - """Plot a single spectrum and write to a PNG file.""" - import matplotlib.pyplot as plt - import spectrum_utils.plot as sup - ax = plt.gca() - ax.set_title("MS²PIP prediction for " + str(spectrum.peptidoform)) - sup.spectrum(spectrum.to_spectrum_utils(), ax=ax) - plt.savefig(Path(filepath).with_suffix(".png")) - plt.close() +# @output_format("dlib") +# def write_dlib(self): +# """Write MS2PIP predictions to a DLIB SQLite file.""" +# from ms2pip._utils.dlib import metadata, open_sqlite + +# normalization = "basepeak_10000" +# precision = 5 +# if not self.normalization == normalization: +# self._normalize_spectra(method=normalization) +# self._generate_preds_dict() +# if not self.peprec_dict: +# self._generate_peprec_dict() +# if precision not in self.diff_modification_mapping: +# self._generate_diff_modification_mapping(precision) + +# filename = "{}.dlib".format(self.output_filename) +# logger.info("Writing results to %s", filename) + +# logger.debug( +# "write mode is ignored for DLIB at the file mode, although append or not is respected" +# ) +# if "a" not in self.write_mode and os.path.exists(filename): +# os.remove(filename) + +# if self.return_stringbuffer: +# raise NotImplementedError("`return_stringbuffer` not implemented for DLIB output.") + +# if not self.has_rt: +# raise NotImplementedError("Retention times required to write DLIB file.") + +# with open_sqlite(filename) as connection: +# metadata.create_all() +# self._write_dlib_metadata(connection) +# peptide_to_proteins = self._write_dlib_entries(connection, precision) +# self._write_dlib_peptide_to_protein(connection, peptide_to_proteins) + +# def get_normalized_predictions(self, normalization_method="tic"): +# """Return normalized copy of predictions.""" +# self._normalize_spectra(method=normalization_method) +# return self.all_preds.copy() + +# @output_format("csv") +# def write_csv(self): +# """Write MS2PIP predictions to CSV.""" + +# self._normalize_spectra(method="tic") + +# # Write to file or stringbuffer +# if self.return_stringbuffer: +# file_object = StringIO() +# logger.info("Writing results to StringIO using %s", "write_csv") +# else: +# f_name = "{}_predictions.csv".format(self.output_filename) +# file_object = open(f_name, self.write_mode) +# logger.info("Writing results to %s", f_name) + +# try: +# self.all_preds.to_csv( +# file_object, float_format="%.6g", index=False, lineterminator="\n" +# ) +# except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) +# self.all_preds.to_csv( +# file_object, float_format="%.6g", index=False, line_terminator="\n" +# ) +# return file_object + + +# def write_single_spectrum_csv(spectrum, filepath): +# """Write a single spectrum to a CSV file.""" +# with open(filepath, "wt") as f: +# writer = csv.writer(f, delimiter=",", lineterminator="\n") +# writer.writerow(["mz", "intensity", "annotation"]) +# for mz, intensity, annotation in zip( +# spectrum.mz, +# spectrum.intensity, +# spectrum.annotations, +# ): +# writer.writerow([mz, intensity, annotation]) + + +# def write_single_spectrum_png(spectrum, filepath): +# """Plot a single spectrum and write to a PNG file.""" +# import matplotlib.pyplot as plt +# import spectrum_utils.plot as sup + +# ax = plt.gca() +# ax.set_title("MS²PIP prediction for " + str(spectrum.peptidoform)) +# sup.spectrum(spectrum.to_spectrum_utils(), ax=ax) +# plt.savefig(Path(filepath).with_suffix(".png")) +# plt.close() From f335c5531a07e17fa793ac73f2d56f6252044db0 Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Mon, 11 Dec 2023 11:10:49 +0100 Subject: [PATCH 04/14] changed write_mgf function to use list of processingresults --- ms2pip/spectrum_output.py | 80 ++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index dbd0b4b5..e4acefa9 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -185,7 +185,7 @@ def write_msp(processing_results: List[ProcessingResult], file_object): # comment_line += f' MS2PIP_ID="{spec_id}"' out = [ - f"Name: {sequence}/{charge}", + f"\nName: {sequence}/{charge}", f"MW: {mw}", # f"Comment:{comment_line}", f"Num peaks: {num_peaks}", @@ -200,56 +200,66 @@ def write_msp(processing_results: List[ProcessingResult], file_object): # requires_dicts=True, # requires_diff_modifications=False, # ) -def write_mgf(self, file_object, normalization_method="basepeak_10000"): +def write_mgf(processing_results: List[ProcessingResult], file_object): """ Construct MGF string and write to file_object """ - for result in self.results: + for result in processing_results: sequence = result.psm.peptidoform.sequence charge = result.psm.peptidoform.precursor_charge mw = result.psm.peptidoform.theoretical_mass - out = [ - "BEGIN IONs", - f"TITLE={result.psm_index} {sequence}/{charge}" - f"CHARGE={charge}+" - ] - - for spec_id in sorted(self.peprec_dict.keys()): - seq = self.peprec_dict[spec_id]["peptide"] - mods = self.peprec_dict[spec_id]["modifications"] - charge = self.peprec_dict[spec_id]["charge"] - _, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - msp_modifications = self._get_msp_modifications(seq, mods) + result_spectrum = result.as_spectra()[0] + intensity_normalized = _tic_normalize(result_spectrum.intensity) - if self.has_protein_list: - protein_list = self.peprec_dict[spec_id]["protein_list"] - protein_string = self._parse_protein_string(protein_list) - else: - protein_string = "" + peaks = zip(result_spectrum.mz, intensity_normalized) out = [ - "BEGIN IONS", - f"TITLE={spec_id} {seq}/{charge} {msp_modifications} {protein_string}", - f"PEPMASS={prec_mz}", + "BEGIN IONs", + f"TITLE={result.psm_index} {sequence}/{charge}", + f"PEPMASS={mw}", f"CHARGE={charge}+", + [f"{mz}\t{intensity}\n" for mz, intensity in peaks], + "END IONs\n" ] + file_object.writelines(''.join(line) if isinstance(line, list) else line + "\n" for line in out) - if self.has_rt: - rt = self.peprec_dict[spec_id]["rt"] - out.append(f"RTINSECONDS={rt}") - out.append( - self._get_msp_peak_annotation( - self.preds_dict[spec_id]["peaks"], - sep=" ", - include_annotations=False, - ) - ) - out.append("END IONS\n") - file_object.writelines([line for line in out] + ["\n"]) + # for spec_id in sorted(self.peprec_dict.keys()): + # seq = self.peprec_dict[spec_id]["peptide"] + # mods = self.peprec_dict[spec_id]["modifications"] + # charge = self.peprec_dict[spec_id]["charge"] + # _, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) + # msp_modifications = self._get_msp_modifications(seq, mods) + + # if self.has_protein_list: + # protein_list = self.peprec_dict[spec_id]["protein_list"] + # protein_string = self._parse_protein_string(protein_list) + # else: + # protein_string = "" + + # out = [ + # "BEGIN IONS", + # f"TITLE={spec_id} {seq}/{charge} {msp_modifications} {protein_string}", + # f"PEPMASS={prec_mz}", + # f"CHARGE={charge}+", + # ] + + # if self.has_rt: + # rt = self.peprec_dict[spec_id]["rt"] + # out.append(f"RTINSECONDS={rt}") + + # out.append( + # self._get_msp_peak_annotation( + # self.preds_dict[spec_id]["peaks"], + # sep=" ", + # include_annotations=False, + # ) + # ) + # out.append("END IONS\n") + # file_object.writelines([line for line in out] + ["\n"]) # @output_format("spectronaut") # @writer( From 7b1e5ff5c28ff5b020190eb4c9a9631069b5e91e Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Mon, 11 Dec 2023 11:42:09 +0100 Subject: [PATCH 05/14] changed normalization for write_mgf --- ms2pip/spectrum_output.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index e4acefa9..def64c85 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -185,21 +185,15 @@ def write_msp(processing_results: List[ProcessingResult], file_object): # comment_line += f' MS2PIP_ID="{spec_id}"' out = [ - f"\nName: {sequence}/{charge}", + f"Name: {sequence}/{charge}", f"MW: {mw}", # f"Comment:{comment_line}", f"Num peaks: {num_peaks}", [f"{mz}\t{intensity}\t{annotation}/0.0\n" for mz, intensity, annotation in peaks] ] file_object.writelines(''.join(line) if isinstance(line, list) else line + "\n" for line in out) + file_object.write('\n') -# @output_format("mgf") -# @writer( -# file_suffix="_predictions.mgf", -# normalization_method="basepeak_10000", -# requires_dicts=True, -# requires_diff_modifications=False, -# ) def write_mgf(processing_results: List[ProcessingResult], file_object): """ Construct MGF string and write to file_object @@ -212,7 +206,7 @@ def write_mgf(processing_results: List[ProcessingResult], file_object): mw = result.psm.peptidoform.theoretical_mass result_spectrum = result.as_spectra()[0] - intensity_normalized = _tic_normalize(result_spectrum.intensity) + intensity_normalized = _basepeak_normalize(result_spectrum.intensity) * 10000 peaks = zip(result_spectrum.mz, intensity_normalized) From ab085525883abf07d11f78130368122528cd9afe Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Wed, 13 Dec 2023 10:30:33 +0100 Subject: [PATCH 06/14] refactored write_csv to use list of processingresult --- ms2pip/spectrum_output.py | 243 +++++++++++++++++++------------------- 1 file changed, 122 insertions(+), 121 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index def64c85..ed37c3c7 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -262,100 +262,100 @@ def write_mgf(processing_results: List[ProcessingResult], file_object): # requires_dicts=False, # requires_diff_modifications=True, # ) -# def write_spectronaut(self, file_obj): -# """ -# Construct spectronaut DataFrame and write to file_object. -# """ -# if "w" in self.write_mode: -# header = True -# elif "a" in self.write_mode: -# header = False -# else: -# raise InvalidWriteModeError(self.write_mode) - -# spectronaut_peprec = self.peprec.copy() - -# # ModifiedPeptide and PrecursorMz columns -# spectronaut_peprec["ModifiedPeptide"] = spectronaut_peprec.apply( -# lambda row: self._get_diff_modified_sequence(row["peptide"], row["modifications"]), -# axis=1, -# ) -# spectronaut_peprec["PrecursorMz"] = spectronaut_peprec.apply( -# lambda row: self.mods.calc_precursor_mz( -# row["peptide"], row["modifications"], row["charge"] -# )[1], -# axis=1, -# ) -# spectronaut_peprec["ModifiedPeptide"] = "_" + spectronaut_peprec["ModifiedPeptide"] + "_" - -# # Additional columns -# spectronaut_peprec["FragmentLossType"] = "noloss" - -# # Retention time -# if "rt" in spectronaut_peprec.columns: -# rt_cols = ["iRT"] -# spectronaut_peprec["iRT"] = spectronaut_peprec["rt"] -# else: -# rt_cols = [] - -# # ProteinId -# if self.has_protein_list: -# spectronaut_peprec["ProteinId"] = spectronaut_peprec["protein_list"].apply( -# self._parse_protein_string -# ) -# else: -# spectronaut_peprec["ProteinId"] = spectronaut_peprec["spec_id"] - -# # Rename columns and merge with predictions -# spectronaut_peprec = spectronaut_peprec.rename( -# columns={"charge": "PrecursorCharge", "peptide": "StrippedPeptide"} -# ) -# peptide_cols = ( -# [ -# "ModifiedPeptide", -# "StrippedPeptide", -# "PrecursorCharge", -# "PrecursorMz", -# "ProteinId", -# ] -# + rt_cols -# + ["FragmentLossType"] -# ) -# spectronaut_df = spectronaut_peprec[peptide_cols + ["spec_id"]] -# spectronaut_df = self.all_preds.merge(spectronaut_df, on="spec_id") +def write_spectronaut(processing_results: List[ProcessingResult], file_obj, header: bool): + """ + Construct spectronaut DataFrame and write to file_object. + """ + # if "w" in self.write_mode: + # header = True + # elif "a" in self.write_mode: + # header = False + # else: + # raise InvalidWriteModeError(self.write_mode) + + spectronaut_peprec = self.peprec.copy() + + # ModifiedPeptide and PrecursorMz columns + spectronaut_peprec["ModifiedPeptide"] = spectronaut_peprec.apply( + lambda row: self._get_diff_modified_sequence(row["peptide"], row["modifications"]), + axis=1, + ) + spectronaut_peprec["PrecursorMz"] = spectronaut_peprec.apply( + lambda row: self.mods.calc_precursor_mz( + row["peptide"], row["modifications"], row["charge"] + )[1], + axis=1, + ) + spectronaut_peprec["ModifiedPeptide"] = "_" + spectronaut_peprec["ModifiedPeptide"] + "_" + + # Additional columns + spectronaut_peprec["FragmentLossType"] = "noloss" + + # Retention time + if "rt" in spectronaut_peprec.columns: + rt_cols = ["iRT"] + spectronaut_peprec["iRT"] = spectronaut_peprec["rt"] + else: + rt_cols = [] -# # Fragment columns -# spectronaut_df["FragmentCharge"] = ( -# spectronaut_df["ion"].str.contains("2").map({True: 2, False: 1}) -# ) -# spectronaut_df["FragmentType"] = spectronaut_df["ion"].str[0].str.lower() - -# # Rename and sort columns -# spectronaut_df = spectronaut_df.rename( -# columns={ -# "mz": "FragmentMz", -# "prediction": "RelativeIntensity", -# "ionnumber": "FragmentNumber", -# } -# ) -# fragment_cols = [ -# "FragmentCharge", -# "FragmentMz", -# "RelativeIntensity", -# "FragmentType", -# "FragmentNumber", -# ] -# spectronaut_df = spectronaut_df[peptide_cols + fragment_cols] -# try: -# spectronaut_df.to_csv( -# file_obj, index=False, header=header, sep=";", lineterminator="\n" -# ) -# except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) -# spectronaut_df.to_csv( -# file_obj, index=False, header=header, sep=";", line_terminator="\n" -# ) + # ProteinId + if self.has_protein_list: + spectronaut_peprec["ProteinId"] = spectronaut_peprec["protein_list"].apply( + self._parse_protein_string + ) + else: + spectronaut_peprec["ProteinId"] = spectronaut_peprec["spec_id"] + + # Rename columns and merge with predictions + spectronaut_peprec = spectronaut_peprec.rename( + columns={"charge": "PrecursorCharge", "peptide": "StrippedPeptide"} + ) + peptide_cols = ( + [ + "ModifiedPeptide", + "StrippedPeptide", + "PrecursorCharge", + "PrecursorMz", + "ProteinId", + ] + + rt_cols + + ["FragmentLossType"] + ) + spectronaut_df = spectronaut_peprec[peptide_cols + ["spec_id"]] + spectronaut_df = self.all_preds.merge(spectronaut_df, on="spec_id") + + # Fragment columns + spectronaut_df["FragmentCharge"] = ( + spectronaut_df["ion"].str.contains("2").map({True: 2, False: 1}) + ) + spectronaut_df["FragmentType"] = spectronaut_df["ion"].str[0].str.lower() + + # Rename and sort columns + spectronaut_df = spectronaut_df.rename( + columns={ + "mz": "FragmentMz", + "prediction": "RelativeIntensity", + "ionnumber": "FragmentNumber", + } + ) + fragment_cols = [ + "FragmentCharge", + "FragmentMz", + "RelativeIntensity", + "FragmentType", + "FragmentNumber", + ] + spectronaut_df = spectronaut_df[peptide_cols + fragment_cols] + try: + spectronaut_df.to_csv( + file_obj, index=False, header=header, sep=";", lineterminator="\n" + ) + except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) + spectronaut_df.to_csv( + file_obj, index=False, header=header, sep=";", line_terminator="\n" + ) -# return file_obj + return file_obj def _write_bibliospec_core(self, file_obj_ssl, file_obj_ms2, start_scannr=0): """Construct Bibliospec SSL/MS2 strings and write to file_objects.""" @@ -621,35 +621,36 @@ def _write_dlib_peptide_to_protein(self, connection, peptide_to_proteins): # peptide_to_proteins = self._write_dlib_entries(connection, precision) # self._write_dlib_peptide_to_protein(connection, peptide_to_proteins) -# def get_normalized_predictions(self, normalization_method="tic"): -# """Return normalized copy of predictions.""" -# self._normalize_spectra(method=normalization_method) -# return self.all_preds.copy() - -# @output_format("csv") -# def write_csv(self): -# """Write MS2PIP predictions to CSV.""" - -# self._normalize_spectra(method="tic") - -# # Write to file or stringbuffer -# if self.return_stringbuffer: -# file_object = StringIO() -# logger.info("Writing results to StringIO using %s", "write_csv") -# else: -# f_name = "{}_predictions.csv".format(self.output_filename) -# file_object = open(f_name, self.write_mode) -# logger.info("Writing results to %s", f_name) - -# try: -# self.all_preds.to_csv( -# file_object, float_format="%.6g", index=False, lineterminator="\n" -# ) -# except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) -# self.all_preds.to_csv( -# file_object, float_format="%.6g", index=False, line_terminator="\n" -# ) -# return file_object +def write_csv(processing_results: List[ProcessingResult], file_obj): + """Write processing results to CSV file.""" + fieldnames = [ + "spec_id", + "charge", + "ion", + "ionnumber", + "mz", + "prediction", + "rt" + ] + writer = csv.DictWriter(file_obj, fieldnames=fieldnames, lineterminator="\n") + writer.writeheader() + for result in processing_results: + if result.theoretical_mz is not None: + for ion_type in result.theoretical_mz: + for i in range(len(result.theoretical_mz[ion_type])): + writer.writerow( + { + "spec_id": result.psm_index, + "charge": result.psm.peptidoform.precursor_charge, + "ion": ion_type, + "ionnumber": i + 1, + "mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]), + "prediction": "{:.6g}".format( + result.predicted_intensity[ion_type][i] + ), + "rt": result.psm.retention_time # TODO: is always empty now + } + ) # def write_single_spectrum_csv(spectrum, filepath): From a62be645c87a3c8063e13a6d796e749617acd60b Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Wed, 13 Dec 2023 11:54:15 +0100 Subject: [PATCH 07/14] refactored write_dlib functions --- ms2pip/spectrum_output.py | 245 +++++++++++++++++++------------------- 1 file changed, 122 insertions(+), 123 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index ed37c3c7..4079d73a 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -25,7 +25,6 @@ class InvalidWriteModeError(ValueError): pass - # def _generate_peprec_dict(self, rt_to_seconds=True): # """ # Create easy to access dict from all_preds and peprec dataframes @@ -164,11 +163,11 @@ def write_msp(processing_results: List[ProcessingResult], file_object): sequence = result.psm.peptidoform.sequence charge = result.psm.peptidoform.precursor_charge mw = result.psm.peptidoform.theoretical_mass - result_spectrum = result.as_spectra()[0] - num_peaks = len(result_spectrum.mz) - intensity_normalized = _basepeak_normalize(result_spectrum.intensity) * 10000 + result = result.as_spectra()[0] + num_peaks = len(result.mz) + intensity_normalized = _basepeak_normalize(result.intensity) * 10000 - peaks = zip(result_spectrum.mz, intensity_normalized, result_spectrum.annotations) + peaks = zip(result.mz, intensity_normalized, result.annotations) # TODO: change modification info for comment line to use result # comment_line = f" Mods={msp_modifications} Parent={prec_mz}" @@ -205,10 +204,10 @@ def write_mgf(processing_results: List[ProcessingResult], file_object): charge = result.psm.peptidoform.precursor_charge mw = result.psm.peptidoform.theoretical_mass - result_spectrum = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(result_spectrum.intensity) * 10000 + result = result.as_spectra()[0] + intensity_normalized = _basepeak_normalize(result.intensity) * 10000 - peaks = zip(result_spectrum.mz, intensity_normalized) + peaks = zip(result.mz, intensity_normalized) out = [ "BEGIN IONs", @@ -399,49 +398,49 @@ def _write_bibliospec_core(self, file_obj_ssl, file_obj_ms2, start_scannr=0): + "\n" ) -def _write_general( - self, - write_function, - file_suffix, - normalization_method, - requires_dicts, - requires_diff_modifications, - diff_modification_precision=1, -): - """ - General write function to call core write functions. +# def _write_general( +# self, +# write_function, +# file_suffix, +# normalization_method, +# requires_dicts, +# requires_diff_modifications, +# diff_modification_precision=1, +# ): +# """ +# General write function to call core write functions. + +# Note: Does not work for write_bibliospec and write_dlib functions. +# """ - Note: Does not work for write_bibliospec and write_dlib functions. - """ +# # Normalize if necessary and make dicts +# if not self.normalization == normalization_method: +# self._normalize_spectra(method=normalization_method) +# if requires_dicts: +# self._generate_preds_dict() +# elif requires_dicts and not self.preds_dict: +# self._generate_preds_dict() +# if requires_dicts and not self.peprec_dict: +# self._generate_peprec_dict() - # Normalize if necessary and make dicts - if not self.normalization == normalization_method: - self._normalize_spectra(method=normalization_method) - if requires_dicts: - self._generate_preds_dict() - elif requires_dicts and not self.preds_dict: - self._generate_preds_dict() - if requires_dicts and not self.peprec_dict: - self._generate_peprec_dict() - - if ( - requires_diff_modifications - and diff_modification_precision not in self.diff_modification_mapping - ): - self._generate_diff_modification_mapping(diff_modification_precision) - - # Write to file or stringbuffer - if self.return_stringbuffer: - file_object = StringIO() - logger.info("Writing results to StringIO using %s", write_function.__name__) - else: - f_name = self.output_filename + file_suffix - file_object = open(f_name, self.write_mode) - logger.info("Writing results to %s", f_name) +# if ( +# requires_diff_modifications +# and diff_modification_precision not in self.diff_modification_mapping +# ): +# self._generate_diff_modification_mapping(diff_modification_precision) - write_function(self, file_object) +# # Write to file or stringbuffer +# if self.return_stringbuffer: +# file_object = StringIO() +# logger.info("Writing results to StringIO using %s", write_function.__name__) +# else: +# f_name = self.output_filename + file_suffix +# file_object = open(f_name, self.write_mode) +# logger.info("Writing results to %s", f_name) + +# write_function(self, file_object) - return file_object +# return file_object # @output_format("bibliospec") # def write_bibliospec(self): @@ -493,76 +492,76 @@ def _write_general( # return file_obj_ssl, file_obj_ms2 -# def _write_dlib_metadata(self, connection): -# from sqlalchemy import select +def _write_dlib_metadata(self, connection): + from sqlalchemy import select -# from ms2pip._utils.dlib import DLIB_VERSION, Metadata + from ms2pip._utils.dlib import DLIB_VERSION, Metadata -# with connection.begin(): -# version = connection.execute( -# select([Metadata.c.Value]).where(Metadata.c.Key == "version") -# ).scalar() -# if version is None: -# connection.execute( -# Metadata.insert().values( -# Key="version", -# Value=DLIB_VERSION, -# ) -# ) + with connection.begin(): + version = connection.execute( + select([Metadata.c.Value]).where(Metadata.c.Key == "version") + ).scalar() + if version is None: + connection.execute( + Metadata.insert().values( + Key="version", + Value=DLIB_VERSION, + ) + ) -def _write_dlib_entries(self, connection, precision): +def _write_dlib_entries(processing_results: List[ProcessingResult], connection, precision, output_filename: str): from ms2pip._utils.dlib import Entry peptide_to_proteins = set() with connection.begin(): - for spec_id, peprec in self.peprec_dict.items(): - seq = peprec["peptide"] - mods = peprec["modifications"] - charge = peprec["charge"] + + for result in processing_results: - prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - mod_seq = self._get_diff_modified_sequence(seq, mods, precision=precision) + precursor_mz = result.precursor_mz + precursor_charge = result.psm.peptidoform.precursor_charge + seq = result.psm.peptidoform.sequence + rt = result.psm.retention_time #TODO: check if this is the right rt + mzs = result.theoretical_mz - all_peaks = sorted( - itertools.chain.from_iterable(self.preds_dict[spec_id]["peaks"].values()), - key=itemgetter(1), - ) - mzs = [peak[1] for peak in all_peaks] - intensities = [peak[2] for peak in all_peaks] + # TODO: get mod_seq + # mod_seq = + + result = result.as_spectra()[0] + intensity_normalized = _basepeak_normalize(result.intensity) * 10000 connection.execute( Entry.insert().values( - PrecursorMz=prec_mz, - PrecursorCharge=charge, + PrecursorMz=precursor_mz, + PrecursorCharge=precursor_charge, PeptideModSeq=mod_seq, PeptideSeq=seq, Copies=1, - RTInSeconds=peprec["rt"], + RTInSeconds=rt, Score=0, MassEncodedLength=len(mzs), MassArray=mzs, - IntensityEncodedLength=len(intensities), - IntensityArray=intensities, - SourceFile=self.output_filename, + IntensityEncodedLength=len(intensity_normalized), + IntensityArray=intensity_normalized, + SourceFile=output_filename, ) ) - if self.has_protein_list: - protein_list = peprec["protein_list"] - if isinstance(protein_list, str): - protein_list = literal_eval(protein_list) + if result.psm.protein_list is not None: + protein_list = result.psm.protein_list + if isinstance(protein_list, str): + protein_list = literal_eval(protein_list) - for protein in protein_list: - peptide_to_proteins.add((seq, protein)) + for protein in protein_list: + peptide_to_proteins.add((seq, protein)) return peptide_to_proteins - -def _write_dlib_peptide_to_protein(self, connection, peptide_to_proteins): + +def _write_dlib_peptide_to_protein(connection, peptide_to_proteins): from ms2pip._utils.dlib import PeptideToProtein - if not self.has_protein_list: - return + # if not self.has_protein_list: + # return with connection.begin(): sql_peptide_to_proteins = set() @@ -586,40 +585,40 @@ def _write_dlib_peptide_to_protein(self, connection, peptide_to_proteins): ) # @output_format("dlib") -# def write_dlib(self): -# """Write MS2PIP predictions to a DLIB SQLite file.""" -# from ms2pip._utils.dlib import metadata, open_sqlite - -# normalization = "basepeak_10000" -# precision = 5 -# if not self.normalization == normalization: -# self._normalize_spectra(method=normalization) -# self._generate_preds_dict() -# if not self.peprec_dict: -# self._generate_peprec_dict() -# if precision not in self.diff_modification_mapping: -# self._generate_diff_modification_mapping(precision) - -# filename = "{}.dlib".format(self.output_filename) -# logger.info("Writing results to %s", filename) - -# logger.debug( -# "write mode is ignored for DLIB at the file mode, although append or not is respected" -# ) -# if "a" not in self.write_mode and os.path.exists(filename): -# os.remove(filename) +def write_dlib(processing_results: List[ProcessingResult], output_filename: str): + """Write MS2PIP predictions to a DLIB SQLite file.""" + from ms2pip._utils.dlib import metadata, open_sqlite + + # normalization = "basepeak_10000" + precision = 5 + # if not self.normalization == normalization: + # self._normalize_spectra(method=normalization) + # self._generate_preds_dict() + # if not self.peprec_dict: + # self._generate_peprec_dict() + # if precision not in self.diff_modification_mapping: + # self._generate_diff_modification_mapping(precision) + + filename = "{}.dlib".format(output_filename) + logger.info("Writing results to %s", filename) + + logger.debug( + "write mode is ignored for DLIB at the file mode, although append or not is respected" + ) + # if "a" not in self.write_mode and os.path.exists(filename): + # os.remove(filename) -# if self.return_stringbuffer: -# raise NotImplementedError("`return_stringbuffer` not implemented for DLIB output.") + # if self.return_stringbuffer: + # raise NotImplementedError("`return_stringbuffer` not implemented for DLIB output.") -# if not self.has_rt: -# raise NotImplementedError("Retention times required to write DLIB file.") + # if not self.has_rt: + # raise NotImplementedError("Retention times required to write DLIB file.") -# with open_sqlite(filename) as connection: -# metadata.create_all() -# self._write_dlib_metadata(connection) -# peptide_to_proteins = self._write_dlib_entries(connection, precision) -# self._write_dlib_peptide_to_protein(connection, peptide_to_proteins) + with open_sqlite(filename) as connection: + metadata.create_all() + _write_dlib_metadata(connection) + peptide_to_proteins = _write_dlib_entries(processing_results, connection, precision, filename) + _write_dlib_peptide_to_protein(connection, peptide_to_proteins) def write_csv(processing_results: List[ProcessingResult], file_obj): """Write processing results to CSV file.""" @@ -642,13 +641,13 @@ def write_csv(processing_results: List[ProcessingResult], file_obj): { "spec_id": result.psm_index, "charge": result.psm.peptidoform.precursor_charge, - "ion": ion_type, + "ion": ion_type, # TODO: change to upper(ion_type) ? "ionnumber": i + 1, "mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]), "prediction": "{:.6g}".format( result.predicted_intensity[ion_type][i] ), - "rt": result.psm.retention_time # TODO: is always empty now + "rt": result.psm.retention_time # TODO: results in empty string while testing } ) From 2228df5fd6c11786817fb477154573451bb2b07f Mon Sep 17 00:00:00 2001 From: Jasper Van den Hemel Date: Wed, 13 Dec 2023 11:58:08 +0100 Subject: [PATCH 08/14] refactored _write_dlib_entries --- ms2pip/spectrum_output.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index 4079d73a..fb096308 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -518,7 +518,7 @@ def _write_dlib_entries(processing_results: List[ProcessingResult], connection, for result in processing_results: - precursor_mz = result.precursor_mz + precursor_mz = result.psm.precursor_mz precursor_charge = result.psm.peptidoform.precursor_charge seq = result.psm.peptidoform.sequence rt = result.psm.retention_time #TODO: check if this is the right rt @@ -527,6 +527,14 @@ def _write_dlib_entries(processing_results: List[ProcessingResult], connection, # TODO: get mod_seq # mod_seq = + if result.psm.protein_list is not None: + protein_list = result.psm.protein_list + if isinstance(protein_list, str): + protein_list = literal_eval(protein_list) + + for protein in protein_list: + peptide_to_proteins.add((seq, protein)) + result = result.as_spectra()[0] intensity_normalized = _basepeak_normalize(result.intensity) * 10000 @@ -546,15 +554,6 @@ def _write_dlib_entries(processing_results: List[ProcessingResult], connection, SourceFile=output_filename, ) ) - - if result.psm.protein_list is not None: - protein_list = result.psm.protein_list - if isinstance(protein_list, str): - protein_list = literal_eval(protein_list) - - for protein in protein_list: - peptide_to_proteins.add((seq, protein)) - return peptide_to_proteins def _write_dlib_peptide_to_protein(connection, peptide_to_proteins): From 59ad47d15a600342c18b366a5df4e8c07b315654 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 00:56:34 +0200 Subject: [PATCH 09/14] Reimplement most of spectrum_output for v4.0.0 --- ms2pip/_utils/dlib.py | 4 +- ms2pip/plot.py | 23 + ms2pip/spectrum.py | 6 +- ms2pip/spectrum_output.py | 1219 ++++++++++++++++----------------- tests/test_spectrum_output.py | 112 +-- 5 files changed, 661 insertions(+), 703 deletions(-) create mode 100644 ms2pip/plot.py diff --git a/ms2pip/_utils/dlib.py b/ms2pip/_utils/dlib.py index dfb7b0ae..f33ee35f 100644 --- a/ms2pip/_utils/dlib.py +++ b/ms2pip/_utils/dlib.py @@ -1,6 +1,8 @@ """Database configuration for EncyclopeDIA DLIB SQLite format.""" import zlib +from pathlib import Path +from typing import Union import numpy import sqlalchemy @@ -91,7 +93,7 @@ def copy(self): ) -def open_sqlite(filename): +def open_sqlite(filename: Union[str, Path]) -> sqlalchemy.engine.Connection: engine = sqlalchemy.create_engine(f"sqlite:///{filename}") metadata.bind = engine return engine.connect() diff --git a/ms2pip/plot.py b/ms2pip/plot.py new file mode 100644 index 00000000..56f7bc63 --- /dev/null +++ b/ms2pip/plot.py @@ -0,0 +1,23 @@ +from pathlib import Path +from typing import Union + +from ms2pip.spectrum import Spectrum + +try: + import matplotlib.pyplot as plt + import spectrum_utils.plot as sup + + _can_plot = True +except ImportError: + _can_plot = False + + +def spectrum_to_png(spectrum: Spectrum, filepath: Union[str, Path]): + """Plot a single spectrum and write to a PNG file.""" + if not _can_plot: + raise ImportError("Matplotlib and spectrum_utils are required to plot spectra.") + ax = plt.gca() + ax.set_title("MS²PIP prediction for " + str(spectrum.peptidoform)) + sup.spectrum(spectrum.to_spectrum_utils(), ax=ax) + plt.savefig(Path(filepath).with_suffix(".png")) + plt.close() diff --git a/ms2pip/spectrum.py b/ms2pip/spectrum.py index 081e5fce..8e49a389 100644 --- a/ms2pip/spectrum.py +++ b/ms2pip/spectrum.py @@ -72,10 +72,10 @@ def __repr__(self) -> str: @model_validator(mode="after") @classmethod def check_array_lengths(cls, data: dict): - if len(data["mz"]) != len(data["intensity"]): + if len(data.mz) != len(data.intensity): raise ValueError("Array lengths do not match.") - if data["annotations"] is not None: - if len(data["annotations"]) != len(data["intensity"]): + if data.annotations is not None: + if len(data.annotations) != len(data.intensity): raise ValueError("Array lengths do not match.") return data diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index fb096308..be908630 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -1,676 +1,675 @@ """ Write spectrum files from MS2PIP predictions. """ + from __future__ import annotations import csv import itertools import logging import os +import re +from abc import ABC, abstractmethod from ast import literal_eval +from collections import defaultdict from functools import wraps from io import StringIO from operator import itemgetter from pathlib import Path from time import localtime, strftime -from typing import Any, Dict, List +from typing import Any, Dict, Generator, List, Optional, Union import numpy as np +from psm_utils import PSM, Peptidoform +from pyteomics import proforma +from sqlalchemy import engine, select +from ms2pip._utils import dlib from ms2pip.result import ProcessingResult +from ms2pip.spectrum import Spectrum -logger = logging.getLogger(__name__) +def _peptidoform_str_without_charge(peptidoform: Peptidoform) -> str: + """Get peptidoform string without charge.""" + return re.sub(r"\/\d+$", "", str(peptidoform)) -class InvalidWriteModeError(ValueError): - pass - # def _generate_peprec_dict(self, rt_to_seconds=True): - # """ - # Create easy to access dict from all_preds and peprec dataframes - # """ - # peprec_tmp = self.peprec.copy() +def _unlogarithmize(intensities: np.array) -> np.array: + """Undo logarithmic transformation of intensities.""" + return (2**intensities) - 0.001 - # if self.has_rt and rt_to_seconds: - # peprec_tmp["rt"] = peprec_tmp["rt"] * 60 - # peprec_tmp.index = peprec_tmp["spec_id"] - # peprec_tmp.drop("spec_id", axis=1, inplace=True) +def _tic_normalize(intensities: np.array): + """Normalize intensities to total ion current (TIC).""" + return intensities / intensities.sum() - # self.peprec_dict = peprec_tmp.to_dict(orient="index") - # def _generate_preds_dict(self): - # """ - # Create easy to access dict from results data. - # """ - # self.preds_dict = {} +def _basepeak_normalize(intensities: np.array, basepeak: Optional[float] = None) -> np.array: + """Normalize intensities to most intense peak.""" + if not basepeak: + basepeak = intensities.max() + return intensities / basepeak - # for result in self.results: - # spec_id = result.psm_index - # if spec_id in self.preds_dict.keys(): - # for ion, ion_numbers in result.theoretical_mz.items(): - # for i, ion_number in enumerate(ion_numbers): - # mz = result.theoretical_mz[ion][i] - # prediction = result.predicted_intensity[ion][i] if result.predicted_intensity else None - # if ion in self.preds_dict[spec_id]["peaks"]: - # self.preds_dict[spec_id]["peaks"][ion].append((ion_number, mz, prediction)) - # else: - # self.preds_dict[spec_id]["peaks"][ion] = [(ion_number, mz, prediction)] - # else: - # self.preds_dict[spec_id] = { - # "peaks": {} - # } +class _Writer(ABC): + """Abstract base class for writing spectrum files.""" - # if result.psm and result.psm.peptidoform: - # self.preds_dict[spec_id]["charge"] = result.psm.peptidoform.precursor_charge - -def _tic_normalize(intensities: np.array): - """Normalize intensities to total ion current (TIC).""" - return intensities / intensities.sum() + def __init__(self, file: Union[str, Path, StringIO], write_mode: str = "w"): + self.ssl_file = file + self.write_mode = write_mode -def _basepeak_normalize(intensities: np.array): - """Normalize intensities to most intense peak.""" - return intensities / intensities.max() + self._open_file = None -def _get_msp_modifications(sequence, modifications): - """ - Format modifications in MSP-style, e.g. "1/0,E,Glu->pyro-Glu" - """ + def __enter__(self): + """Open file in context manager.""" + if isinstance(self.ssl_file, (str, Path)): + self.ssl_file = Path(self.ssl_file) + self._open_file = open(self.ssl_file, self.write_mode) + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self._open_file: + self._open_file.close() - if isinstance(modifications, str): - if not modifications or modifications == "-": - msp_modifications = "0" + @property + def _file_object(self): + """Get file object from file path or StringIO.""" + if self._open_file: + return self._open_file else: - mods = modifications.split("|") - mods = [(int(mods[i]), mods[i + 1]) for i in range(0, len(mods), 2)] - mods = [(x, y) if x == 0 else (x - 1, y) for (x, y) in mods] - mods = sorted(mods) - mods = [(str(x), sequence[x], y) for (x, y) in mods] - msp_modifications = "/".join([",".join(list(x)) for x in mods]) - msp_modifications = f"{len(mods)}/{msp_modifications}" - else: - msp_modifications = "0" - - return msp_modifications - -def _parse_protein_string(protein_list): - """ - Parse protein string from list, list string literal, or string. - """ - if isinstance(protein_list, list): - protein_string = "/".join(protein_list) - elif isinstance(protein_list, str): - try: - protein_string = "/".join(literal_eval(protein_list)) - except ValueError: - protein_string = protein_list - else: - protein_string = "" - return protein_string - - # def _get_last_ssl_scannr(self): - # """ - # Return scan number of last line in a Bibliospec SSL file. - # """ - # ssl_filename = "{}_predictions.ssl".format(self.output_filename) - # with open(ssl_filename, "rt") as ssl: - # for line in ssl: - # last_line = line - # last_scannr = int(last_line.split("\t")[1]) - # return last_scannr - - # def _generate_diff_modification_mapping(self, precision): - # """ - # Make modification name -> ssl modification name mapping. - # """ - # self.diff_modification_mapping[precision] = { - # ptm.split(",")[0]: "{0:+.{1}f}".format(float(ptm.split(",")[1]), precision) - # for ptm in self.params["ptm"] - # } - - # def _get_diff_modified_sequence(self, sequence, modifications, precision=1): - # """ - # Build BiblioSpec SSL modified sequence string. - # """ - # pep = list(sequence) - # mapping = self.diff_modification_mapping[precision] - - # for loc, name in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): - # # C-term mod - # if loc == "-1": - # pep[-1] = pep[-1] + "[{}]".format(mapping[name]) - # # N-term mod - # elif loc == "0": - # pep[0] = pep[0] + "[{}]".format(mapping[name]) - # # Normal mod - # else: - # pep[int(loc) - 1] = pep[int(loc) - 1] + "[{}]".format(mapping[name]) - # return "".join(pep) - - # @output_format("msp") - # @writer( - # file_suffix="_predictions.msp", - # normalization_method="basepeak_10000", - # requires_dicts=True, - # requires_diff_modifications=False, - # ) - - -def write_msp(processing_results: List[ProcessingResult], file_object): - """Construct MSP string and write to file_object.""" - for result in processing_results: - sequence = result.psm.peptidoform.sequence - charge = result.psm.peptidoform.precursor_charge - mw = result.psm.peptidoform.theoretical_mass - result = result.as_spectra()[0] - num_peaks = len(result.mz) - intensity_normalized = _basepeak_normalize(result.intensity) * 10000 - - peaks = zip(result.mz, intensity_normalized, result.annotations) - - # TODO: change modification info for comment line to use result - # comment_line = f" Mods={msp_modifications} Parent={prec_mz}" - - # if self.has_protein_list: - # protein_list = self.peprec_dict[spec_id]["protein_list"] - # protein_string = self._parse_protein_string(protein_list) - # comment_line += f' Protein="{protein_string}"' - - # if self.has_rt: - # rt = self.peprec_dict[spec_id]["rt"] - # comment_line += f" RetentionTime={rt}" - - # comment_line += f' MS2PIP_ID="{spec_id}"' - - out = [ - f"Name: {sequence}/{charge}", - f"MW: {mw}", - # f"Comment:{comment_line}", - f"Num peaks: {num_peaks}", - [f"{mz}\t{intensity}\t{annotation}/0.0\n" for mz, intensity, annotation in peaks] - ] - file_object.writelines(''.join(line) if isinstance(line, list) else line + "\n" for line in out) - file_object.write('\n') + if isinstance(self.ssl_file, StringIO): + return self.ssl_file + elif isinstance(self.ssl_file, (str, Path)): + raise TypeError("Use `with` statement to open file from path.") + else: + raise TypeError("Unsupported type for `file`.") + + def write(self, processing_results: List[ProcessingResult]): + """Write multiple processing results to file.""" + for result in processing_results: + self._write_result(result) -def write_mgf(processing_results: List[ProcessingResult], file_object): - """ - Construct MGF string and write to file_object - """ - - for result in processing_results: + @abstractmethod + def _write_result(self, result: ProcessingResult): + """Write single processing result to file.""" + ... + + +class TSV(_Writer): + """Write TSV files from MS2PIP processing results.""" + + field_names = [ + "psm_index", + "ion_type", + "ion_number", + "mz", + "predicted", + "observed", + "rt", + ] + + def write(self, processing_results: List[ProcessingResult]): + """Write multiple processing results to file.""" + writer = csv.DictWriter(self._file_object, fieldnames=self.field_names, delimiter="\t") + if self.write_mode == "w": + writer.writeheader() + for result in processing_results: + self._write_result(result, writer) + + def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): + """Write single processing result to file.""" + # Only write results with predictions or observations + if not result.theoretical_mz: + return + + for ion_type in result.theoretical_mz: + for i in range(len(result.theoretical_mz[ion_type])): + writer.writerow(self._write_row(result, ion_type, i)) + + @staticmethod + def _write_row(result: ProcessingResult, ion_type: str, ion_index: int): + """Write single row for TSV file.""" + return { + "psm_index": result.psm_index, + "ion_type": ion_type, + "ion_number": ion_index + 1, + "mz": "{:.10g}".format(result.theoretical_mz[ion_type][ion_index]), + "predicted": "{:.10g}".format(result.predicted_intensity[ion_type][ion_index]) + if result.predicted_intensity + else None, + "observed": "{:.10g}".format(result.observed_intensity[ion_type][ion_index]) + if result.observed_intensity + else None, + "rt": result.psm.retention_time if result.psm.retention_time else None, + } - sequence = result.psm.peptidoform.sequence - charge = result.psm.peptidoform.precursor_charge - mw = result.psm.peptidoform.theoretical_mass - result = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(result.intensity) * 10000 +class MSP(_Writer): + """Write MSP files from MS2PIP processing results.""" - peaks = zip(result.mz, intensity_normalized) + def write(self, results: List[ProcessingResult]): + """Write multiple processing results to file.""" + for result in results: + self._write_result(result) - out = [ - "BEGIN IONs", - f"TITLE={result.psm_index} {sequence}/{charge}", - f"PEPMASS={mw}", - f"CHARGE={charge}+", - [f"{mz}\t{intensity}\n" for mz, intensity in peaks], - "END IONs\n" + def _write_result(self, result: ProcessingResult): + """Write single processing result to file.""" + predicted_spectrum = result.as_spectra()[0] + intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 + peaks = zip(predicted_spectrum.mz, intensity_normalized, predicted_spectrum.annotations) + + # Header + lines = [ + f"Name: {result.psm.peptidoform.sequence}/{result.psm.get_precursor_charge()}", + f"MW: {result.psm.peptidoform.theoretical_mass}", + self._format_comment_line(result.psm), + f"Num peaks: {len(predicted_spectrum.mz)}", ] - file_object.writelines(''.join(line) if isinstance(line, list) else line + "\n" for line in out) - - - # for spec_id in sorted(self.peprec_dict.keys()): - # seq = self.peprec_dict[spec_id]["peptide"] - # mods = self.peprec_dict[spec_id]["modifications"] - # charge = self.peprec_dict[spec_id]["charge"] - # _, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - # msp_modifications = self._get_msp_modifications(seq, mods) - - # if self.has_protein_list: - # protein_list = self.peprec_dict[spec_id]["protein_list"] - # protein_string = self._parse_protein_string(protein_list) - # else: - # protein_string = "" - - # out = [ - # "BEGIN IONS", - # f"TITLE={spec_id} {seq}/{charge} {msp_modifications} {protein_string}", - # f"PEPMASS={prec_mz}", - # f"CHARGE={charge}+", - # ] - - # if self.has_rt: - # rt = self.peprec_dict[spec_id]["rt"] - # out.append(f"RTINSECONDS={rt}") - - # out.append( - # self._get_msp_peak_annotation( - # self.preds_dict[spec_id]["peaks"], - # sep=" ", - # include_annotations=False, - # ) - # ) - # out.append("END IONS\n") - # file_object.writelines([line for line in out] + ["\n"]) - -# @output_format("spectronaut") -# @writer( -# file_suffix="_predictions_spectronaut.csv", -# normalization_method="tic", -# requires_dicts=False, -# requires_diff_modifications=True, -# ) -def write_spectronaut(processing_results: List[ProcessingResult], file_obj, header: bool): - """ - Construct spectronaut DataFrame and write to file_object. - """ - # if "w" in self.write_mode: - # header = True - # elif "a" in self.write_mode: - # header = False - # else: - # raise InvalidWriteModeError(self.write_mode) - - spectronaut_peprec = self.peprec.copy() - - # ModifiedPeptide and PrecursorMz columns - spectronaut_peprec["ModifiedPeptide"] = spectronaut_peprec.apply( - lambda row: self._get_diff_modified_sequence(row["peptide"], row["modifications"]), - axis=1, - ) - spectronaut_peprec["PrecursorMz"] = spectronaut_peprec.apply( - lambda row: self.mods.calc_precursor_mz( - row["peptide"], row["modifications"], row["charge"] - )[1], - axis=1, - ) - spectronaut_peprec["ModifiedPeptide"] = "_" + spectronaut_peprec["ModifiedPeptide"] + "_" - - # Additional columns - spectronaut_peprec["FragmentLossType"] = "noloss" - - # Retention time - if "rt" in spectronaut_peprec.columns: - rt_cols = ["iRT"] - spectronaut_peprec["iRT"] = spectronaut_peprec["rt"] - else: - rt_cols = [] - - # ProteinId - if self.has_protein_list: - spectronaut_peprec["ProteinId"] = spectronaut_peprec["protein_list"].apply( - self._parse_protein_string + + # Peaks + lines.extend( + f"{mz:.10g}\t{intensity:.10g}\t{annotation}/0.0" for mz, intensity, annotation in peaks ) - else: - spectronaut_peprec["ProteinId"] = spectronaut_peprec["spec_id"] - - # Rename columns and merge with predictions - spectronaut_peprec = spectronaut_peprec.rename( - columns={"charge": "PrecursorCharge", "peptide": "StrippedPeptide"} - ) - peptide_cols = ( - [ - "ModifiedPeptide", - "StrippedPeptide", - "PrecursorCharge", - "PrecursorMz", - "ProteinId", + + # Write to file + self._file_object.writelines(line + "\n" for line in lines) + self._file_object.write("\n") + + @staticmethod + def _format_modifications(peptidoform: Peptidoform): + """Format modifications in MSP-style string, e.g. ``Mods=1/0,E,Glu->pyro-Glu``.""" + + def _format_single_modification( + amino_acid: str, + position: int, + modifications: Optional[List[proforma.ModificationBase]], + ) -> Union[str, None]: + """Get modification label from :py:class:`proforma.ModificationBase` list.""" + if not modifications: + return None + if len(modifications) > 1: + raise ValueError("Multiple modifications per amino acid not supported.") + modification = modifications[0] + return f"{position},{amino_acid},{modification.name}" + + sequence_mods = [ + _format_single_modification(aa, pos + 1, mods) + for pos, (aa, mods) in enumerate(peptidoform.parsed_sequence) ] - + rt_cols - + ["FragmentLossType"] - ) - spectronaut_df = spectronaut_peprec[peptide_cols + ["spec_id"]] - spectronaut_df = self.all_preds.merge(spectronaut_df, on="spec_id") - - # Fragment columns - spectronaut_df["FragmentCharge"] = ( - spectronaut_df["ion"].str.contains("2").map({True: 2, False: 1}) - ) - spectronaut_df["FragmentType"] = spectronaut_df["ion"].str[0].str.lower() - - # Rename and sort columns - spectronaut_df = spectronaut_df.rename( - columns={ - "mz": "FragmentMz", - "prediction": "RelativeIntensity", - "ionnumber": "FragmentNumber", - } - ) - fragment_cols = [ - "FragmentCharge", + n_term = _format_single_modification( + peptidoform.sequence[0], 0, peptidoform.properties["n_term"] + ) + c_term = _format_single_modification( + peptidoform.sequence[-1], -1, peptidoform.properties["c_term"] + ) + + mods = [mod for mod in [n_term] + sequence_mods + [c_term] if mod is not None] + + if not mods: + return "Mods=0" + else: + return f"Mods={len(mods)}/{'/'.join(sorted(mods))}" + + @staticmethod + def _format_parent_mass(peptidoform: Peptidoform) -> str: + """Format parent mass as string.""" + return f"Parent={peptidoform.theoretical_mz}" + + @staticmethod + def _format_protein_string(psm: PSM) -> Union[str, None]: + """Format protein list as string.""" + if psm.protein_list: + return f"Protein={','.join(psm.protein_list)}" + else: + return None + + @staticmethod + def _format_retention_time(psm: PSM) -> Union[str, None]: + """Format retention time as string.""" + if psm.retention_time: + return f"RetentionTime={psm.retention_time}" + else: + return None + + @staticmethod + def _format_identifier(psm: PSM) -> str: + """Format MS2PIP ID as string.""" + return f"SpectrumIdentifier={psm.spectrum_id}" + + @staticmethod + def _format_comment_line(psm: PSM) -> str: + """Format comment line for MSP file.""" + comments = " ".join( + filter( + None, + [ + MSP._format_modifications(psm.peptidoform), + MSP._format_parent_mass(psm.peptidoform), + MSP._format_protein_string(psm), + MSP._format_retention_time(psm), + MSP._format_identifier(psm), + ], + ) + ) + return f"Comment: {comments}" + + +class MGF(_Writer): + """Write MGF files from MS2PIP processing results.""" + + def write(self, results: List[ProcessingResult]): + """Write multiple processing results to file.""" + for result in results: + self._write_result(result) + + def _write_result(self, result: ProcessingResult): + """Write single processing result to file.""" + predicted_spectrum = result.as_spectra()[0] + intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 + peaks = zip(predicted_spectrum.mz, intensity_normalized) + + # Header + lines = [ + "BEGIN IONS", + f"TITLE={result.psm.peptidoform}", + f"PEPMASS={result.psm.peptidoform.theoretical_mz}", + f"CHARGE={result.psm.get_precursor_charge()}+", + f"SCANS={result.psm.spectrum_id}", + f"RTINSECONDS={result.psm.retention_time}" if result.psm.retention_time else None, + ] + + # Peaks + lines.extend(f"{mz:.10g} {intensity:.10g}" for mz, intensity in peaks) + + # Write to file + self._file_object.writelines(line + "\n" for line in lines if line) + self._file_object.write("END IONS\n") + + +class Spectronaut(_Writer): + """Write Spectronaut files from MS2PIP processing results.""" + + field_names = [ + "ModifiedPeptide", + "StrippedPeptide", + "PrecursorCharge", + "PrecursorMz", + "IonMobility", + "iRT", + "ProteinId", + "RelativeFragmentIntensity", "FragmentMz", - "RelativeIntensity", "FragmentType", "FragmentNumber", + "FragmentCharge", + "FragmentLossType", ] - spectronaut_df = spectronaut_df[peptide_cols + fragment_cols] - try: - spectronaut_df.to_csv( - file_obj, index=False, header=header, sep=";", lineterminator="\n" - ) - except TypeError: # Pandas < 1.5 (Required for Python 3.7 support) - spectronaut_df.to_csv( - file_obj, index=False, header=header, sep=";", line_terminator="\n" - ) - - return file_obj -def _write_bibliospec_core(self, file_obj_ssl, file_obj_ms2, start_scannr=0): - """Construct Bibliospec SSL/MS2 strings and write to file_objects.""" + def write(self, processing_results: List[ProcessingResult]): + """Write multiple processing results to file.""" + writer = csv.DictWriter(self._file_object, fieldnames=self.field_names, delimiter="\t") + if self.write_mode == "w": + writer.writeheader() + for result in processing_results: + self._write_result(result, writer) + + def _write_result(self, result: ProcessingResult, writer: csv.DictWriter): + """Write single processing result to file.""" + # Only write results with predictions + if result.predicted_intensity is None: + return + psm_info = self._process_psm(result.psm) + for fragment_info in self._yield_fragment_info(result): + writer.writerow({**psm_info, **fragment_info}) + + @staticmethod + def _process_psm(psm: PSM) -> Dict[str, Any]: + """Process PSM to Spectronaut format.""" + return { + "ModifiedPeptide": _peptidoform_str_without_charge(psm.peptidoform), + "StrippedPeptide": psm.peptidoform.sequence, + "PrecursorCharge": psm.get_precursor_charge(), + "PrecursorMz": f"{psm.peptidoform.theoretical_mz:.10g}", + "IonMobility": f"{psm.ion_mobility:.10g}" if psm.ion_mobility else None, + "iRT": f"{psm.retention_time:.10g}" if psm.retention_time else None, + "ProteinId": "".join(psm.protein_list) if psm.protein_list else None, + } - for i, spec_id in enumerate(sorted(self.preds_dict.keys())): - scannr = i + start_scannr - seq = self.peprec_dict[spec_id]["peptide"] - mods = self.peprec_dict[spec_id]["modifications"] - charge = self.peprec_dict[spec_id]["charge"] - prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge) - ms2_filename = os.path.basename(self.output_filename) + "_predictions.ms2" + @staticmethod + def _yield_fragment_info(result: ProcessingResult) -> Generator[Dict[str, Any], None, None]: + """Yield fragment information for a processing result.""" + # Normalize intensities + intensities = { + ion_type: _unlogarithmize(intensities) + for ion_type, intensities in result.predicted_intensity.items() + } + max_intensity = max(itertools.chain(*intensities.values())) + intensities = { + ion_type: _basepeak_normalize(intensities[ion_type], basepeak=max_intensity) + for ion_type in intensities + } + for ion_type in result.predicted_intensity: + fragment_type = ion_type[0].lower() + fragment_charge = ion_type[1:] if len(ion_type) > 1 else "1" + for ion_index, (intensity, mz) in enumerate( + zip(intensities[ion_type], result.theoretical_mz[ion_type]) + ): + yield { + "RelativeFragmentIntensity": f"{intensity:.10g}", + "FragmentMz": f"{mz:.10g}", + "FragmentType": fragment_type, + "FragmentNumber": ion_index + 1, + "FragmentCharge": fragment_charge, + "FragmentLossType": "noloss", + } + + +class Bibliospec(_Writer): + """Write Bibliospec files from MS2PIP processing results.""" + + ssl_field_names = [ + "file", + "scan", + "charge", + "sequence", + "score-type", + "score", + "retention-time", + ] - peaks = self._get_msp_peak_annotation( - self.preds_dict[spec_id]["peaks"], - sep="\t", - include_annotations=False, + def __init__( + self, + ssl_file: Union[str, Path, StringIO], + ms2_file: Union[str, Path, StringIO], + write_mode: str = "w", + ): + """ + Write Bibliospec files from MS2PIP processing results. + + Parameters + ---------- + ssl_file : Union[str, Path, StringIO] + Path to SSL file or StringIO object. + ms2_file : Union[str, Path, StringIO] + Path to MS2 file or StringIO object. + write_mode : str + Write mode for files. Default is "w". + """ + + self.ssl_file = ssl_file + self.ms2_file = ms2_file + self.write_mode = write_mode + + self._open_ssl_file = None + self._open_ms2_file = None + + def __enter__(self): + """Open file in context manager.""" + if isinstance(self.ssl_file, (str, Path)): + self.ssl_file = Path(self.ssl_file) + self._open_ssl_file = open(self.ssl_file, self.write_mode) + if isinstance(self.ms2_file, (str, Path)): + self.ms2_file = Path(self.ms2_file) + self._open_ms2_file = open(self.ms2_file, self.write_mode) + return self + + def __exit__(self, exc_type, exc_value, traceback): + if self._open_ssl_file: + self._open_ssl_file.close() + if self._open_ms2_file: + self._open_ms2_file.close() + + @property + def _ssl_file_object(self): + """Get SSL file object from file path or StringIO.""" + if self._open_ssl_file: + return self._open_ssl_file + else: + if isinstance(self.ssl_file, StringIO): + return self.ssl_file + elif isinstance(self.ssl_file, (str, Path)): + raise TypeError("Use `with` statement to open file from path.") + else: + raise TypeError("Unsupported type for `file`.") + + @property + def _ms2_file_object(self): + """Get MS2 file object from file path or StringIO.""" + if self._open_ms2_file: + return self._open_ms2_file + else: + if isinstance(self.ms2_file, StringIO): + return self.ms2_file + elif isinstance(self.ms2_file, (str, Path)): + raise TypeError("Use `with` statement to open file from path.") + else: + raise TypeError("Unsupported type for `file`.") + + def write(self, processing_results: List[ProcessingResult]): + """Write multiple processing results to file.""" + # Create CSV writer + ssl_dict_writer = csv.DictWriter( + self._ssl_file_object, fieldnames=self.ssl_field_names, delimiter="\t" ) - if isinstance(mods, str) and mods != "-" and mods != "": - mod_seq = self._get_diff_modified_sequence(seq, mods) + # Write headers + if self.write_mode == "w": + ssl_dict_writer.writeheader() + self._write_ms2_header() + start_scan_number = 0 + elif self.write_mode == "a": + start_scan_number = self._get_last_ssl_scan_number(self.ssl_file) + 1 else: - mod_seq = seq + raise ValueError(f"Unsupported write mode: {self.write_mode}") + + # Write results + for i, result in enumerate(processing_results): + scan_number = start_scan_number + i + modified_sequence = self._format_modified_sequence(result.psm.peptidoform) + self._write_result(result, modified_sequence, scan_number, ssl_dict_writer) + + def _write_ms2_header(self): + """Write header to MS2 file.""" + self._ms2_file_object.write( + f"H\tCreationDate\t{strftime('%Y-%m-%d %H:%M:%S', localtime())}\n" + ) + self._ms2_file_object.write("H\tExtractor\tMS2PIP predictions\n") + + def _write_result( + self, + result: ProcessingResult, + modified_sequence: str, + scan_number: int, + writer: csv.DictWriter, + ): + """Write single processing result to files.""" + self._write_result_to_ssl(result, modified_sequence, scan_number, writer) + self._write_result_to_ms2(result, modified_sequence, scan_number) + + def _write_result_to_ssl( + self, + result: ProcessingResult, + modified_sequence: str, + scan_number: int, + writer: csv.DictWriter, + ): + """Write single processing result to the SSL file.""" + writer.writerow( + { + "file": self.ms2_file.name if isinstance(self.ms2_file, Path) else "file.ms2", + "scan": scan_number, + "charge": result.psm.get_precursor_charge(), + "sequence": modified_sequence, + "score-type": None, + "score": None, + "retention-time": result.psm.retention_time if result.psm.retention_time else None, + } + ) - rt = self.peprec_dict[spec_id]["rt"] if self.has_rt else "" + def _write_result_to_ms2( + self, result: ProcessingResult, modified_sequence: str, scan_number: int + ): + """Write single processing result to the MS2 file.""" + predicted_spectrum = result.as_spectra()[0] + intensity_normalized = _basepeak_normalize(predicted_spectrum.intensity) * 1e4 + peaks = zip(predicted_spectrum.mz, intensity_normalized) + + # Header + lines = [ + f"S\t{scan_number}\t{result.psm.peptidoform.theoretical_mz}", + f"Z\t{result.psm.get_precursor_charge()}\t{result.psm.peptidoform.theoretical_mass}", + f"D\tseq\t{result.psm.peptidoform.sequence}", + f"D\tmodified seq\t{modified_sequence}", + ] - # TODO: implement csv instead of manual writing - file_obj_ssl.write( - "\t".join([ms2_filename, str(scannr), str(charge), mod_seq, "", "", str(rt)]) - + "\n" - ) - file_obj_ms2.write( - "\n".join( - [ - f"S\t{scannr}\t{prec_mz}", - f"Z\t{charge}\t{prec_mass}", - f"D\tseq\t{seq}", - f"D\tmodified seq\t{mod_seq}", - peaks, - ] - ) - + "\n" + # Peaks + lines.extend(f"{mz:.10g}\t{intensity:.10g}" for mz, intensity in peaks) + + # Write to file + self._ms2_file_object.writelines(line + "\n" for line in lines) + self._ms2_file_object.write("\n") + + @staticmethod + def _format_modified_sequence(peptidoform: Peptidoform) -> str: + """Format modified sequence as string for Spectronaut.""" + modification_dict = defaultdict(list) + for term, position in [("n_term", 0), ("c_term", len(peptidoform) - 1)]: + if peptidoform.properties[term]: + modification_dict[position].extend(peptidoform.properties[term]) + for position, (_, mods) in enumerate(peptidoform.parsed_sequence): + if mods: + modification_dict[position].extend(mods) + return "".join( + [ + f"{aa}{''.join([f'[{mod.mass:+.1f}]' for mod in modification_dict[position]])}" + for position, aa in enumerate(peptidoform.sequence) + ] ) -# def _write_general( -# self, -# write_function, -# file_suffix, -# normalization_method, -# requires_dicts, -# requires_diff_modifications, -# diff_modification_precision=1, -# ): -# """ -# General write function to call core write functions. - -# Note: Does not work for write_bibliospec and write_dlib functions. -# """ - -# # Normalize if necessary and make dicts -# if not self.normalization == normalization_method: -# self._normalize_spectra(method=normalization_method) -# if requires_dicts: -# self._generate_preds_dict() -# elif requires_dicts and not self.preds_dict: -# self._generate_preds_dict() -# if requires_dicts and not self.peprec_dict: -# self._generate_peprec_dict() - -# if ( -# requires_diff_modifications -# and diff_modification_precision not in self.diff_modification_mapping -# ): -# self._generate_diff_modification_mapping(diff_modification_precision) - -# # Write to file or stringbuffer -# if self.return_stringbuffer: -# file_object = StringIO() -# logger.info("Writing results to StringIO using %s", write_function.__name__) -# else: -# f_name = self.output_filename + file_suffix -# file_object = open(f_name, self.write_mode) -# logger.info("Writing results to %s", f_name) - -# write_function(self, file_object) - -# return file_object - -# @output_format("bibliospec") -# def write_bibliospec(self): -# """Write MS2PIP predictions to BiblioSpec/Skyline SSL and MS2 spectral library files.""" -# precision = 1 -# if precision not in self.diff_modification_mapping: -# self._generate_diff_modification_mapping(precision) - -# # Normalize if necessary and make dicts -# if not self.normalization == "basepeak_10000": -# self._normalize_spectra(method="basepeak_10000") -# self._generate_preds_dict() -# elif not self.preds_dict: -# self._generate_preds_dict() -# if not self.peprec_dict: -# self._generate_peprec_dict() - -# if self.return_stringbuffer: -# file_obj_ssl = StringIO() -# file_obj_ms2 = StringIO() -# else: -# file_obj_ssl = open("{}_predictions.ssl".format(self.output_filename), self.write_mode) -# file_obj_ms2 = open("{}_predictions.ms2".format(self.output_filename), self.write_mode) - -# # If a new file is written, write headers -# if "w" in self.write_mode: -# start_scannr = 0 -# ssl_header = [ -# "file", -# "scan", -# "charge", -# "sequence", -# "score-type", -# "score", -# "retention-time", -# "\n", -# ] -# file_obj_ssl.write("\t".join(ssl_header)) -# file_obj_ms2.write( -# "H\tCreationDate\t{}\n".format(strftime("%Y-%m-%d %H:%M:%S", localtime())) -# ) -# file_obj_ms2.write("H\tExtractor\tMS2PIP predictions\n") -# else: -# # Get last scan number of ssl file, to continue indexing from there -# # because Bibliospec speclib scan numbers can only be integers -# start_scannr = self._get_last_ssl_scannr() + 1 - -# self._write_bibliospec_core(file_obj_ssl, file_obj_ms2, start_scannr=start_scannr) - -# return file_obj_ssl, file_obj_ms2 - -def _write_dlib_metadata(self, connection): - from sqlalchemy import select - - from ms2pip._utils.dlib import DLIB_VERSION, Metadata - - with connection.begin(): - version = connection.execute( - select([Metadata.c.Value]).where(Metadata.c.Key == "version") - ).scalar() - if version is None: - connection.execute( - Metadata.insert().values( - Key="version", - Value=DLIB_VERSION, - ) - ) + @staticmethod + def _get_last_ssl_scan_number(ssl_file: Union[str, Path, StringIO]): + """Read scan number of last line in a Bibliospec SSL file.""" + if isinstance(ssl_file, StringIO): + ssl_file.seek(0) + for line in ssl_file: + last_line = line + elif isinstance(ssl_file, (str, Path)): + with open(ssl_file, "rt") as ssl: + for line in ssl: + last_line = line + else: + raise TypeError("Unsupported type for `ssl_file`.") + return int(last_line.split("\t")[1]) -def _write_dlib_entries(processing_results: List[ProcessingResult], connection, precision, output_filename: str): - from ms2pip._utils.dlib import Entry - peptide_to_proteins = set() +class DLIB(_Writer): + """ + Write DLIB files from MS2PIP processing results. - with connection.begin(): - - for result in processing_results: + See https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for + documentation on the DLIB format. - precursor_mz = result.psm.precursor_mz - precursor_charge = result.psm.peptidoform.precursor_charge - seq = result.psm.peptidoform.sequence - rt = result.psm.retention_time #TODO: check if this is the right rt - mzs = result.theoretical_mz - - # TODO: get mod_seq - # mod_seq = - - if result.psm.protein_list is not None: - protein_list = result.psm.protein_list - if isinstance(protein_list, str): - protein_list = literal_eval(protein_list) - - for protein in protein_list: - peptide_to_proteins.add((seq, protein)) - - result = result.as_spectra()[0] - intensity_normalized = _basepeak_normalize(result.intensity) * 10000 - - connection.execute( - Entry.insert().values( - PrecursorMz=precursor_mz, - PrecursorCharge=precursor_charge, - PeptideModSeq=mod_seq, - PeptideSeq=seq, - Copies=1, - RTInSeconds=rt, - Score=0, - MassEncodedLength=len(mzs), - MassArray=mzs, - IntensityEncodedLength=len(intensity_normalized), - IntensityArray=intensity_normalized, - SourceFile=output_filename, - ) - ) - return peptide_to_proteins - -def _write_dlib_peptide_to_protein(connection, peptide_to_proteins): - from ms2pip._utils.dlib import PeptideToProtein - - # if not self.has_protein_list: - # return - - with connection.begin(): - sql_peptide_to_proteins = set() - proteins = {protein for _, protein in peptide_to_proteins} - for peptide_to_protein in connection.execute( - PeptideToProtein.select().where(PeptideToProtein.c.ProteinAccession.in_(proteins)) - ): - sql_peptide_to_proteins.add( - ( - peptide_to_protein.PeptideSeq, - peptide_to_protein.ProteinAccession, - ) - ) + """ - peptide_to_proteins.difference_update(sql_peptide_to_proteins) - for seq, protein in peptide_to_proteins: - connection.execute( - PeptideToProtein.insert().values( - PeptideSeq=seq, isDecoy=False, ProteinAccession=protein + def write(self, processing_results: List[ProcessingResult]): + """Write MS2PIP predictions to a DLIB SQLite file.""" + with dlib.open_sqlite(self.file) as connection: + dlib.metadata.create_all() + self._write_metadata(connection) + self._write_entries(processing_results, connection, self.file) + self._write_peptide_to_protein(processing_results, connection) + + def _write_result(self, result: ProcessingResult): + """Write single processing result to file.""" + ... + + @staticmethod + def _format_modified_sequence(peptidoform: Peptidoform) -> str: + """Format modified sequence as string for DLIB.""" + # TODO: Implement + # From the EncyclopeDIA DLIB documentation: + # PeptideModSeq has strings like "QKEC[+57.0214635]SDK" to indicate PTMs. PTMs are always + # encoded as delta masses (including fixed PTMs such as carbamidomethylation). Sites can + # only have one PTM mass, so compound masses are allowed, such as + # "M[+58.00548]ELS[+79.966331]C[+57.0214635]PGSR", where +58.00548 indicates both + # acetylation and oxidation. N- and C-terminus PTMs should be annotated on the first or + # last amino acid in the peptide, respectively. Metabolic labels can be incorporated in + # the same way, for example EC[+57.0214635]SDK[+8.014199]. + raise NotImplementedError + + @staticmethod + def _write_metadata(connection: engine.Connection): + with connection.begin(): + version = connection.execute( + select([dlib.Metadata.c.Value]).where(dlib.Metadata.c.Key == "version") + ).scalar() + if version is None: + connection.execute( + dlib.Metadata.insert().values( + Key="version", + Value=dlib.DLIB_VERSION, + ) ) - ) -# @output_format("dlib") -def write_dlib(processing_results: List[ProcessingResult], output_filename: str): - """Write MS2PIP predictions to a DLIB SQLite file.""" - from ms2pip._utils.dlib import metadata, open_sqlite - - # normalization = "basepeak_10000" - precision = 5 - # if not self.normalization == normalization: - # self._normalize_spectra(method=normalization) - # self._generate_preds_dict() - # if not self.peprec_dict: - # self._generate_peprec_dict() - # if precision not in self.diff_modification_mapping: - # self._generate_diff_modification_mapping(precision) - - filename = "{}.dlib".format(output_filename) - logger.info("Writing results to %s", filename) - - logger.debug( - "write mode is ignored for DLIB at the file mode, although append or not is respected" - ) - # if "a" not in self.write_mode and os.path.exists(filename): - # os.remove(filename) - - # if self.return_stringbuffer: - # raise NotImplementedError("`return_stringbuffer` not implemented for DLIB output.") - - # if not self.has_rt: - # raise NotImplementedError("Retention times required to write DLIB file.") - - with open_sqlite(filename) as connection: - metadata.create_all() - _write_dlib_metadata(connection) - peptide_to_proteins = _write_dlib_entries(processing_results, connection, precision, filename) - _write_dlib_peptide_to_protein(connection, peptide_to_proteins) - -def write_csv(processing_results: List[ProcessingResult], file_obj): - """Write processing results to CSV file.""" - fieldnames = [ - "spec_id", - "charge", - "ion", - "ionnumber", - "mz", - "prediction", - "rt" - ] - writer = csv.DictWriter(file_obj, fieldnames=fieldnames, lineterminator="\n") - writer.writeheader() - for result in processing_results: - if result.theoretical_mz is not None: - for ion_type in result.theoretical_mz: - for i in range(len(result.theoretical_mz[ion_type])): - writer.writerow( - { - "spec_id": result.psm_index, - "charge": result.psm.peptidoform.precursor_charge, - "ion": ion_type, # TODO: change to upper(ion_type) ? - "ionnumber": i + 1, - "mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]), - "prediction": "{:.6g}".format( - result.predicted_intensity[ion_type][i] - ), - "rt": result.psm.retention_time # TODO: results in empty string while testing - } + @staticmethod + def _write_entries( + processing_results: List[ProcessingResult], + connection: engine.Connection, + output_filename: str, + ): + with connection.begin(): + for result in processing_results: + if not result.psm.retention_time: + raise ValueError("Retention time required to write DLIB file.") + + spectrum = result.as_spectra()[0] + intensity_normalized = _basepeak_normalize(spectrum.intensity) * 1e4 + n_peaks = len(spectrum.mz) + + connection.execute( + dlib.Entry.insert().values( + PrecursorMz=result.psm.precursor_mz, + PrecursorCharge=result.psm.get_precursor_charge(), + PeptideModSeq=DLIB._format_modified_sequence(result.psm.peptidoform), + PeptideSeq=result.psm.peptidoform.sequence, + Copies=1, + RTInSeconds=result.psm.retention_time, + Score=0, + MassEncodedLength=n_peaks, + MassArray=spectrum.mz, + IntensityEncodedLength=n_peaks, + IntensityArray=intensity_normalized, + SourceFile=output_filename, ) + ) + + @staticmethod + def _write_peptide_to_protein(results: List[ProcessingResult], connection: engine.Connection): + from ms2pip._utils.dlib import PeptideToProtein + peptide_to_proteins = { + (result.psm.peptidoform.sequence, protein) + for result in results + for protein in result.psm.protein_list + } -# def write_single_spectrum_csv(spectrum, filepath): -# """Write a single spectrum to a CSV file.""" -# with open(filepath, "wt") as f: -# writer = csv.writer(f, delimiter=",", lineterminator="\n") -# writer.writerow(["mz", "intensity", "annotation"]) -# for mz, intensity, annotation in zip( -# spectrum.mz, -# spectrum.intensity, -# spectrum.annotations, -# ): -# writer.writerow([mz, intensity, annotation]) - - -# def write_single_spectrum_png(spectrum, filepath): -# """Plot a single spectrum and write to a PNG file.""" -# import matplotlib.pyplot as plt -# import spectrum_utils.plot as sup - -# ax = plt.gca() -# ax.set_title("MS²PIP prediction for " + str(spectrum.peptidoform)) -# sup.spectrum(spectrum.to_spectrum_utils(), ax=ax) -# plt.savefig(Path(filepath).with_suffix(".png")) -# plt.close() + with connection.begin(): + sql_peptide_to_proteins = set() + proteins = {protein for _, protein in peptide_to_proteins} + for peptide_to_protein in connection.execute( + PeptideToProtein.select().where(PeptideToProtein.c.ProteinAccession.in_(proteins)) + ): + sql_peptide_to_proteins.add( + ( + peptide_to_protein.PeptideSeq, + peptide_to_protein.ProteinAccession, + ) + ) + + peptide_to_proteins.difference_update(sql_peptide_to_proteins) + for seq, protein in peptide_to_proteins: + connection.execute( + PeptideToProtein.insert().values( + PeptideSeq=seq, isDecoy=False, ProteinAccession=protein + ) + ) diff --git a/tests/test_spectrum_output.py b/tests/test_spectrum_output.py index 0760af58..0cf41765 100644 --- a/tests/test_spectrum_output.py +++ b/tests/test_spectrum_output.py @@ -1,100 +1,34 @@ -import os -import re -import numpy as np -import pandas as pd +from psm_utils import Peptidoform -from ms2pip.ms2pip_tools.spectrum_output import SpectrumOutput +from ms2pip.spectrum_output import MSP, Bibliospec -TEST_DIR = os.path.dirname(__file__) - - -class TestSpectrumOutput: - def test_integration(self): - def compare_line(test_line, target_line): - """Assert if two lines in spectrum output are the same.""" - - # Extract float values from line and use assert_allclose, to allow for - # float imprecisions - float_pattern = re.compile(r"[0-9]*[.][0-9]+") - test_floats = float_pattern.findall(test_line) - target_floats = float_pattern.findall(target_line) - assert len(test_floats) == len(target_floats) - [ - np.testing.assert_allclose(float(te), float(ta), rtol=1e-5) - for te, ta in zip(test_floats, target_floats) - ] - assert float_pattern.sub(test_line, "") == float_pattern.sub( - target_line, "" - ) - - peprec = pd.read_pickle( - os.path.join(TEST_DIR, "test_data/spectrum_output/input_peprec.pkl") - ) - all_preds = pd.read_pickle( - os.path.join(TEST_DIR, "test_data/spectrum_output/input_preds.pkl") - ) - - params = { - "ptm": [ - "Oxidation,15.994915,opt,M", - "Carbamidomethyl,57.021464,opt,C", - "Glu->pyro-Glu,-18.010565,opt,E", - "Gln->pyro-Glu,-17.026549,opt,Q", - "Acetyl,42.010565,opt,N-term", - ], - "sptm": [], - "gptm": [], - "model": "HCD", - "frag_error": "0.02", - "out": "csv", - } - - peprec_tmp = peprec.sample(5, random_state=10).copy() - all_preds_tmp = all_preds[ - all_preds["spec_id"].isin(peprec_tmp["spec_id"]) - ].copy() - - so = SpectrumOutput( - all_preds_tmp, - peprec_tmp, - params, - output_filename="test", - return_stringbuffer=True, - ) - - target_filename_base = os.path.join( - TEST_DIR, "test_data/spectrum_output/target" - ) - - # Test general output +class TestMSP: + def test__format_modification_string(self): test_cases = [ - (so.write_mgf, "_predictions.mgf"), - (so.write_msp, "_predictions.msp"), - (so.write_spectronaut, "_predictions_spectronaut.csv"), + ("ACDE/2", "Mods=0"), + ("AC[Carbamidomethyl]DE/2", "Mods=1/2,C,Carbamidomethyl"), + ("[Glu->pyro-Glu]-EPEPTIDEK/2", "Mods=1/0,E,Glu->pyro-Glu"), + ("PEPTIDEK-[Amidated]/2", "Mods=1/-1,K,Amidated"), + ("AM[Oxidation]C[Carbamidomethyl]DE/2", "Mods=2/2,M,Oxidation/3,C,Carbamidomethyl"), ] - for test_function, file_ext in test_cases: - test = test_function() - test.seek(0) - with open(target_filename_base + file_ext) as target: - for test_line, target_line in zip(test.readlines(), target.readlines()): - compare_line(test_line, target_line) + for peptidoform_str, expected_output in test_cases: + peptidoform = Peptidoform(peptidoform_str) + assert MSP._format_modifications(peptidoform) == expected_output + - # Test bibliospec output - bibliospec_ssl, bibliospec_ms2 = so.write_bibliospec() +class TestBiblioSpec: + def test__format_modified_sequence(self): test_cases = [ - (bibliospec_ssl, "_predictions.ssl"), - (bibliospec_ms2, "_predictions.ms2"), + ("ACDE/2", "ACDE"), + ("AC[Carbamidomethyl]DE/2", "AC[+57.0]DE"), + ("[Glu->pyro-Glu]-EPEPTIDEK/2", "E[-18.0]PEPTIDEK"), + ("PEPTIDEK-[Amidated]/2", "PEPTIDEK[-1.0]"), + ("AM[Oxidation]C[Carbamidomethyl]DE/2", "AM[+16.0]C[+57.0]DE"), ] - for test, file_ext in test_cases: - test.seek(0) - with open(target_filename_base + file_ext) as target: - for test_line, target_line in zip(test.readlines(), target.readlines()): - test_line = test_line.replace( - "test_predictions.ms2", "target_predictions.ms2" - ) - if not "CreationDate" in target_line: - compare_line(test_line, target_line) + for peptidoform_str, expected_output in test_cases: + peptidoform = Peptidoform(peptidoform_str) + assert Bibliospec._format_modified_sequence(peptidoform) == expected_output From 5afa8359231b8c0ac41ae597e996f31c6edad364 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 01:02:06 +0200 Subject: [PATCH 10/14] Remove unused imports --- ms2pip/spectrum_output.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index be908630..a89272c2 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -6,15 +6,10 @@ import csv import itertools -import logging -import os import re from abc import ABC, abstractmethod -from ast import literal_eval from collections import defaultdict -from functools import wraps from io import StringIO -from operator import itemgetter from pathlib import Path from time import localtime, strftime from typing import Any, Dict, Generator, List, Optional, Union @@ -26,7 +21,6 @@ from ms2pip._utils import dlib from ms2pip.result import ProcessingResult -from ms2pip.spectrum import Spectrum def _peptidoform_str_without_charge(peptidoform: Peptidoform) -> str: From 375ea0726a6fada8baccb9ae49a0ea264f83b1c8 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 13:56:33 +0200 Subject: [PATCH 11/14] Finish DLIB; add general write_spectra function... --- docs/source/api/ms2pip.constants.rst | 5 - docs/source/api/ms2pip.spectrum-output.rst | 5 + ms2pip/constants.py | 3 - ms2pip/core.py | 3 +- ms2pip/spectrum_output.py | 314 +++++++++++++-------- tests/test_spectrum_output.py | 17 +- 6 files changed, 226 insertions(+), 121 deletions(-) diff --git a/docs/source/api/ms2pip.constants.rst b/docs/source/api/ms2pip.constants.rst index 1781590a..f7aeb2e8 100644 --- a/docs/source/api/ms2pip.constants.rst +++ b/docs/source/api/ms2pip.constants.rst @@ -2,11 +2,6 @@ ms2pip.constants **************** -.. py:data:: ms2pip.constants.SUPPORTED_OUTPUT_FORMATS - :type: list - - Supported file formats for spectrum output - .. py:data:: ms2pip.constants.MODELS :type: dict diff --git a/docs/source/api/ms2pip.spectrum-output.rst b/docs/source/api/ms2pip.spectrum-output.rst index 5047c237..6cb1b366 100644 --- a/docs/source/api/ms2pip.spectrum-output.rst +++ b/docs/source/api/ms2pip.spectrum-output.rst @@ -4,3 +4,8 @@ ms2pip.spectrum_output .. automodule:: ms2pip.spectrum_output :members: + +.. py:data:: ms2pip.spectrum_output.SUPPORTED_FORMATS + :type: dict + + Supported file formats and respective :py:class:`_Writer` class for spectrum output. diff --git a/ms2pip/constants.py b/ms2pip/constants.py index ba475391..c72affe2 100644 --- a/ms2pip/constants.py +++ b/ms2pip/constants.py @@ -1,8 +1,5 @@ """Constants and fixed configurations for MS²PIP.""" -# Supported output formats -SUPPORTED_OUTPUT_FORMATS = ["csv", "mgf", "msp", "bibliospec", "spectronaut", "dlib"] - # Models and their properties # id is passed to get_predictions to select model # ion_types is required to write the ion types in the headers of the result files diff --git a/ms2pip/core.py b/ms2pip/core.py index e3023764..50801235 100644 --- a/ms2pip/core.py +++ b/ms2pip/core.py @@ -23,9 +23,10 @@ from ms2pip._utils.psm_input import read_psms from ms2pip._utils.retention_time import RetentionTime from ms2pip._utils.xgb_models import get_predictions_xgb, validate_requested_xgb_model -from ms2pip.constants import MODELS, SUPPORTED_OUTPUT_FORMATS +from ms2pip.constants import MODELS from ms2pip.result import ProcessingResult, calculate_correlations from ms2pip.spectrum_input import read_spectrum_file +from ms2pip.spectrum_output import SUPPORTED_FORMATS, write_spectra logger = logging.getLogger(__name__) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index a89272c2..d422cad3 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -1,5 +1,39 @@ """ -Write spectrum files from MS2PIP predictions. +Write spectrum files from MS²PIP prediction results. + + +Examples +-------- + +The simplest way to write MS²PIP predictions to a file is to use the :py:func:`write_spectra` +function: + +>>> from ms2pip import predict_single, write_spectra +>>> results = [predict_single("ACDE/2")] +>>> write_spectra("/path/to/output/filename", results, "mgf") + +Specific writer classes can also be used directly. Writer classes should be used in a context +manager to ensure the file is properly closed after writing. The following example writes MS²PIP +predictions to a TSV file: + +>>> from ms2pip import predict_single +>>> results = [predict_single("ACDE/2")] +>>> with TSV("output.tsv") as writer: +... writer.write(results) + +Results can be written to the same file sequentially: + +>>> results_2 = [predict_single("PEPTIDEK/2")] +>>> with TSV("output.tsv", write_mode="a") as writer: +... writer.write(results) +... writer.write(results_2) + +Results can be written to an existing file using the append mode: + +>>> with TSV("output.tsv", write_mode="a") as writer: +... writer.write(results_2) + + """ from __future__ import annotations @@ -7,6 +41,7 @@ import csv import itertools import re +import warnings from abc import ABC, abstractmethod from collections import defaultdict from io import StringIO @@ -23,60 +58,77 @@ from ms2pip.result import ProcessingResult -def _peptidoform_str_without_charge(peptidoform: Peptidoform) -> str: - """Get peptidoform string without charge.""" - return re.sub(r"\/\d+$", "", str(peptidoform)) - - -def _unlogarithmize(intensities: np.array) -> np.array: - """Undo logarithmic transformation of intensities.""" - return (2**intensities) - 0.001 - - -def _tic_normalize(intensities: np.array): - """Normalize intensities to total ion current (TIC).""" - return intensities / intensities.sum() - +def write_spectra( + filename: Union[str, Path], + processing_results: List[ProcessingResult], + file_format: str, + write_mode: str = "w", +): + """ + Write MS2PIP processing results to a supported spectrum file format. + + Parameters + ---------- + filename + Output filename without file extension. + processing_results + List of :py:class:`ms2pip.result.ProcessingResult` objects. + file_format + File format to write. See :py:attr:`FILE_FORMATS` for available formats. + write_mode + Write mode for file. Default is ``w`` (write). Use ``a`` (append) to add to existing file. -def _basepeak_normalize(intensities: np.array, basepeak: Optional[float] = None) -> np.array: - """Normalize intensities to most intense peak.""" - if not basepeak: - basepeak = intensities.max() - return intensities / basepeak + """ + with SUPPORTED_FORMATS[file_format](filename, write_mode) as writer: + writer.write(processing_results) class _Writer(ABC): """Abstract base class for writing spectrum files.""" - def __init__(self, file: Union[str, Path, StringIO], write_mode: str = "w"): - self.ssl_file = file + suffix = ".txt" + + def __init__(self, filename: Union[str, Path], write_mode: str = "w"): + self.filename = Path(filename).with_suffix(self.suffix) self.write_mode = write_mode self._open_file = None def __enter__(self): """Open file in context manager.""" - if isinstance(self.ssl_file, (str, Path)): - self.ssl_file = Path(self.ssl_file) - self._open_file = open(self.ssl_file, self.write_mode) + self.open() return self def __exit__(self, exc_type, exc_value, traceback): + """Close file in context manager.""" + self.close() + + def __repr__(self): + return f"{self.__class__.__name__}({self.filename, self.write_mode})" + + def open(self): + """Open file.""" + if self._open_file: + self.close() + self._open_file = open(self.filename, self.write_mode) + + def close(self): + """Close file.""" if self._open_file: self._open_file.close() + self._open_file = None @property def _file_object(self): - """Get file object from file path or StringIO.""" + """Get open file object.""" if self._open_file: return self._open_file else: - if isinstance(self.ssl_file, StringIO): - return self.ssl_file - elif isinstance(self.ssl_file, (str, Path)): - raise TypeError("Use `with` statement to open file from path.") - else: - raise TypeError("Unsupported type for `file`.") + warnings.warn( + "Opening file outside of context manager. Manually close file after use." + ) + self.open() + return self._open_file def write(self, processing_results: List[ProcessingResult]): """Write multiple processing results to file.""" @@ -92,6 +144,7 @@ def _write_result(self, result: ProcessingResult): class TSV(_Writer): """Write TSV files from MS2PIP processing results.""" + suffix = ".tsv" field_names = [ "psm_index", "ion_type", @@ -104,7 +157,9 @@ class TSV(_Writer): def write(self, processing_results: List[ProcessingResult]): """Write multiple processing results to file.""" - writer = csv.DictWriter(self._file_object, fieldnames=self.field_names, delimiter="\t") + writer = csv.DictWriter( + self._file_object, fieldnames=self.field_names, delimiter="\t", lineterminator="\n" + ) if self.write_mode == "w": writer.writeheader() for result in processing_results: @@ -141,6 +196,8 @@ def _write_row(result: ProcessingResult, ion_type: str, ion_index: int): class MSP(_Writer): """Write MSP files from MS2PIP processing results.""" + suffix = ".msp" + def write(self, results: List[ProcessingResult]): """Write multiple processing results to file.""" for result in results: @@ -251,6 +308,8 @@ def _format_comment_line(psm: PSM) -> str: class MGF(_Writer): """Write MGF files from MS2PIP processing results.""" + suffix = ".mgf" + def write(self, results: List[ProcessingResult]): """Write multiple processing results to file.""" for result in results: @@ -283,6 +342,7 @@ def _write_result(self, result: ProcessingResult): class Spectronaut(_Writer): """Write Spectronaut files from MS2PIP processing results.""" + suffix = ".spectronaut.tsv" field_names = [ "ModifiedPeptide", "StrippedPeptide", @@ -301,7 +361,9 @@ class Spectronaut(_Writer): def write(self, processing_results: List[ProcessingResult]): """Write multiple processing results to file.""" - writer = csv.DictWriter(self._file_object, fieldnames=self.field_names, delimiter="\t") + writer = csv.DictWriter( + self._file_object, fieldnames=self.field_names, delimiter="\t", lineterminator="\n" + ) if self.write_mode == "w": writer.writeheader() for result in processing_results: @@ -359,8 +421,15 @@ def _yield_fragment_info(result: ProcessingResult) -> Generator[Dict[str, Any], class Bibliospec(_Writer): - """Write Bibliospec files from MS2PIP processing results.""" + """ + Write Bibliospec SSL and MS2 files from MS2PIP processing results. + + Bibliospec SSL and MS2 files are also compatible with Skyline. + """ + + ssl_suffix = ".ssl" + ms2_suffix = ".ms2" ssl_field_names = [ "file", "scan", @@ -371,79 +440,60 @@ class Bibliospec(_Writer): "retention-time", ] - def __init__( - self, - ssl_file: Union[str, Path, StringIO], - ms2_file: Union[str, Path, StringIO], - write_mode: str = "w", - ): - """ - Write Bibliospec files from MS2PIP processing results. - - Parameters - ---------- - ssl_file : Union[str, Path, StringIO] - Path to SSL file or StringIO object. - ms2_file : Union[str, Path, StringIO] - Path to MS2 file or StringIO object. - write_mode : str - Write mode for files. Default is "w". - """ - - self.ssl_file = ssl_file - self.ms2_file = ms2_file - self.write_mode = write_mode + def __init__(self, filename: Union[str, Path], write_mode: str = "w"): + super().__init__(filename, write_mode) + self.ssl_file = self.filename.with_suffix(self.ssl_suffix) + self.ms2_file = self.filename.with_suffix(self.ms2_suffix) self._open_ssl_file = None self._open_ms2_file = None - def __enter__(self): - """Open file in context manager.""" - if isinstance(self.ssl_file, (str, Path)): - self.ssl_file = Path(self.ssl_file) - self._open_ssl_file = open(self.ssl_file, self.write_mode) - if isinstance(self.ms2_file, (str, Path)): - self.ms2_file = Path(self.ms2_file) - self._open_ms2_file = open(self.ms2_file, self.write_mode) - return self + def open(self): + """Open files.""" + self._open_ssl_file = open(self.ssl_file, self.write_mode) + self._open_ms2_file = open(self.ms2_file, self.write_mode) - def __exit__(self, exc_type, exc_value, traceback): + def close(self): + """Close files.""" if self._open_ssl_file: self._open_ssl_file.close() + self._open_ssl_file = None if self._open_ms2_file: self._open_ms2_file.close() + self._open_ms2_file = None @property def _ssl_file_object(self): - """Get SSL file object from file path or StringIO.""" + """Get open SSL file object.""" if self._open_ssl_file: return self._open_ssl_file else: - if isinstance(self.ssl_file, StringIO): - return self.ssl_file - elif isinstance(self.ssl_file, (str, Path)): - raise TypeError("Use `with` statement to open file from path.") - else: - raise TypeError("Unsupported type for `file`.") + warnings.warn( + "Opening file outside of context manager. Manually close file after use." + ) + self.open() + return self._open_ssl_file @property def _ms2_file_object(self): - """Get MS2 file object from file path or StringIO.""" + """Get open MS2 file object.""" if self._open_ms2_file: return self._open_ms2_file else: - if isinstance(self.ms2_file, StringIO): - return self.ms2_file - elif isinstance(self.ms2_file, (str, Path)): - raise TypeError("Use `with` statement to open file from path.") - else: - raise TypeError("Unsupported type for `file`.") + warnings.warn( + "Opening file outside of context manager. Manually close file after use." + ) + self.open() + return self._open_ms2_file def write(self, processing_results: List[ProcessingResult]): """Write multiple processing results to file.""" # Create CSV writer ssl_dict_writer = csv.DictWriter( - self._ssl_file_object, fieldnames=self.ssl_field_names, delimiter="\t" + self._ssl_file_object, + fieldnames=self.ssl_field_names, + delimiter="\t", + lineterminator="\n", ) # Write headers @@ -560,39 +610,53 @@ class DLIB(_Writer): """ Write DLIB files from MS2PIP processing results. - See https://bitbucket.org/searleb/encyclopedia/wiki/EncyclopeDIA%20File%20Formats for - documentation on the DLIB format. + See `EncyclopeDIA File Formats `_ + for documentation on the DLIB format. """ + suffix = ".dlib" + + def open(self): + """Open file.""" + if self.write_mode == "w": + self._open_file = self.filename.unlink(missing_ok=True) + self._open_file = dlib.open_sqlite(self.filename) + def write(self, processing_results: List[ProcessingResult]): """Write MS2PIP predictions to a DLIB SQLite file.""" - with dlib.open_sqlite(self.file) as connection: - dlib.metadata.create_all() - self._write_metadata(connection) - self._write_entries(processing_results, connection, self.file) - self._write_peptide_to_protein(processing_results, connection) + connection = self._file_object + dlib.metadata.create_all() + self._write_metadata(connection) + self._write_entries(processing_results, connection, self.filename) + self._write_peptide_to_protein(processing_results, connection) - def _write_result(self, result: ProcessingResult): - """Write single processing result to file.""" - ... + def _write_result(self, result: ProcessingResult): ... @staticmethod def _format_modified_sequence(peptidoform: Peptidoform) -> str: """Format modified sequence as string for DLIB.""" - # TODO: Implement - # From the EncyclopeDIA DLIB documentation: - # PeptideModSeq has strings like "QKEC[+57.0214635]SDK" to indicate PTMs. PTMs are always - # encoded as delta masses (including fixed PTMs such as carbamidomethylation). Sites can - # only have one PTM mass, so compound masses are allowed, such as - # "M[+58.00548]ELS[+79.966331]C[+57.0214635]PGSR", where +58.00548 indicates both - # acetylation and oxidation. N- and C-terminus PTMs should be annotated on the first or - # last amino acid in the peptide, respectively. Metabolic labels can be incorporated in - # the same way, for example EC[+57.0214635]SDK[+8.014199]. - raise NotImplementedError + # Sum all sequential mass shifts for each position + masses = [ + sum(mod.mass for mod in mods) if mods else 0 for _, mods in peptidoform.parsed_sequence + ] + + # Add N- and C-terminal modifications + for term, position in [("n_term", 0), ("c_term", len(peptidoform) - 1)]: + if peptidoform.properties[term]: + masses[position] += sum(mod.mass for mod in peptidoform.properties[term]) + + # Format modified sequence + return "".join( + [ + f"{aa}[{mass:+.6f}]" if mass else aa + for aa, mass in zip(peptidoform.sequence, masses) + ] + ) @staticmethod def _write_metadata(connection: engine.Connection): + """Write metadata to DLIB SQLite file.""" with connection.begin(): version = connection.execute( select([dlib.Metadata.c.Value]).where(dlib.Metadata.c.Key == "version") @@ -611,6 +675,7 @@ def _write_entries( connection: engine.Connection, output_filename: str, ): + """Write spectra to DLIB SQLite file.""" with connection.begin(): for result in processing_results: if not result.psm.retention_time: @@ -622,7 +687,7 @@ def _write_entries( connection.execute( dlib.Entry.insert().values( - PrecursorMz=result.psm.precursor_mz, + PrecursorMz=result.psm.peptidoform.theoretical_mz, PrecursorCharge=result.psm.get_precursor_charge(), PeptideModSeq=DLIB._format_modified_sequence(result.psm.peptidoform), PeptideSeq=result.psm.peptidoform.sequence, @@ -630,20 +695,20 @@ def _write_entries( RTInSeconds=result.psm.retention_time, Score=0, MassEncodedLength=n_peaks, - MassArray=spectrum.mz, + MassArray=spectrum.mz.tolist(), IntensityEncodedLength=n_peaks, - IntensityArray=intensity_normalized, - SourceFile=output_filename, + IntensityArray=intensity_normalized.tolist(), + SourceFile=str(output_filename), ) ) @staticmethod def _write_peptide_to_protein(results: List[ProcessingResult], connection: engine.Connection): - from ms2pip._utils.dlib import PeptideToProtein - + """Write peptide-to-protein mappings to DLIB SQLite file.""" peptide_to_proteins = { (result.psm.peptidoform.sequence, protein) for result in results + if result.psm.protein_list for protein in result.psm.protein_list } @@ -651,7 +716,9 @@ def _write_peptide_to_protein(results: List[ProcessingResult], connection: engin sql_peptide_to_proteins = set() proteins = {protein for _, protein in peptide_to_proteins} for peptide_to_protein in connection.execute( - PeptideToProtein.select().where(PeptideToProtein.c.ProteinAccession.in_(proteins)) + dlib.PeptideToProtein.select().where( + dlib.PeptideToProtein.c.ProteinAccession.in_(proteins) + ) ): sql_peptide_to_proteins.add( ( @@ -663,7 +730,34 @@ def _write_peptide_to_protein(results: List[ProcessingResult], connection: engin peptide_to_proteins.difference_update(sql_peptide_to_proteins) for seq, protein in peptide_to_proteins: connection.execute( - PeptideToProtein.insert().values( + dlib.PeptideToProtein.insert().values( PeptideSeq=seq, isDecoy=False, ProteinAccession=protein ) ) + + +SUPPORTED_FORMATS = { + "tsv": TSV, + "msp": MSP, + "mgf": MGF, + "spectronaut": Spectronaut, + "bibliospec": Bibliospec, + "dlib": DLIB, +} + + +def _peptidoform_str_without_charge(peptidoform: Peptidoform) -> str: + """Get peptidoform string without charge.""" + return re.sub(r"\/\d+$", "", str(peptidoform)) + + +def _unlogarithmize(intensities: np.array) -> np.array: + """Undo logarithmic transformation of intensities.""" + return (2**intensities) - 0.001 + + +def _basepeak_normalize(intensities: np.array, basepeak: Optional[float] = None) -> np.array: + """Normalize intensities to most intense peak.""" + if not basepeak: + basepeak = intensities.max() + return intensities / basepeak diff --git a/tests/test_spectrum_output.py b/tests/test_spectrum_output.py index 0cf41765..9ddf8d65 100644 --- a/tests/test_spectrum_output.py +++ b/tests/test_spectrum_output.py @@ -1,7 +1,6 @@ - from psm_utils import Peptidoform -from ms2pip.spectrum_output import MSP, Bibliospec +from ms2pip.spectrum_output import MSP, Bibliospec, DLIB class TestMSP: @@ -32,3 +31,17 @@ def test__format_modified_sequence(self): for peptidoform_str, expected_output in test_cases: peptidoform = Peptidoform(peptidoform_str) assert Bibliospec._format_modified_sequence(peptidoform) == expected_output + + +class TestDLIB: + def test__format_modified_sequence(self): + test_cases = [ + ("ACDE/2", "ACDE"), + ("AC[Carbamidomethyl]DE/2", "AC[+57.021464]DE"), + ("[Glu->pyro-Glu]-EPEPTIDEK/2", "E[-18.010565]PEPTIDEK"), + ("PEPTIDEK-[Amidated]/2", "PEPTIDEK[-0.984016]"), + ("AM[Oxidation]C[Carbamidomethyl]DE/2", "AM[+15.994915]C[+57.021464]DE"), + ] + + for test_in, expected_out in test_cases: + assert DLIB._format_modified_sequence(Peptidoform(test_in)) == expected_out From cdb8ca166a1aac6c82ebe874687589c20dd34fa9 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 15:14:18 +0200 Subject: [PATCH 12/14] Clip negative predicted intensities; Allow psm_list with multiple runs/collections for prediction --- ms2pip/_utils/psm_input.py | 6 ------ ms2pip/_utils/xgb_models.py | 1 + ms2pip/core.py | 9 ++++++++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ms2pip/_utils/psm_input.py b/ms2pip/_utils/psm_input.py index db464c9b..7a0cacc6 100644 --- a/ms2pip/_utils/psm_input.py +++ b/ms2pip/_utils/psm_input.py @@ -7,8 +7,6 @@ import psm_utils.io.peptide_record from psm_utils import PSMList -from ms2pip import exceptions - logger = logging.getLogger(__name__) @@ -23,10 +21,6 @@ def read_psms(psms: Union[str, Path, PSMList], filetype: Union[str, None]) -> PS else: raise TypeError("Invalid type for psms. Should be str, Path, or PSMList.") - # Validate runs and collections - if not len(psm_list.collections) == 1 or not len(psm_list.runs) == 1: - raise exceptions.InvalidInputError("PSMs should be for a single run and collection.") - # Apply fixed modifications if any psm_list.apply_fixed_modifications() diff --git a/ms2pip/_utils/xgb_models.py b/ms2pip/_utils/xgb_models.py index ef292306..c1e7233a 100644 --- a/ms2pip/_utils/xgb_models.py +++ b/ms2pip/_utils/xgb_models.py @@ -53,6 +53,7 @@ def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1 for ion_type, xgb_model in xgboost_models.items(): # Get predictions from XGBoost model preds = xgb_model.predict(features) + preds = preds.clip(min=np.log2(0.001)) # Clip negative intensities xgb_model.__del__() # Reshape into arrays for each peptide diff --git a/ms2pip/core.py b/ms2pip/core.py index 50801235..cc4c096c 100644 --- a/ms2pip/core.py +++ b/ms2pip/core.py @@ -545,6 +545,10 @@ def process_spectra( If only peak annotations should be extracted from the spectrum file """ + # Validate runs and collections + if not len(psm_list.collections) == 1 or not len(psm_list.runs) == 1: + raise exceptions.InvalidInputError("PSMs should be for a single run and collection.") + args = ( spectrum_file, vector_file, @@ -673,7 +677,10 @@ def _process_peptidoform( MODELS[model]["peaks_version"], 30.0, # TODO: Remove CE feature ) - predictions = {i: np.array(p, dtype=np.float32) for i, p in zip(ion_types, predictions)} + predictions = { + i: np.array(p, dtype=np.float32).clip(min=np.log2(0.001)) # Clip negative intensities + for i, p in zip(ion_types, predictions) + } feature_vectors = None return ProcessingResult( From bc65c937a344bd9f3a63fc9da9d1dabdb2ef49e0 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 15:15:22 +0200 Subject: [PATCH 13/14] Implement spectrum output for CLI functions --- ms2pip/__main__.py | 76 +++++++++++++++++---------------------- ms2pip/result.py | 40 +++------------------ ms2pip/spectrum_output.py | 34 ++++++++++-------- 3 files changed, 56 insertions(+), 94 deletions(-) diff --git a/ms2pip/__main__.py b/ms2pip/__main__.py index 7960c5f1..818c514f 100644 --- a/ms2pip/__main__.py +++ b/ms2pip/__main__.py @@ -10,17 +10,12 @@ from werkzeug.utils import secure_filename import ms2pip.core -from ms2pip import __version__ +from ms2pip import __version__, exceptions from ms2pip._utils.cli import build_credits, build_prediction_table -from ms2pip.constants import MODELS, SUPPORTED_OUTPUT_FORMATS -from ms2pip.exceptions import ( - InvalidXGBoostModelError, - UnknownModelError, - UnknownOutputFormatError, - UnresolvableModificationError, -) -from ms2pip.result import correlations_to_csv, results_to_csv -from ms2pip.spectrum_output import write_single_spectrum_csv, write_single_spectrum_png +from ms2pip.constants import MODELS +from ms2pip.plot import spectrum_to_png +from ms2pip.result import write_correlations +from ms2pip.spectrum_output import SUPPORTED_FORMATS, write_spectra console = Console() logger = logging.getLogger(__name__) @@ -44,7 +39,8 @@ def _infer_output_name( if output_name: return Path(output_name) else: - return Path(input_filename).with_suffix("") + input__filename = Path(input_filename) + return input__filename.with_name(input__filename.stem + "_predictions").with_suffix("") @click.group() @@ -65,15 +61,17 @@ def cli(*args, **kwargs): @cli.command(help=ms2pip.core.predict_single.__doc__) @click.argument("peptidoform", required=True) @click.option("--output-name", "-o", type=str) +@click.option("--output-format", "-f", type=click.Choice(SUPPORTED_FORMATS), default="tsv") @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--model-dir") @click.option("--plot", "-p", is_flag=True) def predict_single(*args, **kwargs): # Parse arguments output_name = kwargs.pop("output_name") + output_format = kwargs.pop("output_format") plot = kwargs.pop("plot") if not output_name: - output_name = "ms2pip_prediction_" + secure_filename(kwargs["peptidoform"]) + ".csv" + output_name = "ms2pip_prediction_" + secure_filename(kwargs["peptidoform"]) # Predict spectrum result = ms2pip.core.predict_single(*args, **kwargs) @@ -81,33 +79,29 @@ def predict_single(*args, **kwargs): # Write output console.print(build_prediction_table(predicted_spectrum)) - write_single_spectrum_csv(predicted_spectrum, output_name) + write_spectra(output_name, [result], output_format) if plot: - write_single_spectrum_png(predicted_spectrum, output_name) + spectrum_to_png(predicted_spectrum, output_name) @cli.command(help=ms2pip.core.predict_batch.__doc__) @click.argument("psms", required=True) @click.option("--output-name", "-o", type=str) -@click.option("--output-format", "-f", type=click.Choice(SUPPORTED_OUTPUT_FORMATS)) +@click.option("--output-format", "-f", type=click.Choice(SUPPORTED_FORMATS), default="tsv") @click.option("--add-retention-time", "-r", is_flag=True) @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--model-dir") @click.option("--processes", "-n", type=int) def predict_batch(*args, **kwargs): # Parse arguments - output_name = kwargs.pop("output_name") - output_format = kwargs.pop("output_format") # noqa F841 TODO - output_name = _infer_output_name(kwargs["psms"], output_name) + output_format = kwargs.pop("output_format") + output_name = _infer_output_name(kwargs["psms"], kwargs.pop("output_name")) # Run predictions = ms2pip.core.predict_batch(*args, **kwargs) # Write output - output_name_csv = output_name.with_name(output_name.stem + "_predictions").with_suffix(".csv") - logger.info(f"Writing output to {output_name_csv}") - results_to_csv(predictions, output_name_csv) - # TODO: add support for other output formats + write_spectra(output_name, predictions, output_format) @cli.command(help=ms2pip.core.predict_library.__doc__) @@ -129,24 +123,22 @@ def predict_library(*args, **kwargs): @click.option("--processes", "-n", type=int) def correlate(*args, **kwargs): # Parse arguments - output_name = kwargs.pop("output_name") - output_name = _infer_output_name(kwargs["psms"], output_name) + output_name = _infer_output_name(kwargs["psms"], kwargs.pop("output_name")) # Run results = ms2pip.core.correlate(*args, **kwargs) - # Write output - output_name_int = output_name.with_name(output_name.stem + "_predictions").with_suffix(".csv") - logger.info(f"Writing intensities to {output_name_int}") - results_to_csv(results, output_name_int) - # TODO: add support for other output formats + # Write intensities + logger.info(f"Writing intensities to {output_name.with_suffix('.tsv')}") + write_spectra(output_name, results, "tsv") # Write correlations if kwargs["compute_correlations"]: - output_name_corr = output_name.with_name(output_name.stem + "_correlations") - output_name_corr = output_name_corr.with_suffix(".csv") + output_name_corr = output_name.with_name(output_name.stem + "_correlations").with_suffix( + ".tsv" + ) logger.info(f"Writing correlations to {output_name_corr}") - correlations_to_csv(results, output_name_corr) + write_correlations(results, output_name_corr) @cli.command(help=ms2pip.core.get_training_data.__doc__) @@ -188,16 +180,16 @@ def annotate_spectra(*args, **kwargs): # Run results = ms2pip.core.annotate_spectra(*args, **kwargs) - # Write output - output_name_int = output_name.with_name(output_name.stem + "_observations").with_suffix(".csv") - logger.info(f"Writing intensities to {output_name_int}") - results_to_csv(results, output_name_int) + # Write intensities + output_name_int = output_name.with_name(output_name.stem + "_observations").with_suffix() + logger.info(f"Writing intensities to {output_name_int.with_suffix('.tsv')}") + write_spectra(output_name, results, "tsv") def main(): try: cli() - except UnresolvableModificationError as e: + except exceptions.UnresolvableModificationError as e: logger.critical( "Unresolvable modification: `%s`. See " "https://ms2pip.readthedocs.io/en/stable/usage/#amino-acid-modifications " @@ -205,15 +197,13 @@ def main(): e, ) sys.exit(1) - except UnknownOutputFormatError as o: - logger.critical( - f"Unknown output format: `{o}` (supported formats: `{SUPPORTED_OUTPUT_FORMATS}`)" - ) + except exceptions.UnknownOutputFormatError as o: + logger.critical(f"Unknown output format: `{o}` (supported formats: `{SUPPORTED_FORMATS}`)") sys.exit(1) - except UnknownModelError as f: + except exceptions.UnknownModelError as f: logger.critical(f"Unknown model: `{f}` (supported models: {set(MODELS.keys())})") sys.exit(1) - except InvalidXGBoostModelError: + except exceptions.InvalidXGBoostModelError: logger.critical("Could not correctly download XGBoost model\nTry a manual download.") sys.exit(1) except Exception: diff --git a/ms2pip/result.py b/ms2pip/result.py index 9d734bf3..bb5ef014 100644 --- a/ms2pip/result.py +++ b/ms2pip/result.py @@ -1,4 +1,5 @@ """Definition and handling of MS²PIP results.""" + from __future__ import annotations import csv @@ -6,7 +7,7 @@ import numpy as np from psm_utils import PSM -from pydantic import ConfigDict, BaseModel +from pydantic import BaseModel, ConfigDict try: import spectrum_utils.plot as sup @@ -115,44 +116,11 @@ def calculate_correlations(results: List[ProcessingResult]) -> None: result.correlation = np.corrcoef(pred_int, obs_int)[0][1] -def results_to_csv(results: List["ProcessingResult"], output_file: str) -> None: - """Write processing results to CSV file.""" - with open(output_file, "wt") as f: - fieldnames = [ - "psm_index", - "ion_type", - "ion_number", - "mz", - "predicted", - "observed", - ] - writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") - writer.writeheader() - for result in results: - if result.theoretical_mz is not None: - for ion_type in result.theoretical_mz: - for i in range(len(result.theoretical_mz[ion_type])): - writer.writerow( - { - "psm_index": result.psm_index, - "ion_type": ion_type, - "ion_number": i + 1, - "mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]), - "predicted": "{:.6g}".format( - result.predicted_intensity[ion_type][i] - ) if result.predicted_intensity else None, - "observed": "{:.6g}".format(result.observed_intensity[ion_type][i]) - if result.observed_intensity - else None, - } - ) - - -def correlations_to_csv(results: List["ProcessingResult"], output_file: str) -> None: +def write_correlations(results: List["ProcessingResult"], output_file: str) -> None: """Write correlations to CSV file.""" with open(output_file, "wt") as f: fieldnames = ["psm_index", "correlation"] - writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") + writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t", lineterminator="\n") writer.writeheader() for result in results: writer.writerow({"psm_index": result.psm_index, "correlation": result.correlation}) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index d422cad3..36c97be2 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -40,6 +40,7 @@ import csv import itertools +import logging import re import warnings from abc import ABC, abstractmethod @@ -57,11 +58,13 @@ from ms2pip._utils import dlib from ms2pip.result import ProcessingResult +LOGGER = logging.getLogger(__name__) + def write_spectra( filename: Union[str, Path], processing_results: List[ProcessingResult], - file_format: str, + file_format: str = "tsv", write_mode: str = "w", ): """ @@ -80,13 +83,14 @@ def write_spectra( """ with SUPPORTED_FORMATS[file_format](filename, write_mode) as writer: + LOGGER.info(f"Writing to {writer.filename}") writer.write(processing_results) class _Writer(ABC): """Abstract base class for writing spectrum files.""" - suffix = ".txt" + suffix = "" def __init__(self, filename: Union[str, Path], write_mode: str = "w"): self.filename = Path(filename).with_suffix(self.suffix) @@ -182,11 +186,11 @@ def _write_row(result: ProcessingResult, ion_type: str, ion_index: int): "psm_index": result.psm_index, "ion_type": ion_type, "ion_number": ion_index + 1, - "mz": "{:.10g}".format(result.theoretical_mz[ion_type][ion_index]), - "predicted": "{:.10g}".format(result.predicted_intensity[ion_type][ion_index]) + "mz": "{:.8f}".format(result.theoretical_mz[ion_type][ion_index]), + "predicted": "{:.8f}".format(result.predicted_intensity[ion_type][ion_index]) if result.predicted_intensity else None, - "observed": "{:.10g}".format(result.observed_intensity[ion_type][ion_index]) + "observed": "{:.8f}".format(result.observed_intensity[ion_type][ion_index]) if result.observed_intensity else None, "rt": result.psm.retention_time if result.psm.retention_time else None, @@ -219,7 +223,7 @@ def _write_result(self, result: ProcessingResult): # Peaks lines.extend( - f"{mz:.10g}\t{intensity:.10g}\t{annotation}/0.0" for mz, intensity, annotation in peaks + f"{mz:.8f}\t{intensity:.8f}\t{annotation}/0.0" for mz, intensity, annotation in peaks ) # Write to file @@ -259,7 +263,7 @@ def _format_single_modification( if not mods: return "Mods=0" else: - return f"Mods={len(mods)}/{'/'.join(sorted(mods))}" + return f"Mods={len(mods)}/{'/'.join(mods)}" @staticmethod def _format_parent_mass(peptidoform: Peptidoform) -> str: @@ -332,11 +336,11 @@ def _write_result(self, result: ProcessingResult): ] # Peaks - lines.extend(f"{mz:.10g} {intensity:.10g}" for mz, intensity in peaks) + lines.extend(f"{mz:.8f} {intensity:.8f}" for mz, intensity in peaks) # Write to file self._file_object.writelines(line + "\n" for line in lines if line) - self._file_object.write("END IONS\n") + self._file_object.write("END IONS\n\n") class Spectronaut(_Writer): @@ -385,9 +389,9 @@ def _process_psm(psm: PSM) -> Dict[str, Any]: "ModifiedPeptide": _peptidoform_str_without_charge(psm.peptidoform), "StrippedPeptide": psm.peptidoform.sequence, "PrecursorCharge": psm.get_precursor_charge(), - "PrecursorMz": f"{psm.peptidoform.theoretical_mz:.10g}", - "IonMobility": f"{psm.ion_mobility:.10g}" if psm.ion_mobility else None, - "iRT": f"{psm.retention_time:.10g}" if psm.retention_time else None, + "PrecursorMz": f"{psm.peptidoform.theoretical_mz:.8f}", + "IonMobility": f"{psm.ion_mobility:.8f}" if psm.ion_mobility else None, + "iRT": f"{psm.retention_time:.8f}" if psm.retention_time else None, "ProteinId": "".join(psm.protein_list) if psm.protein_list else None, } @@ -411,8 +415,8 @@ def _yield_fragment_info(result: ProcessingResult) -> Generator[Dict[str, Any], zip(intensities[ion_type], result.theoretical_mz[ion_type]) ): yield { - "RelativeFragmentIntensity": f"{intensity:.10g}", - "FragmentMz": f"{mz:.10g}", + "RelativeFragmentIntensity": f"{intensity:.8f}", + "FragmentMz": f"{mz:.8f}", "FragmentType": fragment_type, "FragmentNumber": ion_index + 1, "FragmentCharge": fragment_charge, @@ -567,7 +571,7 @@ def _write_result_to_ms2( ] # Peaks - lines.extend(f"{mz:.10g}\t{intensity:.10g}" for mz, intensity in peaks) + lines.extend(f"{mz:.8f}\t{intensity:.8f}" for mz, intensity in peaks) # Write to file self._ms2_file_object.writelines(line + "\n" for line in lines) From 8a91a57f3c232c56c9888a211229603910b55133 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 15:19:00 +0200 Subject: [PATCH 14/14] Fix linting errors --- ms2pip/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ms2pip/core.py b/ms2pip/core.py index cc4c096c..31feb3a9 100644 --- a/ms2pip/core.py +++ b/ms2pip/core.py @@ -26,7 +26,7 @@ from ms2pip.constants import MODELS from ms2pip.result import ProcessingResult, calculate_correlations from ms2pip.spectrum_input import read_spectrum_file -from ms2pip.spectrum_output import SUPPORTED_FORMATS, write_spectra +from ms2pip.spectrum_output import SUPPORTED_FORMATS logger = logging.getLogger(__name__) @@ -425,7 +425,7 @@ def _validate_output_formats(self, output_formats: List[str]) -> List[str]: self.output_formats = ["csv"] else: for output_format in output_formats: - if output_format not in SUPPORTED_OUTPUT_FORMATS: + if output_format not in SUPPORTED_FORMATS: raise exceptions.UnknownOutputFormatError(output_format) self.output_formats = output_formats