From bc65c937a344bd9f3a63fc9da9d1dabdb2ef49e0 Mon Sep 17 00:00:00 2001 From: RalfG Date: Wed, 8 May 2024 15:15:22 +0200 Subject: [PATCH] Implement spectrum output for CLI functions --- ms2pip/__main__.py | 76 +++++++++++++++++---------------------- ms2pip/result.py | 40 +++------------------ ms2pip/spectrum_output.py | 34 ++++++++++-------- 3 files changed, 56 insertions(+), 94 deletions(-) diff --git a/ms2pip/__main__.py b/ms2pip/__main__.py index 7960c5f..818c514 100644 --- a/ms2pip/__main__.py +++ b/ms2pip/__main__.py @@ -10,17 +10,12 @@ from werkzeug.utils import secure_filename import ms2pip.core -from ms2pip import __version__ +from ms2pip import __version__, exceptions from ms2pip._utils.cli import build_credits, build_prediction_table -from ms2pip.constants import MODELS, SUPPORTED_OUTPUT_FORMATS -from ms2pip.exceptions import ( - InvalidXGBoostModelError, - UnknownModelError, - UnknownOutputFormatError, - UnresolvableModificationError, -) -from ms2pip.result import correlations_to_csv, results_to_csv -from ms2pip.spectrum_output import write_single_spectrum_csv, write_single_spectrum_png +from ms2pip.constants import MODELS +from ms2pip.plot import spectrum_to_png +from ms2pip.result import write_correlations +from ms2pip.spectrum_output import SUPPORTED_FORMATS, write_spectra console = Console() logger = logging.getLogger(__name__) @@ -44,7 +39,8 @@ def _infer_output_name( if output_name: return Path(output_name) else: - return Path(input_filename).with_suffix("") + input__filename = Path(input_filename) + return input__filename.with_name(input__filename.stem + "_predictions").with_suffix("") @click.group() @@ -65,15 +61,17 @@ def cli(*args, **kwargs): @cli.command(help=ms2pip.core.predict_single.__doc__) @click.argument("peptidoform", required=True) @click.option("--output-name", "-o", type=str) +@click.option("--output-format", "-f", type=click.Choice(SUPPORTED_FORMATS), default="tsv") @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--model-dir") @click.option("--plot", "-p", is_flag=True) def predict_single(*args, **kwargs): # Parse arguments output_name = kwargs.pop("output_name") + output_format = kwargs.pop("output_format") plot = kwargs.pop("plot") if not output_name: - output_name = "ms2pip_prediction_" + secure_filename(kwargs["peptidoform"]) + ".csv" + output_name = "ms2pip_prediction_" + secure_filename(kwargs["peptidoform"]) # Predict spectrum result = ms2pip.core.predict_single(*args, **kwargs) @@ -81,33 +79,29 @@ def predict_single(*args, **kwargs): # Write output console.print(build_prediction_table(predicted_spectrum)) - write_single_spectrum_csv(predicted_spectrum, output_name) + write_spectra(output_name, [result], output_format) if plot: - write_single_spectrum_png(predicted_spectrum, output_name) + spectrum_to_png(predicted_spectrum, output_name) @cli.command(help=ms2pip.core.predict_batch.__doc__) @click.argument("psms", required=True) @click.option("--output-name", "-o", type=str) -@click.option("--output-format", "-f", type=click.Choice(SUPPORTED_OUTPUT_FORMATS)) +@click.option("--output-format", "-f", type=click.Choice(SUPPORTED_FORMATS), default="tsv") @click.option("--add-retention-time", "-r", is_flag=True) @click.option("--model", type=click.Choice(MODELS), default="HCD") @click.option("--model-dir") @click.option("--processes", "-n", type=int) def predict_batch(*args, **kwargs): # Parse arguments - output_name = kwargs.pop("output_name") - output_format = kwargs.pop("output_format") # noqa F841 TODO - output_name = _infer_output_name(kwargs["psms"], output_name) + output_format = kwargs.pop("output_format") + output_name = _infer_output_name(kwargs["psms"], kwargs.pop("output_name")) # Run predictions = ms2pip.core.predict_batch(*args, **kwargs) # Write output - output_name_csv = output_name.with_name(output_name.stem + "_predictions").with_suffix(".csv") - logger.info(f"Writing output to {output_name_csv}") - results_to_csv(predictions, output_name_csv) - # TODO: add support for other output formats + write_spectra(output_name, predictions, output_format) @cli.command(help=ms2pip.core.predict_library.__doc__) @@ -129,24 +123,22 @@ def predict_library(*args, **kwargs): @click.option("--processes", "-n", type=int) def correlate(*args, **kwargs): # Parse arguments - output_name = kwargs.pop("output_name") - output_name = _infer_output_name(kwargs["psms"], output_name) + output_name = _infer_output_name(kwargs["psms"], kwargs.pop("output_name")) # Run results = ms2pip.core.correlate(*args, **kwargs) - # Write output - output_name_int = output_name.with_name(output_name.stem + "_predictions").with_suffix(".csv") - logger.info(f"Writing intensities to {output_name_int}") - results_to_csv(results, output_name_int) - # TODO: add support for other output formats + # Write intensities + logger.info(f"Writing intensities to {output_name.with_suffix('.tsv')}") + write_spectra(output_name, results, "tsv") # Write correlations if kwargs["compute_correlations"]: - output_name_corr = output_name.with_name(output_name.stem + "_correlations") - output_name_corr = output_name_corr.with_suffix(".csv") + output_name_corr = output_name.with_name(output_name.stem + "_correlations").with_suffix( + ".tsv" + ) logger.info(f"Writing correlations to {output_name_corr}") - correlations_to_csv(results, output_name_corr) + write_correlations(results, output_name_corr) @cli.command(help=ms2pip.core.get_training_data.__doc__) @@ -188,16 +180,16 @@ def annotate_spectra(*args, **kwargs): # Run results = ms2pip.core.annotate_spectra(*args, **kwargs) - # Write output - output_name_int = output_name.with_name(output_name.stem + "_observations").with_suffix(".csv") - logger.info(f"Writing intensities to {output_name_int}") - results_to_csv(results, output_name_int) + # Write intensities + output_name_int = output_name.with_name(output_name.stem + "_observations").with_suffix() + logger.info(f"Writing intensities to {output_name_int.with_suffix('.tsv')}") + write_spectra(output_name, results, "tsv") def main(): try: cli() - except UnresolvableModificationError as e: + except exceptions.UnresolvableModificationError as e: logger.critical( "Unresolvable modification: `%s`. See " "https://ms2pip.readthedocs.io/en/stable/usage/#amino-acid-modifications " @@ -205,15 +197,13 @@ def main(): e, ) sys.exit(1) - except UnknownOutputFormatError as o: - logger.critical( - f"Unknown output format: `{o}` (supported formats: `{SUPPORTED_OUTPUT_FORMATS}`)" - ) + except exceptions.UnknownOutputFormatError as o: + logger.critical(f"Unknown output format: `{o}` (supported formats: `{SUPPORTED_FORMATS}`)") sys.exit(1) - except UnknownModelError as f: + except exceptions.UnknownModelError as f: logger.critical(f"Unknown model: `{f}` (supported models: {set(MODELS.keys())})") sys.exit(1) - except InvalidXGBoostModelError: + except exceptions.InvalidXGBoostModelError: logger.critical("Could not correctly download XGBoost model\nTry a manual download.") sys.exit(1) except Exception: diff --git a/ms2pip/result.py b/ms2pip/result.py index 9d734bf..bb5ef01 100644 --- a/ms2pip/result.py +++ b/ms2pip/result.py @@ -1,4 +1,5 @@ """Definition and handling of MSĀ²PIP results.""" + from __future__ import annotations import csv @@ -6,7 +7,7 @@ import numpy as np from psm_utils import PSM -from pydantic import ConfigDict, BaseModel +from pydantic import BaseModel, ConfigDict try: import spectrum_utils.plot as sup @@ -115,44 +116,11 @@ def calculate_correlations(results: List[ProcessingResult]) -> None: result.correlation = np.corrcoef(pred_int, obs_int)[0][1] -def results_to_csv(results: List["ProcessingResult"], output_file: str) -> None: - """Write processing results to CSV file.""" - with open(output_file, "wt") as f: - fieldnames = [ - "psm_index", - "ion_type", - "ion_number", - "mz", - "predicted", - "observed", - ] - writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") - writer.writeheader() - for result in results: - if result.theoretical_mz is not None: - for ion_type in result.theoretical_mz: - for i in range(len(result.theoretical_mz[ion_type])): - writer.writerow( - { - "psm_index": result.psm_index, - "ion_type": ion_type, - "ion_number": i + 1, - "mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]), - "predicted": "{:.6g}".format( - result.predicted_intensity[ion_type][i] - ) if result.predicted_intensity else None, - "observed": "{:.6g}".format(result.observed_intensity[ion_type][i]) - if result.observed_intensity - else None, - } - ) - - -def correlations_to_csv(results: List["ProcessingResult"], output_file: str) -> None: +def write_correlations(results: List["ProcessingResult"], output_file: str) -> None: """Write correlations to CSV file.""" with open(output_file, "wt") as f: fieldnames = ["psm_index", "correlation"] - writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") + writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t", lineterminator="\n") writer.writeheader() for result in results: writer.writerow({"psm_index": result.psm_index, "correlation": result.correlation}) diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py index d422cad..36c97be 100644 --- a/ms2pip/spectrum_output.py +++ b/ms2pip/spectrum_output.py @@ -40,6 +40,7 @@ import csv import itertools +import logging import re import warnings from abc import ABC, abstractmethod @@ -57,11 +58,13 @@ from ms2pip._utils import dlib from ms2pip.result import ProcessingResult +LOGGER = logging.getLogger(__name__) + def write_spectra( filename: Union[str, Path], processing_results: List[ProcessingResult], - file_format: str, + file_format: str = "tsv", write_mode: str = "w", ): """ @@ -80,13 +83,14 @@ def write_spectra( """ with SUPPORTED_FORMATS[file_format](filename, write_mode) as writer: + LOGGER.info(f"Writing to {writer.filename}") writer.write(processing_results) class _Writer(ABC): """Abstract base class for writing spectrum files.""" - suffix = ".txt" + suffix = "" def __init__(self, filename: Union[str, Path], write_mode: str = "w"): self.filename = Path(filename).with_suffix(self.suffix) @@ -182,11 +186,11 @@ def _write_row(result: ProcessingResult, ion_type: str, ion_index: int): "psm_index": result.psm_index, "ion_type": ion_type, "ion_number": ion_index + 1, - "mz": "{:.10g}".format(result.theoretical_mz[ion_type][ion_index]), - "predicted": "{:.10g}".format(result.predicted_intensity[ion_type][ion_index]) + "mz": "{:.8f}".format(result.theoretical_mz[ion_type][ion_index]), + "predicted": "{:.8f}".format(result.predicted_intensity[ion_type][ion_index]) if result.predicted_intensity else None, - "observed": "{:.10g}".format(result.observed_intensity[ion_type][ion_index]) + "observed": "{:.8f}".format(result.observed_intensity[ion_type][ion_index]) if result.observed_intensity else None, "rt": result.psm.retention_time if result.psm.retention_time else None, @@ -219,7 +223,7 @@ def _write_result(self, result: ProcessingResult): # Peaks lines.extend( - f"{mz:.10g}\t{intensity:.10g}\t{annotation}/0.0" for mz, intensity, annotation in peaks + f"{mz:.8f}\t{intensity:.8f}\t{annotation}/0.0" for mz, intensity, annotation in peaks ) # Write to file @@ -259,7 +263,7 @@ def _format_single_modification( if not mods: return "Mods=0" else: - return f"Mods={len(mods)}/{'/'.join(sorted(mods))}" + return f"Mods={len(mods)}/{'/'.join(mods)}" @staticmethod def _format_parent_mass(peptidoform: Peptidoform) -> str: @@ -332,11 +336,11 @@ def _write_result(self, result: ProcessingResult): ] # Peaks - lines.extend(f"{mz:.10g} {intensity:.10g}" for mz, intensity in peaks) + lines.extend(f"{mz:.8f} {intensity:.8f}" for mz, intensity in peaks) # Write to file self._file_object.writelines(line + "\n" for line in lines if line) - self._file_object.write("END IONS\n") + self._file_object.write("END IONS\n\n") class Spectronaut(_Writer): @@ -385,9 +389,9 @@ def _process_psm(psm: PSM) -> Dict[str, Any]: "ModifiedPeptide": _peptidoform_str_without_charge(psm.peptidoform), "StrippedPeptide": psm.peptidoform.sequence, "PrecursorCharge": psm.get_precursor_charge(), - "PrecursorMz": f"{psm.peptidoform.theoretical_mz:.10g}", - "IonMobility": f"{psm.ion_mobility:.10g}" if psm.ion_mobility else None, - "iRT": f"{psm.retention_time:.10g}" if psm.retention_time else None, + "PrecursorMz": f"{psm.peptidoform.theoretical_mz:.8f}", + "IonMobility": f"{psm.ion_mobility:.8f}" if psm.ion_mobility else None, + "iRT": f"{psm.retention_time:.8f}" if psm.retention_time else None, "ProteinId": "".join(psm.protein_list) if psm.protein_list else None, } @@ -411,8 +415,8 @@ def _yield_fragment_info(result: ProcessingResult) -> Generator[Dict[str, Any], zip(intensities[ion_type], result.theoretical_mz[ion_type]) ): yield { - "RelativeFragmentIntensity": f"{intensity:.10g}", - "FragmentMz": f"{mz:.10g}", + "RelativeFragmentIntensity": f"{intensity:.8f}", + "FragmentMz": f"{mz:.8f}", "FragmentType": fragment_type, "FragmentNumber": ion_index + 1, "FragmentCharge": fragment_charge, @@ -567,7 +571,7 @@ def _write_result_to_ms2( ] # Peaks - lines.extend(f"{mz:.10g}\t{intensity:.10g}" for mz, intensity in peaks) + lines.extend(f"{mz:.8f}\t{intensity:.8f}" for mz, intensity in peaks) # Write to file self._ms2_file_object.writelines(line + "\n" for line in lines)