From a22de42bc93ad8438dcbb1e54da86cc22db3646d Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Tue, 13 Aug 2024 16:42:40 +0200 Subject: [PATCH 01/16] use mz_read to read mgfs and mzmls This adds support for gzipped files, without the need for us to implement it. --- Cargo.toml | 1 + src/lib.rs | 2 +- src/parse_mzdata.rs | 21 +++++---------------- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d7fa121..f34b2b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,4 @@ crate-type = ["cdylib"] pyo3 = "0.20.0" mzdata = "0.20.0" timsrust = "0.3.0" +mzpeaks = "0.17.0" diff --git a/src/lib.rs b/src/lib.rs index 61ee4d8..b78771b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,7 +40,7 @@ pub fn get_ms2_spectra(spectrum_path: String) -> PyResult { - parse_mzdata::read_ms2_spectra(&spectrum_path, file_type) + parse_mzdata::read_ms2_spectra(&spectrum_path) } SpectrumFileType::BrukerRaw => parse_timsrust::read_ms2_spectra(&spectrum_path), // SpectrumFileType::ThermoRaw => parse_with_mzdata_thermo(&spectrum_path, file_type), diff --git a/src/parse_mzdata.rs b/src/parse_mzdata.rs index e11c805..5717d1a 100644 --- a/src/parse_mzdata.rs +++ b/src/parse_mzdata.rs @@ -3,6 +3,7 @@ use std::fs::File; use mzdata::io::{MGFReader, MzMLReader}; use mzdata::params::ParamValue; +use mzdata::mz_read; use crate::file_types::SpectrumFileType; use crate::ms2_spectrum::MS2Spectrum; @@ -81,24 +82,12 @@ pub fn parse_precursor_info( /// Read MS2 spectra from spectrum files with mzdata pub fn read_ms2_spectra( spectrum_path: &str, - file_type: SpectrumFileType, ) -> Result, std::io::Error> { - let file = File::open(spectrum_path)?; - match file_type { - SpectrumFileType::MascotGenericFormat => Ok(MGFReader::new(file) - .map(MS2Spectrum::from) - .collect::>()), - - SpectrumFileType::MzML => Ok(MzMLReader::new(file) - .filter(|spectrum| spectrum.description.ms_level == 2) + mz_read!(spectrum_path.as_ref(), reader => { + reader.filter(|spectrum| 
spectrum.description.ms_level == 2) .map(MS2Spectrum::from) - .collect::>()), - - _ => Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "Unsupported file type for mzdata", - )), - } + .collect::>() + }) } // pub fn parse_precursor_info_thermo( From 86da407d889c37f4b3e0aa564d7040600e70d45d Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Tue, 13 Aug 2024 19:03:28 +0200 Subject: [PATCH 02/16] use mz_read to parse precursor info --- src/lib.rs | 2 +- src/parse_mzdata.rs | 28 ++++------------------------ 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b78771b..f8bc25d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,7 @@ pub fn get_precursor_info(spectrum_path: String) -> PyResult { - parse_mzdata::parse_precursor_info(&spectrum_path, file_type) + parse_mzdata::parse_precursor_info(&spectrum_path) } SpectrumFileType::BrukerRaw => parse_timsrust::parse_precursor_info(&spectrum_path), // SpectrumFileType::ThermoRaw => parse_with_mzdata_thermo(&spectrum_path, file_type), diff --git a/src/parse_mzdata.rs b/src/parse_mzdata.rs index 5717d1a..ded832d 100644 --- a/src/parse_mzdata.rs +++ b/src/parse_mzdata.rs @@ -1,11 +1,8 @@ use std::collections::HashMap; -use std::fs::File; -use mzdata::io::{MGFReader, MzMLReader}; use mzdata::params::ParamValue; use mzdata::mz_read; -use crate::file_types::SpectrumFileType; use crate::ms2_spectrum::MS2Spectrum; use crate::precursor::Precursor; @@ -51,32 +48,15 @@ impl From for MS2Spectrum { /// Parse precursor info from spectrum files with mzdata pub fn parse_precursor_info( spectrum_path: &str, - file_type: SpectrumFileType, ) -> Result, std::io::Error> { - let file = File::open(spectrum_path)?; - match file_type { - SpectrumFileType::MascotGenericFormat => Ok(MGFReader::new(file) - .filter_map(|spectrum| { - spectrum.description.precursor.as_ref()?; - Some((spectrum.description.id.clone(), Precursor::from(&spectrum))) - }) - .collect::>()), - - SpectrumFileType::MzML => 
Ok(MzMLReader::new(file) + mz_read!(spectrum_path.as_ref(), reader => { + reader.filter(|spectrum| spectrum.description.ms_level == 2) .filter_map(|spectrum| { - if spectrum.description.ms_level != 2 { - return None; - } spectrum.description.precursor.as_ref()?; Some((spectrum.description.id.clone(), Precursor::from(&spectrum))) }) - .collect::>()), - - _ => Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - "Unsupported file type for mzdata", - )), - } + .collect::>() + }) } /// Read MS2 spectra from spectrum files with mzdata From 503e0c95f8bfaf344d44785139a715e9c8c66f98 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Tue, 13 Aug 2024 19:05:23 +0200 Subject: [PATCH 03/16] use infer_from_path to determine filetype This should add support for gzipped files, MzMLb and Thermo raw files. --- src/file_types.rs | 37 +++++++++++++++++++++++-------------- src/lib.rs | 6 ++---- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/file_types.rs b/src/file_types.rs index cdcbb6e..cd51898 100644 --- a/src/file_types.rs +++ b/src/file_types.rs @@ -1,25 +1,34 @@ +use mzdata::io::MassSpectrometryFormat; + pub enum SpectrumFileType { MascotGenericFormat, MzML, + MzMLb, BrukerRaw, - // ThermoRaw, + ThermoRaw, Unknown, } pub fn match_file_type(spectrum_path: &str) -> SpectrumFileType { - let extension = spectrum_path.split('.').last().unwrap_or("").to_lowercase(); - match extension.as_str() { - "mgf" => SpectrumFileType::MascotGenericFormat, - "mzml" => SpectrumFileType::MzML, - "d" | "ms2" => SpectrumFileType::BrukerRaw, - // "raw" => SpectrumFileType::ThermoRaw, - _ => match ( - folder_contains_extension(spectrum_path, "bin"), - folder_contains_extension(spectrum_path, "parquet"), - ) { - (true, true) => SpectrumFileType::BrukerRaw, - _ => SpectrumFileType::Unknown, - }, + match mzdata::io::infer_from_path(spectrum_path).0 { + MassSpectrometryFormat::MGF => SpectrumFileType::MascotGenericFormat, + MassSpectrometryFormat::MzML => 
SpectrumFileType::MzML, + MassSpectrometryFormat::MzMLb => SpectrumFileType::MzMLb, + MassSpectrometryFormat::ThermoRaw => SpectrumFileType::ThermoRaw, + MassSpectrometryFormat::Unknown => { + let extension = spectrum_path.split('.').last().unwrap_or("").to_lowercase(); + match extension.as_str() { + "d" | "ms2" => SpectrumFileType::BrukerRaw, + _ => match ( + folder_contains_extension(spectrum_path, "bin"), + folder_contains_extension(spectrum_path, "parquet"), + ) { + (true, true) => SpectrumFileType::BrukerRaw, + _ => SpectrumFileType::Unknown, + }, + } + } + _ => SpectrumFileType::Unknown } } diff --git a/src/lib.rs b/src/lib.rs index f8bc25d..5b6205c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,11 +19,10 @@ pub fn get_precursor_info(spectrum_path: String) -> PyResult { + SpectrumFileType::MascotGenericFormat | SpectrumFileType::MzML | SpectrumFileType::MzMLb | SpectrumFileType:: ThermoRaw => { parse_mzdata::parse_precursor_info(&spectrum_path) } SpectrumFileType::BrukerRaw => parse_timsrust::parse_precursor_info(&spectrum_path), - // SpectrumFileType::ThermoRaw => parse_with_mzdata_thermo(&spectrum_path, file_type), SpectrumFileType::Unknown => return Err(PyOSError::new_err("Unsupported file type")), }; @@ -39,11 +38,10 @@ pub fn get_ms2_spectra(spectrum_path: String) -> PyResult { + SpectrumFileType::MascotGenericFormat | SpectrumFileType::MzML | SpectrumFileType::MzMLb | SpectrumFileType:: ThermoRaw => { parse_mzdata::read_ms2_spectra(&spectrum_path) } SpectrumFileType::BrukerRaw => parse_timsrust::read_ms2_spectra(&spectrum_path), - // SpectrumFileType::ThermoRaw => parse_with_mzdata_thermo(&spectrum_path, file_type), SpectrumFileType::Unknown => return Err(PyOSError::new_err("Unsupported file type")), }; From f5d34bae1f9cd40cebc021b440e9fbd7640770e7 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 13:30:03 +0200 Subject: [PATCH 04/16] output more appropriate exceptions --- src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 
deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5b6205c..1983074 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,7 +6,7 @@ mod ms2_spectrum; use std::collections::HashMap; -use pyo3::exceptions::PyOSError; +use pyo3::exceptions::{PyException, PyValueError}; use pyo3::prelude::*; use file_types::{match_file_type, SpectrumFileType}; @@ -23,12 +23,12 @@ pub fn get_precursor_info(spectrum_path: String) -> PyResult parse_timsrust::parse_precursor_info(&spectrum_path), - SpectrumFileType::Unknown => return Err(PyOSError::new_err("Unsupported file type")), + SpectrumFileType::Unknown => return Err(PyValueError::new_err("Unsupported file type")), }; match precursors { Ok(precursors) => Ok(precursors), - Err(e) => Err(PyOSError::new_err(e.to_string())), + Err(e) => Err(PyException::new_err(e.to_string())), } } @@ -42,12 +42,12 @@ pub fn get_ms2_spectra(spectrum_path: String) -> PyResult parse_timsrust::read_ms2_spectra(&spectrum_path), - SpectrumFileType::Unknown => return Err(PyOSError::new_err("Unsupported file type")), + SpectrumFileType::Unknown => return Err(PyValueError::new_err("Unsupported file type")), }; match spectra { Ok(spectra) => Ok(spectra), - Err(e) => Err(PyOSError::new_err(e.to_string())), + Err(e) => Err(PyException::new_err(e.to_string())), } } From bcca89366de9963d1d9bf026065e90b1e6af14fa Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 14:13:39 +0200 Subject: [PATCH 05/16] add function for file type check --- src/lib.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 1983074..0daf564 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,17 @@ use file_types::{match_file_type, SpectrumFileType}; use precursor::Precursor; use ms2_spectrum::MS2Spectrum; +/// Check if spectrum path matches a supported file type. 
+#[pyfunction] +pub fn is_supported_file_type(spectrum_path: String) -> bool { + let file_type = match_file_type(&spectrum_path); + + match file_type { + SpectrumFileType::Unknown => false, + _ => true + } +} + /// Get mapping of spectrum identifiers to precursor information. #[pyfunction] pub fn get_precursor_info(spectrum_path: String) -> PyResult> { @@ -57,6 +68,7 @@ pub fn get_ms2_spectra(spectrum_path: String) -> PyResult PyResult<()> { m.add_class::()?; m.add_class::()?; + m.add_function(wrap_pyfunction!(is_supported_file_type, m)?)?; m.add_function(wrap_pyfunction!(get_precursor_info, m)?)?; m.add_function(wrap_pyfunction!(get_ms2_spectra, m)?)?; Ok(()) From 4a112c7aca9837a8d428f7981c0b50cc5a65e6f4 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 14:26:27 +0200 Subject: [PATCH 06/16] bump version number --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index f34b2b6..d6ec8b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ms2rescore-rs" -version = "0.3.0" +version = "0.4.0" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html From 0625d7ba99924a2ea261272462b4620eb14cc874 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 15:33:40 +0200 Subject: [PATCH 07/16] use matches macro --- src/lib.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0daf564..0bdcc22 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,10 +18,7 @@ use ms2_spectrum::MS2Spectrum; pub fn is_supported_file_type(spectrum_path: String) -> bool { let file_type = match_file_type(&spectrum_path); - match file_type { - SpectrumFileType::Unknown => false, - _ => true - } + !matches!(file_type, SpectrumFileType::Unknown) } /// Get mapping of spectrum identifiers to precursor information. 
From d739558502ec30ec247d76e3b423a022d919167f Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 15:34:05 +0200 Subject: [PATCH 08/16] install stable rust toolchain --- .github/workflows/test.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b078715..d743c5c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,6 +16,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - name: Run Clippy run: cargo clippy --all-targets --all-features From ac8ad053fdc764fda483011d0fc13fcc17aa7e95 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 15:36:25 +0200 Subject: [PATCH 09/16] use cargo action to run clippy --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d743c5c..7d1ab9f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -25,7 +25,10 @@ jobs: override: true - name: Run Clippy - run: cargo clippy --all-targets --all-features + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --all-targets --all-features pytest: runs-on: ${{ matrix.os }} From dd4a302fd3cdf859de0c623ad4acc1d8a98abf73 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 23 Aug 2024 16:52:32 +0200 Subject: [PATCH 10/16] update mzdata and mzpeaks The main issue for a fresh build (for some reason I didn't have an issue locally before removing my target directory) seems to be mzpeaks, but there is little reason not to update both. 
--- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d6ec8b9..a2fcc74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,6 @@ crate-type = ["cdylib"] [dependencies] pyo3 = "0.20.0" -mzdata = "0.20.0" +mzdata = "0.26.0" timsrust = "0.3.0" -mzpeaks = "0.17.0" +mzpeaks = "0.19.0" From 475ec91d064a87b38d13663079e59dc40c0cc79b Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Thu, 29 Aug 2024 19:02:39 +0200 Subject: [PATCH 11/16] drop unused function --- src/parse_mzdata.rs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/parse_mzdata.rs b/src/parse_mzdata.rs index ded832d..b4247e0 100644 --- a/src/parse_mzdata.rs +++ b/src/parse_mzdata.rs @@ -70,20 +70,6 @@ pub fn read_ms2_spectra( }) } -// pub fn parse_precursor_info_thermo( -// spectrum_path: &str, -// file_type: SpectrumFileType, -// ) -> Result, std::io::Error> { -// let reader = mzdata::io::ThermoRawReader::open_path(spectrum_path)?; -// Ok(reader -// .into_iter() -// .filter(|spectrum| { -// (spectrum.description.ms_level == 2) && (spectrum.description.precursor.is_some()) -// }) -// .map(|spectrum| (spectrum.description.id, Precursor::from(spectrum))) -// .collect::>()) -// } - fn get_charge_from_spectrum(spectrum: &mzdata::spectrum::MultiLayerSpectrum) -> Option { spectrum .description From 1d17954808646bc6857e1809df99912e28d90e75 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Thu, 29 Aug 2024 19:02:53 +0200 Subject: [PATCH 12/16] add feature for thermo rawfile support Note that this feature needs to be called thermorawfilereader. mzdata's feature is called thermo, but the checks, including the one in the `mz_read` macro, test for thermorawfilereader. The feature here needs to match the one tested for in the macro, otherwise rawfiles are detected as a file type, but the macro still wouldn't support it. 
--- Cargo.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index a2fcc74..d323b72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,11 @@ edition = "2021" name = "ms2rescore_rs" crate-type = ["cdylib"] +[features] +default = ["thermorawfilereader"] + +thermorawfilereader = ["mzdata/thermo"] + [dependencies] pyo3 = "0.20.0" mzdata = "0.26.0" From 45d89cad54a99832260cf75ac0d3cdaa3fc6526d Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Wed, 18 Sep 2024 18:10:40 +0200 Subject: [PATCH 13/16] use git mzdata --- Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d323b72..4f4e18f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,5 @@ thermorawfilereader = ["mzdata/thermo"] [dependencies] pyo3 = "0.20.0" -mzdata = "0.26.0" +mzdata = {git = "https://github.com/paretje/mzdata.git", branch = "feature/pub-use"} timsrust = "0.3.0" -mzpeaks = "0.19.0" From 496b34f6dfd4b309a972a02ec821794fc457d767 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Mon, 23 Sep 2024 14:09:05 +0200 Subject: [PATCH 14/16] use mzdata 0.30 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4f4e18f..49042fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,5 +15,5 @@ thermorawfilereader = ["mzdata/thermo"] [dependencies] pyo3 = "0.20.0" -mzdata = {git = "https://github.com/paretje/mzdata.git", branch = "feature/pub-use"} +mzdata = "0.30.0" timsrust = "0.3.0" From c288562456c30964b4517a52cff0bd7f6d2737b5 Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Wed, 23 Oct 2024 19:00:42 +0200 Subject: [PATCH 15/16] update mzdata This leaves us with 1 test failure: in our test MGF, PEPMASS is preceded by CHARGE. mzdata now parses the charge, but implicitly assumes there is a PEPMASS first. When that isn't the case, there is no precursor ion to assign the charge value to, resulting in a failure of our unit test. 
--- Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 49042fb..7900ad4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,11 +9,11 @@ name = "ms2rescore_rs" crate-type = ["cdylib"] [features] -default = ["thermorawfilereader"] +default = ["thermo"] -thermorawfilereader = ["mzdata/thermo"] +thermo = ["mzdata/thermo"] [dependencies] pyo3 = "0.20.0" -mzdata = "0.30.0" +mzdata = "0.33.0" timsrust = "0.3.0" From 8df8eaa50a2c743a00b02d25faa28984814bd9fd Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Wed, 23 Oct 2024 19:05:06 +0200 Subject: [PATCH 16/16] update test mgf Work around the bug in mzdata for now, but it might be interesting, once we upgrade the version of mzdata, to add a test for this. --- tests/data/test.mgf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test.mgf b/tests/data/test.mgf index 8cacad7..a86fe1e 100644 --- a/tests/data/test.mgf +++ b/tests/data/test.mgf @@ -1,7 +1,7 @@ BEGIN IONS TITLE=peptide1 -CHARGE=2+ PEPMASS=475.137295 +CHARGE=2+ ION_MOBILITY=42.42 RTINSECONDS=51.2 72.04439 100