diff --git a/entab-cli/src/lib.rs b/entab-cli/src/lib.rs index f0b76d5..99562ed 100644 --- a/entab-cli/src/lib.rs +++ b/entab-cli/src/lib.rs @@ -165,7 +165,7 @@ mod tests { run( ["entab", "--metadata"], &b">test\nACGT"[..], - io::Cursor::new(&mut out) + io::Cursor::new(&mut out), )?; assert_eq!(&out[..], b"key\tvalue\n"); Ok(()) diff --git a/entab-js/src/lib.rs b/entab-js/src/lib.rs index fdc9242..657027e 100644 --- a/entab-js/src/lib.rs +++ b/entab-js/src/lib.rs @@ -76,12 +76,8 @@ impl Reader { #[wasm_bindgen] pub fn next(&mut self) -> Result { if let Some(value) = self.reader.next_record().map_err(to_js)? { - let obj: BTreeMap<&str, Value> = self - .headers - .iter() - .map(AsRef::as_ref) - .zip(value) - .collect(); + let obj: BTreeMap<&str, Value> = + self.headers.iter().map(AsRef::as_ref).zip(value).collect(); serde_wasm_bindgen::to_value(&NextRecord { value: Some(obj), done: false, diff --git a/entab/fuzz/Cargo.lock b/entab/fuzz/Cargo.lock index 2691854..4796b11 100644 --- a/entab/fuzz/Cargo.lock +++ b/entab/fuzz/Cargo.lock @@ -28,9 +28,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bytecount" -version = "0.6.3" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bzip2" @@ -154,7 +154,7 @@ checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" [[package]] name = "entab" -version = "0.3.1" +version = "0.3.3" dependencies = [ "bytecount", "bzip2", @@ -224,9 +224,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.5.0" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" @@ -324,30 +324,28 @@ dependencies = [ [[package]] name = "zstd" -version = "0.12.3+zstd.1.5.2" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "6.0.5+zstd.1.5.4" +version = "7.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56d9e60b4b1758206c238a10165fbcae3ca37b01744e394c463463f6529d23b" +checksum = "fa556e971e7b568dc775c136fc9de8c779b1c2fc3a63defaafadffdbd3181afa" dependencies = [ - "libc", "zstd-sys", ] [[package]] name = "zstd-sys" -version = "2.0.8+zstd.1.5.5" +version = "2.0.12+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/entab/src/filetype.rs b/entab/src/filetype.rs index 6033eb1..d4db89b 100644 --- a/entab/src/filetype.rs +++ b/entab/src/filetype.rs @@ -32,6 +32,8 @@ pub enum FileType { // chemoinformatics /// Agilent format used for MS-MS trace data AgilentMsMsScan, // bin 0x01, 0x01 + /// Agilent format used for flame ionization data (array-based) + AgilentChemstationArray, /// Agilent format used for UV-visible array data AgilentChemstationDad, /// Agilent format used for flame ionization trace data @@ -123,9 +125,9 @@ impl FileType { b"\x02\x33\x31\x00" => return FileType::AgilentChemstationDad, b"\x02\x38\x31\x00" => return FileType::AgilentChemstationFid, b"\x03\x02\x00\x00" => return FileType::AgilentMasshunterDad, - b"\x03\x31\x33\x30" => return FileType::AgilentChemstationUv, + b"\x03\x31\x33\x30" => return FileType::AgilentChemstationMwd, b"\x03\x31\x33\x31" => return FileType::AgilentChemstationUv, - b"\x03\x31\x37\x39" => return FileType::AgilentChemstationUv, + b"\x03\x31\x37\x39" => return FileType::AgilentChemstationArray, b"\x28\xB5\x2F\xFD" => return FileType::Zstd, b"\x4F\x62\x6A\x01" => return FileType::ApacheAvro, b"\xFF\xD8\xFF\xDB" | b"\xFF\xD8\xFF\xE0" | b"\xFF\xD8\xFF\xE1" @@ -140,7 +142,14 @@ impl FileType { } } if magic.len() < 2 { - return FileType::Unknown(Some(magic.iter().take(8).map(|x| format!("{:x}", x)).collect::>().join(""))); + return FileType::Unknown(Some( + magic + .iter() + .take(8) + .map(|x| format!("{:x}", x)) + .collect::>() + .join(""), + )); } match &magic[..2] { [0x0F | 0x1F, 0x8B] => return FileType::Gzip, @@ -154,7 +163,12 @@ impl FileType { b">" => FileType::Fasta, b"@" => FileType::Fastq, _ => FileType::Unknown(Some( - magic.iter().take(8).map(|x| format!("{:x}", x)).collect::>().join("") + magic + .iter() + .take(8) + .map(|x| format!("{:x}", x)) + .collect::>() + .join(""), )), } } @@ -172,6 +186,7 @@ impl FileType { "cdf" => &[FileType::NetCdf], "cf" => &[FileType::ThermoCf], "ch" => &[ + FileType::AgilentChemstationArray, FileType::AgilentChemstationFid, FileType::AgilentChemstationMwd, ], @@ -213,6 +228,7 @@ impl FileType { /// If a file is unsupported, an error will be returned. pub fn to_parser_name<'a>(&self, hint: Option<&'a str>) -> Result<&'a str, EtError> { Ok(match (self, hint) { + (FileType::AgilentChemstationArray, None) => "chemstation_array", (FileType::AgilentChemstationDad, None) => "chemstation_dad", (FileType::AgilentChemstationFid, None) => "chemstation_fid", (FileType::AgilentChemstationMs, None) => "chemstation_ms", @@ -246,6 +262,7 @@ mod tests { #[test] fn test_parser_names() { let filetypes = [ + (FileType::AgilentChemstationArray, "chemstation_array"), (FileType::AgilentChemstationFid, "chemstation_fid"), (FileType::AgilentChemstationMs, "chemstation_ms"), (FileType::AgilentChemstationMwd, "chemstation_mwd"), @@ -273,8 +290,9 @@ mod tests { let unknown_type = FileType::from_magic(b"\x00\x00\x00\x00"); assert_eq!(unknown_type, FileType::Unknown(Some("0000".to_string()))); - assert_eq!(unknown_type.to_parser_name(None).unwrap_err().msg, "File starting with #0000# has no parser"); - - + assert_eq!( + unknown_type.to_parser_name(None).unwrap_err().msg, + "File starting with #0000# has no parser" + ); } } diff --git a/entab/src/parsers/agilent/chemstation.rs b/entab/src/parsers/agilent/chemstation.rs index f837275..0b5b656 100644 --- a/entab/src/parsers/agilent/chemstation.rs +++ b/entab/src/parsers/agilent/chemstation.rs @@ -1,12 +1,11 @@ use alloc::collections::BTreeMap; use alloc::str; -use alloc::string::{String, ToString}; +use alloc::string::String; use alloc::vec; use alloc::vec::Vec; use core::marker::Copy; -use chrono::NaiveDateTime; - +use crate::parsers::agilent::metadata::ChemstationMetadata; use crate::parsers::agilent::read_agilent_header; use crate::parsers::{extract, Endian, FromSlice}; use crate::record::{StateMetadata, Value}; @@ -15,177 +14,6 @@ use crate::{impl_reader, impl_record}; const CHEMSTATION_TIME_STEP: f64 = 0.2; -#[derive(Clone, Debug, Default)] -/// Metadata consistly found in Chemstation file formats -pub struct ChemstationMetadata { - /// Time the run started (minutes) - pub start_time: f64, - /// Time the ended started (minutes) - pub end_time: f64, - /// Name of the signal record (specifically used for e.g. MWD traces) - pub signal_name: String, - /// Absolute correction to be applied to all data points - pub offset_correction: f64, - /// Scaling correction to be applied to all data points - pub mult_correction: f64, - /// In what order this run was performed - pub sequence: u16, - /// The vial number this run was performed from - pub vial: u16, - /// The replicate number of this run - pub replicate: u16, - /// The name of the sample - pub sample: String, - /// The description of the sample - pub description: String, - /// The name of the operator - pub operator: String, - /// The date the sample was run - pub run_date: Option, - /// The instrument the sample was run on - pub instrument: String, - /// The method the instrument ran - pub method: String, -} - -impl<'r> From<&ChemstationMetadata> for BTreeMap> { - fn from(metadata: &ChemstationMetadata) -> Self { - let mut map = BTreeMap::new(); - drop(map.insert("start_time".to_string(), metadata.start_time.into())); - drop(map.insert("end_time".to_string(), metadata.end_time.into())); - drop(map.insert( - "signal_name".to_string(), - metadata.signal_name.clone().into(), - )); - drop(map.insert( - "offset_correction".to_string(), - metadata.offset_correction.into(), - )); - drop(map.insert( - "mult_correction".to_string(), - metadata.mult_correction.into(), - )); - drop(map.insert("sequence".to_string(), metadata.sequence.into())); - drop(map.insert("vial".to_string(), metadata.vial.into())); - drop(map.insert("replicate".to_string(), metadata.replicate.into())); - drop(map.insert("sample".to_string(), metadata.sample.clone().into())); - drop(map.insert( - "description".to_string(), - metadata.description.clone().into(), - )); - drop(map.insert("operator".to_string(), metadata.operator.clone().into())); - drop(map.insert("run_date".to_string(), metadata.run_date.into())); - drop(map.insert("instrument".to_string(), metadata.instrument.clone().into())); - drop(map.insert("method".to_string(), metadata.method.clone().into())); - map - } -} - -fn get_metadata(header: &[u8], has_signal: bool) -> Result { - if has_signal && header.len() < 652 { - return Err( - EtError::from("Chemstation header needs to be at least 648 bytes long").incomplete(), - ); - } else if !has_signal && header.len() < 512 { - return Err( - EtError::from("Chemstation header needs to be at least 512 bytes long").incomplete(), - ); - } - let start_time = f64::from(i32::extract(&header[282..], &Endian::Big)?) / 60000.; - let end_time = f64::from(i32::extract(&header[286..], &Endian::Big)?) / 60000.; - - let mut offset_correction = 0.; - let mut mult_correction = 1.; - let mut signal_name = ""; - if has_signal { - offset_correction = f64::extract(&header[636..], &Endian::Big)?; - mult_correction = f64::extract(&header[644..], &Endian::Big)?; - - let signal_name_len = usize::from(header[596]); - if signal_name_len > 40 { - return Err("Invalid signal name length".into()); - } - signal_name = str::from_utf8(&header[597..597 + signal_name_len])?.trim(); - } - - let sample_len = usize::from(header[24]); - if sample_len > 60 { - return Err("Invalid sample length".into()); - } - let sample = str::from_utf8(&header[25..25 + sample_len])? - .trim() - .to_string(); - let description_len = usize::from(header[86]); - if description_len > 60 { - return Err("Invalid sample length".into()); - } - let description = str::from_utf8(&header[87..87 + description_len])? - .trim() - .to_string(); - let operator_len = usize::from(header[148]); - if operator_len > 28 { - return Err("Invalid sample length".into()); - } - let operator = str::from_utf8(&header[149..149 + operator_len])? - .trim() - .to_string(); - let run_date_len = usize::from(header[178]); - if run_date_len > 60 { - return Err("Invalid sample length".into()); - } - // We need to detect the date format before we can convert into a - // NaiveDateTime; not sure the format even maps to the file type - // (it may be computer-dependent?) - let raw_run_date = str::from_utf8(&header[179..179 + run_date_len])?.trim(); - let run_date = if let Ok(d) = NaiveDateTime::parse_from_str(raw_run_date, "%d-%b-%y, %H:%M:%S") - { - // format in MWD - Some(d) - } else if let Ok(d) = NaiveDateTime::parse_from_str(raw_run_date, "%d %b %y %l:%M %P") { - // format in MS - Some(d) - } else if let Ok(d) = NaiveDateTime::parse_from_str(raw_run_date, "%d %b %y %l:%M %P %z") { - // format in MS with timezone - Some(d) - } else if let Ok(d) = NaiveDateTime::parse_from_str(raw_run_date, "%m/%d/%y %I:%M:%S %p") { - // format in FID - Some(d) - } else { - None - }; - - let instrument_len = usize::from(header[208]); - let instrument = str::from_utf8(&header[209..209 + instrument_len])? - .trim() - .to_string(); - let method_len = usize::from(header[228]); - let method = str::from_utf8(&header[229..229 + method_len])? - .trim() - .to_string(); - - // not sure how robust the following are - let sequence = u16::extract(&header[252..], &Endian::Big)?; - let vial = u16::extract(&header[254..], &Endian::Big)?; - let replicate = u16::extract(&header[256..], &Endian::Big)?; - - Ok(ChemstationMetadata { - start_time, - end_time, - signal_name: signal_name.to_string(), - offset_correction, - mult_correction, - sequence, - vial, - replicate, - sample, - description, - operator, - run_date, - instrument, - method, - }) -} - #[derive(Clone, Debug, Default)] /// Internal state for the `ChemstationFidRecord` parser pub struct ChemstationFidState { @@ -220,7 +48,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationFidState { } fn get(&mut self, rb: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(rb, true)?; + let metadata = ChemstationMetadata::from_header(rb)?; // offset the current time back one step so it'll be right after the first time that parse self.cur_time = metadata.start_time - CHEMSTATION_TIME_STEP; self.cur_intensity = 0.; @@ -319,7 +147,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMsState { } fn get(&mut self, buffer: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(buffer, true)?; + let metadata = ChemstationMetadata::from_header(buffer)?; let n_scans = u32::extract(&buffer[278..], &Endian::Big)? as usize; self.n_scans_left = n_scans; @@ -437,7 +265,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationMwdState { } fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(buf, true)?; + let metadata = ChemstationMetadata::from_header(buf)?; self.n_wvs_left = 0; // offset the current time back one step so it'll be right after the first time that parse @@ -557,7 +385,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationDadState { } fn get(&mut self, buf: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { - let metadata = get_metadata(buf, false)?; + let metadata = ChemstationMetadata::from_header(buf)?; let n_scans = u32::extract(&buf[278..], &Endian::Big)? as usize; self.n_scans_left = n_scans; diff --git a/entab/src/parsers/agilent/chemstation_new.rs b/entab/src/parsers/agilent/chemstation_new.rs index db9f231..cf5ee0e 100644 --- a/entab/src/parsers/agilent/chemstation_new.rs +++ b/entab/src/parsers/agilent/chemstation_new.rs @@ -1,110 +1,21 @@ use alloc::collections::BTreeMap; use alloc::str; -use alloc::string::{String, ToString}; +use alloc::string::String; use alloc::vec; use alloc::vec::Vec; -use core::char::{decode_utf16, REPLACEMENT_CHARACTER}; use core::marker::Copy; -use chrono::NaiveDateTime; - +use crate::parsers::agilent::metadata::ChemstationMetadata; use crate::parsers::agilent::read_agilent_header; use crate::parsers::{extract, Endian, FromSlice}; use crate::record::{StateMetadata, Value}; use crate::EtError; use crate::{impl_reader, impl_record}; -#[derive(Clone, Debug, Default)] -/// Metadata consistly found in new Chemstation file formats -pub struct ChemstationNewMetadata { - /// Scaling correction to be applied to all data points - pub mult_correction: f64, - /// The name of the sample - pub sample: String, - /// The name of the operator - pub operator: String, - /// The date the sample was run - pub run_date: Option, - /// The instrument the sample was run on - pub instrument: String, - /// The method the instrument ran - pub method: String, -} - -impl<'r> From<&ChemstationNewMetadata> for BTreeMap> { - fn from(metadata: &ChemstationNewMetadata) -> Self { - let mut map = BTreeMap::new(); - drop(map.insert( - "mult_correction".to_string(), - metadata.mult_correction.into(), - )); - drop(map.insert("sample".to_string(), metadata.sample.clone().into())); - drop(map.insert("operator".to_string(), metadata.operator.clone().into())); - drop(map.insert("run_date".to_string(), metadata.run_date.into())); - drop(map.insert("instrument".to_string(), metadata.instrument.clone().into())); - drop(map.insert("method".to_string(), metadata.method.clone().into())); - map - } -} - -fn get_utf16_pascal(data: &[u8]) -> String { - let iter = (1..=2 * usize::from(data[0])) - .step_by(2) - .map(|i| u16::from_le_bytes([data[i], data[i + 1]])); - decode_utf16(iter) - .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) - .collect::() -} - -fn get_new_metadata(header: &[u8]) -> Result { - if header.len() < 4000 { - return Err( - EtError::from("New chemstation header needs to be at least 4000 bytes long") - .incomplete(), - ); - } - // Also, @ 3093 - Units? - let sample = get_utf16_pascal(&header[858..]); - let operator = get_utf16_pascal(&header[1880..]); - let instrument = get_utf16_pascal(&header[2492..]); - let method = get_utf16_pascal(&header[2574..]); - let mult_correction = f64::extract(&header[3085..3093], &Endian::Big)?; - - // We need to detect the date format before we can convert into a - // NaiveDateTime; not sure the format even maps to the file type - // (it may be computer-dependent?) - let raw_run_date = get_utf16_pascal(&header[2391..]); - let run_date = if let Ok(d) = NaiveDateTime::parse_from_str(&raw_run_date, "%d-%b-%y, %H:%M:%S") - { - // format in MWD - Some(d) - } else if let Ok(d) = NaiveDateTime::parse_from_str(&raw_run_date, "%d %b %y %l:%M %P") { - // format in MS - Some(d) - } else if let Ok(d) = NaiveDateTime::parse_from_str(&raw_run_date, "%d %b %y %l:%M %P %z") { - // format in MS with timezone - Some(d) - } else if let Ok(d) = NaiveDateTime::parse_from_str(&raw_run_date, "%m/%d/%y %I:%M:%S %p") { - // format in FID - Some(d) - } else { - None - }; - - Ok(ChemstationNewMetadata { - mult_correction, - sample, - operator, - run_date, - instrument, - method, - }) -} - #[derive(Clone, Debug, Default)] /// Internal state for the `ChemstationUvRecord` parser pub struct ChemstationUvState { - metadata: ChemstationNewMetadata, + metadata: ChemstationMetadata, n_scans_left: usize, n_wvs_left: usize, cur_time: f64, @@ -139,7 +50,7 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationUvState { fn get(&mut self, rb: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { let n_scans = u32::extract(&rb[278..], &Endian::Big)? as usize; - self.metadata = get_new_metadata(rb)?; + self.metadata = ChemstationMetadata::from_header(rb)?; self.n_scans_left = n_scans; self.n_wvs_left = 0; self.cur_time = 0.; @@ -179,9 +90,9 @@ impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationUvRecord { let con = &mut 0; // refill case let mut n_wvs_left = state.n_wvs_left; + // if n_wvs_left == 0 { let _ = extract::<&[u8]>(rb, con, &mut 4)?; // 67, 624/224 - // let next_pos = usize::from(rb.extract::(Endian::Little)?); state.cur_time = f64::from(extract::(rb, con, &mut Endian::Little)?) / 60000.; let wv_start: u16 = extract(rb, con, &mut Endian::Little)?; let wv_end: u16 = extract(rb, con, &mut Endian::Little)?; @@ -232,6 +143,135 @@ impl_reader!( () ); +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +/// The type of the records in the array. +pub enum ChemstationArrayRecordType { + #[default] + /// All of the values are stored as f32 + Float32Array, + /// All of the values are stored as f64 + Float64Array, +} + +#[derive(Clone, Debug, Default)] +/// Internal state for the `ChemstationArrayRecord` parser +pub struct ChemstationArrayState { + metadata: ChemstationMetadata, + record_type: ChemstationArrayRecordType, + n_scans_left: usize, + cur_time: f64, + time_step: f64, +} + +impl StateMetadata for ChemstationArrayState { + fn metadata(&self) -> BTreeMap { + (&self.metadata).into() + } + + fn header(&self) -> Vec<&str> { + vec!["time", "intensity"] + } +} + +impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationArrayState { + type State = (); + + fn parse( + rb: &[u8], + _eof: bool, + consumed: &mut usize, + _state: &mut Self::State, + ) -> Result { + *consumed += read_agilent_header(rb, false)?; + Ok(true) + } + + fn get(&mut self, rb: &'b [u8], _state: &'s Self::State) -> Result<(), EtError> { + self.metadata = ChemstationMetadata::from_header(rb)?; + + let record_type = if &rb[348..352] == b"G\x00C\x00" + || &rb[3090..3104] == b"M\x00u\x00s\x00t\x00a\x00n\x00g\x00" + { + ChemstationArrayRecordType::Float64Array + } else { + ChemstationArrayRecordType::Float32Array + }; + + let tstep_num = u16::extract(&rb[4122..], &Endian::Big)? as f64; + let tstep_denom = u16::extract(&rb[4124..], &Endian::Big)? as f64; + let tstep = (tstep_num / tstep_denom) / 60.; + + // The file from issue #42 has 12000 scans, but the field at 278 only says 197? + // The other file I have is correct so maybe that's corrupt, but we're using + // the time step to figure this out for now. + // let n_scans = u32::extract(&rb[278..], &Endian::Big)? as usize; + let n_scans = 1 + ((self.metadata.end_time - self.metadata.start_time) / tstep) as usize; + + self.n_scans_left = dbg!(n_scans); + self.record_type = dbg!(record_type); + self.cur_time = self.metadata.start_time; + self.time_step = dbg!(tstep); + Ok(()) + } +} + +#[derive(Clone, Copy, Debug, Default)] +/// A record from a Chemstation UV file +pub struct ChemstationArrayRecord { + /// The time recorded at + pub time: f64, + /// The intensity recorded + pub intensity: f64, +} + +impl_record!(ChemstationArrayRecord: time, intensity); + +impl<'b: 's, 's> FromSlice<'b, 's> for ChemstationArrayRecord { + type State = ChemstationArrayState; + + fn parse( + _rb: &[u8], + _eof: bool, + consumed: &mut usize, + state: &mut Self::State, + ) -> Result { + if state.n_scans_left == 0 { + return Ok(false); + } + *consumed += match state.record_type { + ChemstationArrayRecordType::Float32Array => 4, + ChemstationArrayRecordType::Float64Array => 8, + }; + state.n_scans_left -= 1; + state.cur_time += state.time_step; + Ok(true) + } + + fn get(&mut self, rb: &'b [u8], state: &'s Self::State) -> Result<(), EtError> { + let con = &mut 0; + let intensity = match state.record_type { + ChemstationArrayRecordType::Float32Array => { + extract::(rb, con, &mut Endian::Little)? as f64 + } + ChemstationArrayRecordType::Float64Array => { + extract::(rb, con, &mut Endian::Little)? + } + }; + + self.time = state.cur_time; + self.intensity = intensity * state.metadata.mult_correction; + Ok(()) + } +} + +impl_reader!( + ChemstationArrayReader, + ChemstationArrayRecord, + ChemstationArrayRecord, + ChemstationArrayState, + () +); + #[cfg(test)] mod tests { use super::*; @@ -269,4 +309,23 @@ mod tests { assert_eq!(n_mzs, 6744 * 301); Ok(()) } + + #[test] + fn test_array_chemstation_reader() -> Result<(), EtError> { + let data: &[u8] = include_bytes!("../../../tests/data/test_179_fid.ch"); + let mut reader = ChemstationArrayReader::new(data, None)?; + let _ = reader.metadata(); + assert_eq!(reader.headers(), ["time", "intensity"]); + + let ChemstationArrayRecord { time, intensity } = dbg!(reader.next()?.unwrap()); + assert!((time - 0.00166095).abs() < 0.000001); + assert_eq!(intensity, 7.7457031249999995); + + let mut n_mzs = 1; + while reader.next()?.is_some() { + n_mzs += 1; + } + assert_eq!(n_mzs, 12000); + Ok(()) + } } diff --git a/entab/src/parsers/agilent/metadata.rs b/entab/src/parsers/agilent/metadata.rs new file mode 100644 index 0000000..928c917 --- /dev/null +++ b/entab/src/parsers/agilent/metadata.rs @@ -0,0 +1,248 @@ +use alloc::collections::BTreeMap; +use alloc::str; +use alloc::string::{String, ToString}; +use core::char::{decode_utf16, REPLACEMENT_CHARACTER}; + +use chrono::NaiveDateTime; + +use crate::parsers::{Endian, FromSlice}; +use crate::record::Value; +use crate::EtError; + +#[derive(Clone, Debug, Default)] +/// Metadata consistly found in Chemstation file formats +pub struct ChemstationMetadata { + /// The time the run started collecting at in minutes + pub start_time: f64, + /// The time the run stopped collecting at in minutes + pub end_time: f64, + /// Name of the signal record (specifically used for e.g. MWD traces) + pub signal_name: String, + /// Absolute correction to be applied to all data points + pub offset_correction: f64, + /// Scaling correction to be applied to all data points + pub mult_correction: f64, + /// In what order this run was performed + pub sequence: u16, + /// The vial number this run was performed from + pub vial: u16, + /// The replicate number of this run + pub replicate: u16, + /// The name of the sample + pub sample: String, + /// The description of the sample + pub description: String, + /// The name of the operator + pub operator: String, + /// The date the sample was run + pub run_date: Option, + /// The instrument the sample was run on + pub instrument: String, + /// The method the instrument ran + pub method: String, + /// The units of the y scale. + pub y_units: String, +} + +impl ChemstationMetadata { + /// Parse the header to extract the metadata + pub fn from_header(header: &[u8]) -> Result { + if header.len() < 256 { + return Err(EtError::from( + "All Chemstation header needs to be at least 256 bytes long", + ) + .incomplete()); + } + let version = u32::extract(&header[248..], &Endian::Big)?; + + let required_length = match version { + 2 | 102 => 512, + 30 | 31 | 81 => 652, + 131 => 4000, + 130 | 179 => 4800, + _ => usize::MAX, + }; + if header.len() < required_length { + return Err(EtError::from(format!( + "Chemstation {} header needs to be at least {} bytes long", + version, required_length + )) + .incomplete()); + } + + // 258..260 - 0 or 1 + // 260..264 - 0 or large int (/60000?) + // 254..268 - 9 or 13 + // only in 179 and 130 + // 290..294 - 63429.0 - f32 / 930051 - i32 + // 294..298 - 0 / -22385 + // 298..302 - repeat of 290 + // 302..306 - repeat of 294 + + // There's another data section at 4100 that + // has duplicates of some of these values? + + let sequence = u16::extract(&header[252..], &Endian::Big)?; + let vial = u16::extract(&header[254..], &Endian::Big)?; + let replicate = u16::extract(&header[256..], &Endian::Big)?; + + let sample = match version { + 0..=102 => get_pascal(&header[24..24 + 60], "sample")?, + _ => get_utf16_pascal(&header[858..]), + }; + let description = match version { + 0..=102 => get_pascal(&header[86..86 + 60], "description")?, + _ => "".to_string(), + }; + let operator = match version { + 0..=102 => get_pascal(&header[148..148 + 28], "operator")?, + _ => get_utf16_pascal(&header[1880..]), + }; + let instrument = match version { + 0..=102 => get_pascal(&header[208..228], "instrument")?, + _ => get_utf16_pascal(&header[2492..]), + }; + let method = match version { + 0..=102 => get_pascal(&header[228..], "method")?, + _ => get_utf16_pascal(&header[2574..]), + }; + + let signal_name = match version { + 30 | 31 | 81 => get_pascal(&header[596..596 + 40], "signal_name")?, + 130 | 179 => get_utf16_pascal(&header[4213..]), + _ => "".to_string(), + }; + + let offset_correction = match version { + 30 | 31 | 81 => f64::extract(&header[636..], &Endian::Big)?, + _ => 0., + }; + let mult_correction = match version { + 30 | 31 | 81 => f64::extract(&header[644..], &Endian::Big)?, + 131 => f64::extract(&header[3085..3093], &Endian::Big)?, + 130 | 179 => f64::extract(&header[4732..4770], &Endian::Big)?, + _ => 1., + }; + let start_time = match version { + 2 | 30 | 31 | 81 | 102 | 130 | 131 => { + i32::extract(&header[282..], &Endian::Big)? as f64 / 60000. + } + 179 => f32::extract(&header[282..], &Endian::Big)? as f64 / 60000., + _ => 0., + }; + let end_time = match version { + 2 | 30 | 31 | 81 | 102 | 130 | 131 => { + i32::extract(&header[286..], &Endian::Big)? as f64 / 60000. + } + 179 => f32::extract(&header[286..], &Endian::Big)? as f64 / 60000., + _ => 0., + }; + let y_units = match version { + 81 => get_pascal(&header[244..244 + 64], "y_units")?, + 131 => get_utf16_pascal(&header[3093..]), + 130 | 179 => get_utf16_pascal(&header[4172..]), + _ => "".to_string(), + }; + + // We need to detect the date format before we can convert into a + // NaiveDateTime; not sure the format even maps to the file type + // (it may be computer-dependent?) + let raw_run_date = match version { + 0..=102 => get_pascal(&header[178..178 + 60], "run_date")?, + 130 | 131 | 179 => get_utf16_pascal(&header[2391..]), + _ => "".to_string(), + }; + let run_date = if let Ok(d) = + NaiveDateTime::parse_from_str(raw_run_date.as_ref(), "%d-%b-%y, %H:%M:%S") + { + // format in MWD + Some(d) + } else if let Ok(d) = + NaiveDateTime::parse_from_str(raw_run_date.as_ref(), "%d %b %y %l:%M %P") + { + // format in MS + Some(d) + } else if let Ok(d) = + NaiveDateTime::parse_from_str(raw_run_date.as_ref(), "%d %b %y %l:%M %P %z") + { + // format in MS with timezone + Some(d) + } else if let Ok(d) = + NaiveDateTime::parse_from_str(raw_run_date.as_ref(), "%m/%d/%y %I:%M:%S %p") + { + // format in FID + Some(d) + } else { + None + }; + + Ok(Self { + start_time, + end_time, + signal_name, + offset_correction, + mult_correction, + sequence, + vial, + replicate, + sample, + description, + operator, + run_date, + instrument, + method, + y_units, + }) + } +} + +impl<'r> From<&ChemstationMetadata> for BTreeMap> { + fn from(metadata: &ChemstationMetadata) -> Self { + let mut map = BTreeMap::new(); + drop(map.insert("start_time".to_string(), metadata.start_time.into())); + drop(map.insert("end_time".to_string(), metadata.end_time.into())); + drop(map.insert( + "signal_name".to_string(), + metadata.signal_name.clone().into(), + )); + drop(map.insert( + "offset_correction".to_string(), + metadata.offset_correction.into(), + )); + drop(map.insert( + "mult_correction".to_string(), + metadata.mult_correction.into(), + )); + drop(map.insert("sequence".to_string(), metadata.sequence.into())); + drop(map.insert("vial".to_string(), metadata.vial.into())); + drop(map.insert("replicate".to_string(), metadata.replicate.into())); + drop(map.insert("sample".to_string(), metadata.sample.clone().into())); + drop(map.insert( + "description".to_string(), + metadata.description.clone().into(), + )); + drop(map.insert("operator".to_string(), metadata.operator.clone().into())); + drop(map.insert("run_date".to_string(), metadata.run_date.into())); + drop(map.insert("instrument".to_string(), metadata.instrument.clone().into())); + drop(map.insert("method".to_string(), metadata.method.clone().into())); + drop(map.insert("y_units".to_string(), metadata.y_units.clone().into())); + map + } +} + +fn get_utf16_pascal(data: &[u8]) -> String { + let iter = (1..=2 * usize::from(data[0])) + .step_by(2) + .map(|i| u16::from_le_bytes([data[i], data[i + 1]])); + decode_utf16(iter) + .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) + .collect::() +} + +fn get_pascal(data: &[u8], field_name: &'static str) -> Result { + let string_len = usize::from(data[0]); + if string_len > data.len() { + return Err(EtError::from(format!("Invalid {} length", field_name)).incomplete()); + } + Ok(str::from_utf8(&data[1..1 + string_len])?.trim().to_string()) +} diff --git a/entab/src/parsers/agilent/mod.rs b/entab/src/parsers/agilent/mod.rs index 3af3637..129142c 100644 --- a/entab/src/parsers/agilent/mod.rs +++ b/entab/src/parsers/agilent/mod.rs @@ -8,6 +8,8 @@ pub mod chemstation_new; /// Readers for formats generated by the GC/LC control software Masshunter #[cfg(feature = "std")] pub mod masshunter; +/// Read the common metadata format at the top of Chemstation files +pub mod metadata; use crate::error::EtError; use crate::parsers::common::Skip; diff --git a/entab/src/parsers/flow.rs b/entab/src/parsers/flow.rs index f384386..9d53fc9 100644 --- a/entab/src/parsers/flow.rs +++ b/entab/src/parsers/flow.rs @@ -457,7 +457,11 @@ mod tests { assert_eq!(metadata["specimen_source"], "Specimen_001".into()); assert_eq!( metadata["date"], - NaiveDate::from_ymd_opt(2012, 10, 26).unwrap().and_hms_opt(18, 8, 10).unwrap().into() + NaiveDate::from_ymd_opt(2012, 10, 26) + .unwrap() + .and_hms_opt(18, 8, 10) + .unwrap() + .into() ); Ok(()) } diff --git a/entab/src/readers.rs b/entab/src/readers.rs index 9148b9f..d699f27 100644 --- a/entab/src/readers.rs +++ b/entab/src/readers.rs @@ -41,6 +41,9 @@ fn _get_reader<'n, 'p, 'r>( ) -> Result<(Box, &'n str), EtError> { let reader: Box = match parser_name { "bam" => Box::new(parsers::sam::BamReader::new(rb, None)?), + "chemstation_array" => Box::new(parsers::agilent::chemstation_new::ChemstationArrayReader::new( + rb, None, + )?), "chemstation_dad" => Box::new(parsers::agilent::chemstation::ChemstationDadReader::new( rb, None, )?), diff --git a/entab/tests/DATA_SOURCES.txt b/entab/tests/DATA_SOURCES.txt index be5049c..de74fd2 100644 --- a/entab/tests/DATA_SOURCES.txt +++ b/entab/tests/DATA_SOURCES.txt @@ -13,5 +13,6 @@ test-0000.cf, collected by Roderick, test.bam, generated from test.sam, test.fastq, downloaded from NCBI, test_fid.ch, collected by Roderick, +test_179_fid.ch, from issue #32 test.sam, generated from aligning sequence.fasta against test.fastq, small.RAW, https://github.com/galaxyproteomics/tools-galaxyp/blob/master/tools/msconvert/test-data/small.RAW, CC0 diff --git a/entab/tests/data/test_179_fid.ch b/entab/tests/data/test_179_fid.ch new file mode 100644 index 0000000..8c235de Binary files /dev/null and b/entab/tests/data/test_179_fid.ch differ