diff --git a/src/datatype.rs b/src/datatype.rs index f00544e8..01349a79 100644 --- a/src/datatype.rs +++ b/src/datatype.rs @@ -303,6 +303,44 @@ where } } +/// An enum to represent all different data types that can appear as +/// a value in a worksheet cell +#[derive(Debug, Clone, PartialEq, Default)] +pub enum DataTypeRef<'a> { + /// Signed integer + Int(i64), + /// Float + Float(f64), + /// String + String(String), + /// Shared String + SharedString(&'a str), + /// Boolean + Bool(bool), + /// Date or Time + DateTime(f64), + /// Error + Error(CellErrorType), + /// Empty cell + #[default] + Empty, +} + +impl<'a> From> for DataType { + fn from(value: DataTypeRef<'a>) -> Self { + match value { + DataTypeRef::Int(v) => DataType::Int(v), + DataTypeRef::Float(v) => DataType::Float(v), + DataTypeRef::String(v) => DataType::String(v), + DataTypeRef::SharedString(v) => DataType::String(v.into()), + DataTypeRef::Bool(v) => DataType::Bool(v), + DataTypeRef::DateTime(v) => DataType::DateTime(v), + DataTypeRef::Error(v) => DataType::Error(v), + DataTypeRef::Empty => DataType::Empty, + } + } +} + #[cfg(all(test, feature = "dates"))] mod tests { use super::*; diff --git a/src/lib.rs b/src/lib.rs index e6425973..46ab2c4d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,6 +73,7 @@ mod de; mod errors; pub mod vba; +use datatype::DataTypeRef; use serde::de::DeserializeOwned; use std::borrow::Cow; use std::cmp::{max, min}; @@ -217,6 +218,7 @@ where pub trait CellType: Default + Clone + PartialEq {} impl CellType for DataType {} +impl<'a> CellType for DataTypeRef<'a> {} impl CellType for String {} impl CellType for usize {} // for tests diff --git a/src/xlsx.rs b/src/xlsx.rs index 785923a6..0506e5e4 100644 --- a/src/xlsx.rs +++ b/src/xlsx.rs @@ -12,10 +12,11 @@ use quick_xml::Reader as XmlReader; use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; +use crate::datatype::DataTypeRef; use crate::vba::VbaProject; use crate::{Cell, CellErrorType, CellType, DataType, Metadata, Range, Reader, Table}; -type XlsReader<'a> = XmlReader>>; +type XlReader<'a> = XmlReader>>; /// Maximum number of rows allowed in an xlsx file pub const MAX_ROWS: u32 = 1_048_576; @@ -629,18 +630,18 @@ impl InnerTableMetadata { } } -fn worksheet( - strings: &[String], +fn worksheet<'s, T, F>( + strings: &'s [String], formats: &[CellFormat], - mut xml: XlsReader<'_>, + mut xml: XlReader<'_>, read_data: &mut F, ) -> Result, XlsxError> where T: CellType, F: FnMut( - &[String], + &'s [String], &[CellFormat], - &mut XlsReader<'_>, + &mut XlReader<'_>, &mut Vec>, ) -> Result<(), XlsxError>, { @@ -648,42 +649,54 @@ where let mut buf = Vec::new(); 'xml: loop { buf.clear(); - match xml.read_event_into(&mut buf) { - Ok(Event::Start(ref e)) => { - match e.local_name().as_ref() { - b"dimension" => { - for a in e.attributes() { - if let Attribute { - key: QName(b"ref"), - value: rdim, - } = a.map_err(XlsxError::XmlAttr)? - { - let len = get_dimension(&rdim)?.len(); - if len < 1_000_000 { - // it is unlikely to have more than that - // there may be of empty cells - cells.reserve(len as usize); - } - continue 'xml; - } + match xml.read_event_into(&mut buf).map_err(XlsxError::Xml)? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"dimension" => { + for a in e.attributes() { + if let Attribute { + key: QName(b"ref"), + value: rdim, + } = a.map_err(XlsxError::XmlAttr)? + { + let len = get_dimension(&rdim)?.len(); + cells.reserve(len as usize); + continue 'xml; } - return Err(XlsxError::UnexpectedNode("dimension")); - } - b"sheetData" => { - read_data(strings, formats, &mut xml, &mut cells)?; - break; } - _ => (), + return Err(XlsxError::UnexpectedNode("dimension")); } - } - Ok(Event::Eof) => break, - Err(e) => return Err(XlsxError::Xml(e)), + b"sheetData" => { + buf.clear(); + read_data(strings, formats, &mut xml, &mut cells)?; + break; + } + _ => (), + }, + Event::Eof => break, _ => (), } } Ok(Range::from_sparse(cells)) } +impl Xlsx { + /// Get worksheet range where shared string values are only borrowed + pub fn worksheet_range_ref<'a>( + &'a mut self, + name: &str, + ) -> Option>, XlsxError>> { + let (_, path) = self.sheets.iter().find(|&&(ref n, _)| n == name)?; + let xml = xml_reader(&mut self.zip, path); + let strings = &self.strings; + let formats = &self.formats; + xml.map(|xml| { + worksheet(strings, formats, xml?, &mut |s, f, xml, cells| { + read_sheet_data(xml, s, f, cells) + }) + }) + } +} + impl Reader for Xlsx { type Error = XlsxError; @@ -704,12 +717,13 @@ impl Reader for Xlsx { } fn vba_project(&mut self) -> Option, XlsxError>> { - self.zip.by_name("xl/vbaProject.bin").ok().map(|mut f| { - let len = f.size() as usize; + let mut f = self.zip.by_name("xl/vbaProject.bin").ok()?; + let len = f.size() as usize; + Some( VbaProject::new(&mut f, len) .map(Cow::Owned) - .map_err(XlsxError::Vba) - }) + .map_err(XlsxError::Vba), + ) } fn metadata(&self) -> &Metadata { @@ -717,25 +731,28 @@ impl Reader for Xlsx { } fn worksheet_range(&mut self, name: &str) -> Option, XlsxError>> { - let xml = match self.sheets.iter().find(|&&(ref n, _)| n == name) { - Some(&(_, ref path)) => xml_reader(&mut self.zip, path), - None => return None, - }; - let strings = &self.strings; - let formats = &self.formats; - xml.map(|xml| { - worksheet(strings, formats, xml?, &mut |s, f, xml, cells| { - read_sheet_data(xml, s, f, cells) - }) - }) + Some(self.worksheet_range_ref(name)?.map(|rge| { + let inner = rge.inner.into_iter().map(|v| v.into()).collect(); + Range { + start: rge.start, + end: rge.end, + inner, + } + })) + // let (_, path) = self.sheets.iter().find(|&&(ref n, _)| n == name)?; + // let xml = xml_reader(&mut self.zip, path); + // let strings = &self.strings; + // let formats = &self.formats; + // xml.map(|xml| { + // worksheet(strings, formats, xml?, &mut |s, f, xml, cells| { + // read_sheet_data(xml, s, f, cells) + // }) + // }) } fn worksheet_formula(&mut self, name: &str) -> Option, XlsxError>> { - let xml = match self.sheets.iter().find(|&&(ref n, _)| n == name) { - Some(&(_, ref path)) => xml_reader(&mut self.zip, path), - None => return None, - }; - + let (_, path) = self.sheets.iter().find(|&&(ref n, _)| n == name)?; + let xml = xml_reader(&mut self.zip, path); let strings = &self.strings; let formats = &self.formats; xml.map(|xml| { @@ -782,6 +799,12 @@ impl Reader for Xlsx { &mut |s, f, xml, cells| read_sheet_data(xml, s, f, cells), ) .ok()?; + let inner = range.inner.into_iter().map(|v| v.into()).collect(); + let range = Range { + start: range.start, + end: range.end, + inner, + }; Some((name, range)) }) .collect() @@ -791,7 +814,7 @@ impl Reader for Xlsx { fn xml_reader<'a, RS: Read + Seek>( zip: &'a mut ZipArchive, path: &str, -) -> Option, XlsxError>> { +) -> Option, XlsxError>> { match zip.by_name(path) { Ok(f) => { let mut r = XmlReader::from_reader(BufReader::new(f)); @@ -822,7 +845,7 @@ fn get_attribute<'a>(atts: Attributes<'a>, n: QName) -> Result, } fn read_sheet( - xml: &mut XlsReader<'_>, + xml: &mut XlReader<'_>, cells: &mut Vec>, push_cell: &mut F, ) -> Result<(), XlsxError> @@ -830,7 +853,7 @@ where T: CellType, F: FnMut( &mut Vec>, - &mut XlsReader<'_>, + &mut XlReader<'_>, &BytesStart<'_>, (u32, u32), &BytesStart<'_>, @@ -865,49 +888,46 @@ where } /// read sheetData node -fn read_sheet_data( - xml: &mut XlsReader<'_>, - strings: &[String], +fn read_sheet_data<'s>( + xml: &mut XlReader<'_>, + strings: &'s [String], formats: &[CellFormat], - cells: &mut Vec>, + cells: &mut Vec>>, ) -> Result<(), XlsxError> { /// read the contents of a cell - fn read_value<'a>( + fn read_value<'s, 'a>( v: String, - strings: &[String], + strings: &'s [String], formats: &[CellFormat], c_element: &BytesStart<'a>, - ) -> Result { - let is_date_time = match get_attribute(c_element.attributes(), QName(b"s")) { - Ok(Some(style)) => { + ) -> Result, XlsxError> { + let is_date_time = + if let Ok(Some(style)) = get_attribute(c_element.attributes(), QName(b"s")) { let id: usize = std::str::from_utf8(style).unwrap_or("0").parse()?; - match formats.get(id) { - Some(CellFormat::Date) => true, - _ => false, - } - } - _ => false, - }; + matches!(formats.get(id), Some(CellFormat::Date)) + } else { + false + }; match get_attribute(c_element.attributes(), QName(b"t"))? { Some(b"s") => { // shared string let idx: usize = v.parse()?; - Ok(DataType::String(strings[idx].clone())) + Ok(DataTypeRef::SharedString(&strings[idx])) } Some(b"b") => { // boolean - Ok(DataType::Bool(v != "0")) + Ok(DataTypeRef::Bool(v != "0")) } Some(b"e") => { // error - Ok(DataType::Error(v.parse()?)) + Ok(DataTypeRef::Error(v.parse()?)) } Some(b"d") => { // date // TODO: create a DataType::Date // currently just return as string (ISO 8601) - Ok(DataType::String(v)) + Ok(DataTypeRef::String(v)) } Some(b"str") => { // see http://officeopenxml.com/SScontentOverview.php @@ -920,36 +940,36 @@ fn read_sheet_data( // NB: the result of a formula may not be a numeric value (=A3&" "&A4). // We do try an initial parse as Float for utility, but fall back to a string // representation if that fails - v.parse().map(DataType::Float).or(Ok(DataType::String(v))) + v.parse() + .map(DataTypeRef::Float) + .or(Ok(DataTypeRef::String(v))) } Some(b"n") => { // n - number if v.is_empty() { - Ok(DataType::Empty) + Ok(DataTypeRef::Empty) } else { - v.parse() - .map(|n| { - if is_date_time { - DataType::DateTime(n) - } else { - DataType::Float(n) - } - }) - .map_err(XlsxError::ParseFloat) + let n = v.parse().map_err(XlsxError::ParseFloat)?; + if is_date_time { + Ok(DataTypeRef::DateTime(n)) + } else { + Ok(DataTypeRef::Float(n)) + } } } None => { // If type is not known, we try to parse as Float for utility, but fall back to // String if this fails. - v.parse() - .map(|n| { + Ok(v.parse().map_or_else( + |_| DataTypeRef::String(v), + |n| { if is_date_time { - DataType::DateTime(n) + DataTypeRef::DateTime(n) } else { - DataType::Float(n) + DataTypeRef::Float(n) } - }) - .or(Ok(DataType::String(v))) + }, + )) } Some(b"is") => { // this case should be handled in outer loop over cell elements, in which @@ -970,7 +990,7 @@ fn read_sheet_data( b"is" => { // inlineStr if let Some(s) = read_string(xml, e.name())? { - cells.push(Cell::new(pos, DataType::String(s))); + cells.push(Cell::new(pos, DataTypeRef::String(s))); } } b"v" => { @@ -987,7 +1007,7 @@ fn read_sheet_data( } } match read_value(v, strings, formats, c_element)? { - DataType::Empty => (), + DataTypeRef::Empty => (), v => cells.push(Cell::new(pos, v)), } } @@ -1125,10 +1145,7 @@ fn get_row_column(range: &[u8]) -> Result<(u32, u32), XlsxError> { } /// attempts to read either a simple or richtext string -fn read_string( - xml: &mut XlsReader<'_>, - QName(closing): QName, -) -> Result, XlsxError> { +fn read_string(xml: &mut XlReader<'_>, QName(closing): QName) -> Result, XlsxError> { let mut buf = Vec::new(); let mut val_buf = Vec::new(); let mut rich_buffer: Option = None;