Skip to content

Commit

Permalink
Merge pull request #453 from PrettyWood/feat/keep-first-empty-rows
Browse files Browse the repository at this point in the history
feat: add option to set header row
  • Loading branch information
tafia authored Oct 8, 2024
2 parents 06a1093 + b6f91e0 commit 8efe95d
Show file tree
Hide file tree
Showing 10 changed files with 439 additions and 51 deletions.
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,26 @@ if let Some(Ok(r)) = excel.worksheet_range("Sheet1") {
}
```

### Reader: With header row

```rs
use calamine::{HeaderRow, Reader, Xlsx, open_workbook};

let mut excel: Xlsx<_> = open_workbook("file.xlsx").unwrap();

let sheet1 = excel
.with_header_row(HeaderRow::Row(3))
.worksheet_range("Sheet1")
.unwrap();
```

Note that `xlsx` and `xlsb` files support lazy loading, so specifying a
header row takes effect immediately when reading a sheet range.
In contrast, for `xls` and `ods` files, all sheets are loaded at once when
opening the workbook with default settings.
As a result, setting the header row only applies afterward and does not
provide any performance benefits.

### Reader: More complex

Let's assume
Expand Down Expand Up @@ -190,7 +210,7 @@ The programs are all structured to follow the same constructs:
use calamine::{open_workbook, Reader, Xlsx};

fn main() {
// Open workbook
// Open workbook
let mut excel: Xlsx<_> =
open_workbook("NYC_311_SR_2010-2020-sample-1M.xlsx").expect("failed to find file");

Expand Down
36 changes: 27 additions & 9 deletions src/auto.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
use crate::errors::Error;
use crate::vba::VbaProject;
use crate::{
open_workbook, open_workbook_from_rs, Data, DataRef, Metadata, Ods, Range, Reader, ReaderRef,
Xls, Xlsb, Xlsx,
open_workbook, open_workbook_from_rs, Data, DataRef, HeaderRow, Metadata, Ods, Range, Reader,
ReaderRef, Xls, Xlsb, Xlsx,
};
use std::borrow::Cow;
use std::fs::File;
Expand Down Expand Up @@ -85,9 +85,27 @@ where
Err(Error::Msg("Sheets must be created from a Path"))
}

/// Sets the header row on whichever concrete reader this `Sheets` wraps.
///
/// The value is forwarded unchanged to the inner reader's own
/// `with_header_row`, and `self` is returned so that calls can be chained.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
    // Every arm is a statement: the inner readers each return a reference to
    // their own type, which is deliberately discarded here.
    match self {
        Sheets::Xls(xls) => {
            xls.with_header_row(header_row);
        }
        Sheets::Xlsx(xlsx) => {
            xlsx.with_header_row(header_row);
        }
        Sheets::Xlsb(xlsb) => {
            xlsb.with_header_row(header_row);
        }
        Sheets::Ods(ods) => {
            ods.with_header_row(header_row);
        }
    }
    self
}

/// Gets `VbaProject`
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, Self::Error>> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xls)),
Sheets::Xlsx(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xlsx)),
Sheets::Xlsb(ref mut e) => e.vba_project().map(|vba| vba.map_err(Error::Xlsb)),
Expand All @@ -97,7 +115,7 @@ where

/// Initialize
fn metadata(&self) -> &Metadata {
match *self {
match self {
Sheets::Xls(ref e) => e.metadata(),
Sheets::Xlsx(ref e) => e.metadata(),
Sheets::Xlsb(ref e) => e.metadata(),
Expand All @@ -107,7 +125,7 @@ where

/// Read worksheet data in corresponding worksheet path
fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, Self::Error> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.worksheet_range(name).map_err(Error::Xls),
Sheets::Xlsx(ref mut e) => e.worksheet_range(name).map_err(Error::Xlsx),
Sheets::Xlsb(ref mut e) => e.worksheet_range(name).map_err(Error::Xlsb),
Expand All @@ -117,7 +135,7 @@ where

/// Read worksheet formula in corresponding worksheet path
fn worksheet_formula(&mut self, name: &str) -> Result<Range<String>, Self::Error> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.worksheet_formula(name).map_err(Error::Xls),
Sheets::Xlsx(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsx),
Sheets::Xlsb(ref mut e) => e.worksheet_formula(name).map_err(Error::Xlsb),
Expand All @@ -126,7 +144,7 @@ where
}

fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
match *self {
match self {
Sheets::Xls(ref mut e) => e.worksheets(),
Sheets::Xlsx(ref mut e) => e.worksheets(),
Sheets::Xlsb(ref mut e) => e.worksheets(),
Expand All @@ -136,7 +154,7 @@ where

#[cfg(feature = "picture")]
fn pictures(&self) -> Option<Vec<(String, Vec<u8>)>> {
match *self {
match self {
Sheets::Xls(ref e) => e.pictures(),
Sheets::Xlsx(ref e) => e.pictures(),
Sheets::Xlsb(ref e) => e.pictures(),
Expand All @@ -153,7 +171,7 @@ where
&'a mut self,
name: &str,
) -> Result<Range<DataRef<'a>>, Self::Error> {
match *self {
match self {
Sheets::Xlsx(ref mut e) => e.worksheet_range_ref(name).map_err(Error::Xlsx),
Sheets::Xlsb(ref mut e) => e.worksheet_range_ref(name).map_err(Error::Xlsb),
Sheets::Xls(_) => unimplemented!(),
Expand Down
21 changes: 21 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,23 @@ pub struct Sheet {
pub visible: SheetVisible,
}

/// Row to use as header.
///
/// By default, the first non-empty row is used as header.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum HeaderRow {
    /// First non-empty row
    #[default]
    FirstNonEmptyRow,
    /// Index of the header row
    Row(u32),
}

// FIXME `Reader` must only be seek `Seek` for `Xls::xls`. Because of the present API this limits
// the kinds of readers (other) data in formats can be read from.
/// A trait to share spreadsheets reader functions across different `FileType`s
Expand All @@ -228,6 +245,10 @@ where
/// Creates a new instance.
fn new(reader: RS) -> Result<Self, Self::Error>;

/// Set header row (i.e. first row to be read)
///
/// If `header_row` is `HeaderRow::FirstNonEmptyRow` (the default), the first
/// non-empty row will be used as header row.
/// Returns `self` so that calls can be chained.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self;

/// Gets `VbaProject`
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, Self::Error>>;

Expand Down
37 changes: 33 additions & 4 deletions src/ods.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use zip::read::{ZipArchive, ZipFile};
use zip::result::ZipError;

use crate::vba::VbaProject;
use crate::{Data, DataType, Metadata, Range, Reader, Sheet, SheetType, SheetVisible};
use crate::{Data, DataType, HeaderRow, Metadata, Range, Reader, Sheet, SheetType, SheetVisible};
use std::marker::PhantomData;

const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet";
Expand Down Expand Up @@ -62,6 +62,13 @@ pub enum OdsError {
WorksheetNotFound(String),
}

/// Ods reader options
#[derive(Debug, Default)]
#[non_exhaustive]
struct OdsOptions {
/// Row to use as header; defaults to `HeaderRow::default()`
pub header_row: HeaderRow,
}

from_err!(std::io::Error, OdsError, Io);
from_err!(zip::result::ZipError, OdsError, Zip);
from_err!(quick_xml::Error, OdsError, Xml);
Expand Down Expand Up @@ -116,6 +123,8 @@ pub struct Ods<RS> {
marker: PhantomData<RS>,
#[cfg(feature = "picture")]
pictures: Option<Vec<(String, Vec<u8>)>>,
/// Reader options
options: OdsOptions,
}

impl<RS> Reader<RS> for Ods<RS>
Expand Down Expand Up @@ -161,9 +170,15 @@ where
sheets,
#[cfg(feature = "picture")]
pictures,
options: OdsOptions::default(),
})
}

/// Stores `header_row` in the reader options and returns `self` for chaining.
///
/// The stored value is consulted by `worksheet_range` when it builds the
/// range it returns.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
self.options.header_row = header_row;
self
}

/// Gets `VbaProject`
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, OdsError>> {
None
Expand All @@ -176,10 +191,24 @@ where

/// Read worksheet data in corresponding worksheet path
///
/// Fails with `OdsError::WorksheetNotFound` when no sheet is stored under `name`.
/// An owned copy of the stored range is produced; with `HeaderRow::Row`, that
/// copy is then re-sliced so that it starts at the requested row.
fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, OdsError> {
self.sheets
let sheet = self
.sheets
.get(name)
.ok_or_else(|| OdsError::WorksheetNotFound(name.into()))
.map(|r| r.0.to_owned())
.ok_or_else(|| OdsError::WorksheetNotFound(name.into()))?
.0
.to_owned();

match self.options.header_row {
// Default behaviour: hand back the sheet range as stored
HeaderRow::FirstNonEmptyRow => Ok(sheet),
HeaderRow::Row(header_row_idx) => {
// If `header_row` is a row index, adjust the range: keep the start
// column and the end position, replace only the start row.
// NOTE(review): `header_row_idx` is not checked against `end` here;
// confirm how `Range::range` behaves when the index lies past the last row.
if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) {
Ok(sheet.range((header_row_idx, start.1), end))
} else {
// No start/end position available: return the sheet unchanged
Ok(sheet)
}
}
}
}

fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
Expand Down
27 changes: 24 additions & 3 deletions src/xls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ use crate::utils::read_usize;
use crate::utils::{push_column, read_f64, read_i16, read_i32, read_u16, read_u32};
use crate::vba::VbaProject;
use crate::{
Cell, CellErrorType, Data, Dimensions, Metadata, Range, Reader, Sheet, SheetType, SheetVisible,
Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, Sheet, SheetType,
SheetVisible,
};

#[derive(Debug)]
Expand Down Expand Up @@ -136,6 +137,8 @@ pub struct XlsOptions {
///
/// [code page]: https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
pub force_codepage: Option<u16>,
/// Row to use as header
pub header_row: HeaderRow,
}

struct SheetData {
Expand Down Expand Up @@ -231,6 +234,11 @@ impl<RS: Read + Seek> Reader<RS> for Xls<RS> {
Self::new_with_options(reader, XlsOptions::default())
}

/// Stores `header_row` in the reader options and returns `self` for chaining.
///
/// The stored value is consulted by `worksheet_range` when it builds the
/// range it returns.
fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
self.options.header_row = header_row;
self
}

/// Gets the `VbaProject` held by this reader, if there is one.
///
/// Returns `None` when no VBA project is stored; otherwise the project is
/// lent out as `Cow::Borrowed` and always wrapped in `Ok`.
fn vba_project(&mut self) -> Option<Result<Cow<'_, VbaProject>, XlsError>> {
    // Bail out early when there is nothing to borrow
    let project = self.vba.as_ref()?;
    Some(Ok(Cow::Borrowed(project)))
}
Expand All @@ -241,10 +249,23 @@ impl<RS: Read + Seek> Reader<RS> for Xls<RS> {
}

/// Read worksheet data for the sheet named `name`.
///
/// Fails with `XlsError::WorksheetNotFound` when no sheet is stored under `name`.
/// The stored range is cloned; with `HeaderRow::Row`, that clone is then
/// re-sliced so that it starts at the requested row.
fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, XlsError> {
self.sheets
let sheet = self
.sheets
.get(name)
.map(|r| r.range.clone())
.ok_or_else(|| XlsError::WorksheetNotFound(name.into()))
.ok_or_else(|| XlsError::WorksheetNotFound(name.into()))?;

match self.options.header_row {
// Default behaviour: hand back the sheet range as stored
HeaderRow::FirstNonEmptyRow => Ok(sheet),
HeaderRow::Row(header_row_idx) => {
// If `header_row` is a row index, adjust the range: keep the start
// column and the end position, replace only the start row.
// NOTE(review): `header_row_idx` is not checked against `end` here;
// confirm how `Range::range` behaves when the index lies past the last row.
if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) {
Ok(sheet.range((header_row_idx, start.1), end))
} else {
// No start/end position available: return the sheet unchanged
Ok(sheet)
}
}
}
}

fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
Expand Down
Loading

0 comments on commit 8efe95d

Please sign in to comment.