Skip to content

Commit

Permalink
fix(excelsheet): use_columns should work when columns are skipped (#217)
Browse files Browse the repository at this point in the history
closes #214

Signed-off-by: Luka Peschke <[email protected]>
  • Loading branch information
lukapeschke authored Mar 29, 2024
1 parent 026f273 commit 4dae4e8
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 44 deletions.
36 changes: 16 additions & 20 deletions python/tests/test_column_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,43 +240,39 @@ def test_single_sheet_with_unnamed_columns_and_pagination_and_column_names(
use_columns_idx = [0, 2, 3]
expected: dict[str, list[Any]] = {
"col0": [2.0, 3.0],
"col2": ["hello", "world"],
"col3": [-5.0, -6.0],
"col1": ["hello", "world"],
"col2": [-5.0, -6.0],
}
column_names = [f"col{i}" for i in range(5)]
column_names = [f"col{i}" for i in range(3)]
expected_columns_names = ["col0", "__UNNAMED__1", "col1", "col2", "__UNNAMED__4"]

# skipping the header row only
sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_str, skip_rows=1, column_names=column_names
)
assert [col.name for col in sheet.available_columns] == column_names

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))
with pytest.raises(
fastexcel.InvalidParametersError,
match='use_columns can only contain integers when used with columns_names, got "col0"',
):
excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns",
use_columns=use_columns_str,
skip_rows=1,
column_names=column_names,
)

sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_idx, skip_rows=1, column_names=column_names
)
assert [col.name for col in sheet.available_columns] == column_names
assert [col.name for col in sheet.available_columns] == expected_columns_names

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected))

# skipping the header row + first data row
expected_first_row_skipped = {k: v[1:] for k, v in expected.items()}

sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_str, skip_rows=2, column_names=column_names
)
assert [col.name for col in sheet.available_columns] == column_names

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))

sheet = excel_reader_single_sheet_with_unnamed_columns.load_sheet(
"With unnamed columns", use_columns=use_columns_idx, skip_rows=2, column_names=column_names
)
assert [col.name for col in sheet.available_columns] == column_names
assert [col.name for col in sheet.available_columns] == expected_columns_names

pd_assert_frame_equal(sheet.to_pandas(), pd.DataFrame(expected_first_row_skipped))
pl_assert_frame_equal(sheet.to_polars(), pl.DataFrame(expected_first_row_skipped))
Expand Down
74 changes: 74 additions & 0 deletions python/tests/test_use_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations

import fastexcel
import pandas as pd
import polars as pl
from pandas.testing import assert_frame_equal as pd_assert_frame_equal
from polars.testing import assert_frame_equal as pl_assert_frame_equal
from utils import path_for_fixture


def test_use_columns_with_use_columns() -> None:
excel_reader = fastexcel.read_excel(path_for_fixture("fixture-single-sheet-with-types.xlsx"))

sheet = excel_reader.load_sheet(
0,
use_columns=[1, 2],
header_row=None,
skip_rows=1,
column_names=["bools_renamed", "dates_renamed"],
)

assert sheet.available_columns == [
fastexcel.ColumnInfo(
name="__UNNAMED__0",
column_name_from="generated",
index=0,
dtype="float",
dtype_from="guessed",
),
fastexcel.ColumnInfo(
name="bools_renamed",
index=1,
dtype="boolean",
dtype_from="guessed",
column_name_from="provided",
),
fastexcel.ColumnInfo(
name="dates_renamed",
index=2,
dtype="datetime",
dtype_from="guessed",
column_name_from="provided",
),
fastexcel.ColumnInfo(
name="__UNNAMED__3",
index=3,
dtype="float",
dtype_from="guessed",
column_name_from="generated",
),
]

pd_assert_frame_equal(
sheet.to_pandas(),
pd.DataFrame(
{
"bools_renamed": [True, False, True],
"dates_renamed": pd.Series([pd.Timestamp("2022-03-02 05:43:04")] * 3).astype(
"datetime64[ms]"
),
}
),
)
pl_assert_frame_equal(
sheet.to_polars(),
pl.DataFrame(
{
"bools_renamed": [True, False, True],
"dates_renamed": ["2022-03-02 05:43:04"] * 3,
}
).with_columns(
pl.col("dates_renamed").str.strptime(pl.Datetime, "%F %T").dt.cast_time_unit("ms")
),
)
2 changes: 1 addition & 1 deletion src/types/python/excelsheet/column_info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ impl ColumnInfo {
}

pub fn __repr__(&self) -> String {
format!("ColumnInfo<name=\"{name}\", index={index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" >", name=self.name, index=self.index, dtype=self.dtype.to_string(), dtype_from=self.dtype_from.to_string(), column_name_from=self.column_name_from.to_string())
format!("ColumnInfo(name=\"{name}\", index={index}, dtype=\"{dtype}\", dtype_from=\"{dtype_from}\", column_name_from=\"{column_name_from}\" )", name=self.name, index=self.index, dtype=self.dtype.to_string(), dtype_from=self.dtype_from.to_string(), column_name_from=self.column_name_from.to_string())
}

pub fn __eq__(&self, other: &Self) -> bool {
Expand Down
96 changes: 73 additions & 23 deletions src/types/python/excelsheet/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,9 +326,6 @@ impl ExcelSheet {
selected_columns: SelectedColumns,
dtypes: Option<DTypeMap>,
) -> FastExcelResult<Self> {
// Ensuring dtypes are compatible with selected columns
// Self::validate_dtypes_and_selected_columns(&selected_columns, &dtypes)?;

let mut sheet = ExcelSheet {
name,
header,
Expand All @@ -344,7 +341,7 @@ impl ExcelSheet {
selected_columns: Vec::with_capacity(0),
};

let available_columns_info = sheet.get_available_columns_info();
let available_columns_info = sheet.get_available_columns_info(&selected_columns)?;

let mut aliased_available_columns = Vec::with_capacity(available_columns_info.len());

Expand Down Expand Up @@ -379,19 +376,22 @@ impl ExcelSheet {
Ok(sheet)
}

fn get_available_columns_info(&self) -> Vec<ColumnInfoBuilder> {
fn get_available_columns_info(
&self,
selected_columns: &SelectedColumns,
) -> FastExcelResult<Vec<ColumnInfoBuilder>> {
let width = self.data.width();
match &self.header {
Header::None => (0..width)
Header::None => Ok((0..width)
.map(|col_idx| {
ColumnInfoBuilder::new(
format!("__UNNAMED__{col_idx}"),
col_idx,
ColumnNameFrom::Generated,
)
})
.collect(),
Header::At(row_idx) => (0..width)
.collect()),
Header::At(row_idx) => Ok((0..width)
.map(|col_idx| {
self.data
.get((*row_idx, col_idx))
Expand All @@ -407,23 +407,73 @@ impl ExcelSheet {
)
})
})
.collect(),
.collect()),
Header::With(names) => {
let nameless_start_idx = names.len();
names
.iter()
.enumerate()
.map(|(col_idx, name)| {
ColumnInfoBuilder::new(name.to_owned(), col_idx, ColumnNameFrom::Provided)
})
.chain((nameless_start_idx..width).map(|col_idx| {
ColumnInfoBuilder::new(
format!("__UNNAMED__{col_idx}"),
col_idx,
ColumnNameFrom::Generated,
if let SelectedColumns::Selection(column_selection) = selected_columns {
if column_selection.len() != names.len() {
return Err(FastExcelErrorKind::InvalidParameters(
"column_names and use_columns must have the same length".to_string(),
)
}))
.collect()
.into());
}
let selected_indices = column_selection
.iter()
.map(|idx_or_name| {
match idx_or_name {
IdxOrName::Idx(idx) => Ok(*idx),
IdxOrName::Name(name) => Err(FastExcelErrorKind::InvalidParameters(
format!("use_columns can only contain integers when used with columns_names, got \"{name}\"")
)
.into()),
}
})
.collect::<FastExcelResult<Vec<_>>>()?;

Ok((0..width)
.map(|col_idx| {
let provided_name_opt = if let Some(pos_in_names) =
selected_indices.iter().position(|idx| idx == &col_idx)
{
names.get(pos_in_names).cloned()
} else {
None
};

match provided_name_opt {
Some(provided_name) => ColumnInfoBuilder::new(
provided_name,
col_idx,
ColumnNameFrom::Provided,
),
None => ColumnInfoBuilder::new(
format!("__UNNAMED__{col_idx}"),
col_idx,
ColumnNameFrom::Generated,
),
}
})
.collect())
} else {
let nameless_start_idx = names.len();
Ok(names
.iter()
.enumerate()
.map(|(col_idx, name)| {
ColumnInfoBuilder::new(
name.to_owned(),
col_idx,
ColumnNameFrom::Provided,
)
})
.chain((nameless_start_idx..width).map(|col_idx| {
ColumnInfoBuilder::new(
format!("__UNNAMED__{col_idx}"),
col_idx,
ColumnNameFrom::Generated,
)
}))
.collect())
}
}
}
}
Expand Down

0 comments on commit 4dae4e8

Please sign in to comment.