Skip to content

Commit

Permalink
add guess_max param for guessing col_type; fixes #223 (#257)
Browse files Browse the repository at this point in the history
* add new param n for guessing col_type

* Rename `n` to `guess_max`

* Account for explicit skip in xls col type processing

* Tests re: col types and the new guess_max parameter

* Use guess_max checking and tests from readr

* guess_max EVERYWHERE and consistent default to 1000

* Add NEWS bullet re: guess_max

* oops
  • Loading branch information
jennybc authored Feb 11, 2017
1 parent 5950a7c commit 12dad81
Show file tree
Hide file tree
Showing 14 changed files with 165 additions and 79 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# readxl 0.1.1.9000

* New argument `guess_max` lets user adjust the number of rows used to guess column types, similar to functions in readr. (#223, #257 @tklebel, @jennybc)

* Improved handling of empty cells for xlsx. (#248 @jennybc)

- Cells with no content are not loaded. Sheet extent is always computed from loaded cells, instead of the nominal dimensions reported in the worksheet. The result is to not consult the XML for empty cells that appear there simply because they have an associated style or format. This is detectable in Excel as seemingly empty cells with a format other than "General".
Expand Down
28 changes: 14 additions & 14 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

parseXml <- function(base, internal) {
invisible(.Call('readxl_parseXml', PACKAGE = 'readxl', base, internal))
}

countRows <- function(base, sheet) {
.Call('readxl_countRows', PACKAGE = 'readxl', base, sheet)
}

xls_formats <- function(path) {
.Call('readxl_xls_formats', PACKAGE = 'readxl', path)
}
Expand All @@ -21,8 +13,8 @@ xls_col_names <- function(path, i = 0L, nskip = 0L) {
.Call('readxl_xls_col_names', PACKAGE = 'readxl', path, i, nskip)
}

xls_col_types <- function(path, na, sheet = 0L, nskip = 0L, n = 100L, has_col_names = FALSE) {
.Call('readxl_xls_col_types', PACKAGE = 'readxl', path, na, sheet, nskip, n, has_col_names)
xls_col_types <- function(path, na, sheet = 0L, nskip = 0L, guess_max = 1000L, has_col_names = FALSE) {
.Call('readxl_xls_col_types', PACKAGE = 'readxl', path, na, sheet, nskip, guess_max, has_col_names)
}

xls_cols <- function(path, i, col_names, col_types, na, nskip = 0L) {
Expand All @@ -49,16 +41,24 @@ parse_ref <- function(ref) {
.Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
}

xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, n = 100L) {
.Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, n)
xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, guess_max = 1000L) {
.Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, guess_max)
}

xlsx_col_names <- function(path, sheet = 0L, nskip = 0L) {
.Call('readxl_xlsx_col_names', PACKAGE = 'readxl', path, sheet, nskip)
}

read_xlsx_ <- function(path, sheet, col_names, col_types, na, nskip = 0L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet, col_names, col_types, na, nskip)
read_xlsx_ <- function(path, sheet, col_names, col_types, na, nskip = 0L, guess_max = 1000L) {
.Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet, col_names, col_types, na, nskip, guess_max)
}

parseXml <- function(base, internal) {
invisible(.Call('readxl_parseXml', PACKAGE = 'readxl', base, internal))
}

countRows <- function(base, sheet) {
.Call('readxl_countRows', PACKAGE = 'readxl', base, sheet)
}

zip_xml <- function(zip_path, file_path) {
Expand Down
36 changes: 28 additions & 8 deletions R/read_excel.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ NULL
#' @param na Character vector of strings to use for missing values. By default
#' readxl treats blank cells as missing data.
#' @param skip Number of rows to skip before reading any data.
#' @param guess_max Maximum number of rows to use for guessing column types.
#' @export
#' @examples
#' datasets <- readxl_example("datasets.xlsx")
Expand All @@ -27,13 +28,14 @@ NULL
#' # Skipping rows and using default column names
#' read_excel(datasets, skip = 148, col_names = FALSE)
read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0) {
na = "", skip = 0, guess_max = 1000) {

path <- check_file(path)
guess_max <- check_guess_max(guess_max)

switch(excel_format(path),
xls = read_xls(path, sheet, col_names, col_types, na, skip),
xlsx = read_xlsx(path, sheet, col_names, col_types, na, skip)
xls = read_xls(path, sheet, col_names, col_types, na, skip, guess_max),
xlsx = read_xlsx(path, sheet, col_names, col_types, na, skip, guess_max)
)
}

Expand All @@ -44,7 +46,7 @@ read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
#' @rdname read_excel
#' @export
read_xls <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0) {
na = "", skip = 0, guess_max = 1000) {

sheet <- standardise_sheet(sheet, xls_sheets(path))

Expand All @@ -56,7 +58,9 @@ read_xls <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
}

if (is.null(col_types)) {
col_types <- xls_col_types(path, sheet, na = na, nskip = skip, has_col_names = has_col_names)
col_types <- xls_col_types(path, sheet, na = na, nskip = skip,
has_col_names = has_col_names,
guess_max = guess_max)
}

tibble::repair_names(
Expand All @@ -72,14 +76,14 @@ read_xls <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
#' @rdname read_excel
#' @export
read_xlsx <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
na = "", skip = 0) {
na = "", skip = 0, guess_max = 1000) {
path <- check_file(path)
sheet <- standardise_sheet(sheet, xlsx_sheets(path))

tibble::repair_names(
tibble::as_tibble(
read_xlsx_(path, sheet, col_names = col_names, col_types = col_types,
na = na, nskip = skip),
read_xlsx_(path, sheet, col_names, col_types, na,
nskip = skip, guess_max = guess_max),
validate = FALSE
),
prefix = "X", sep = "__"
Expand Down Expand Up @@ -123,3 +127,19 @@ standardise_sheet <- function(sheet, sheet_names) {
stop("`sheet` must be either an integer or a string.", call. = FALSE)
}
}

## from readr
check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) {

if (length(guess_max) != 1 || !is.numeric(guess_max) || !is_integerish(guess_max) ||
is.na(guess_max) || guess_max < 0) {
stop("`guess_max` must be a positive integer", call. = FALSE)
}

if (guess_max > max_limit) {
warning("`guess_max` is a very large value, setting to `", max_limit,
"` to avoid exhausting memory", call. = FALSE)
guess_max <- max_limit
}
guess_max
}
4 changes: 4 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@ is_absolute_path <- function(path) {
}

isFALSE <- function(x) identical(x, FALSE)

is_integerish <- function(x) {
floor(x) == x
}
10 changes: 6 additions & 4 deletions man/read_excel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

69 changes: 35 additions & 34 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,29 +5,6 @@

using namespace Rcpp;

// parseXml
void parseXml(std::string base, std::string internal);
RcppExport SEXP readxl_parseXml(SEXP baseSEXP, SEXP internalSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type base(baseSEXP);
Rcpp::traits::input_parameter< std::string >::type internal(internalSEXP);
parseXml(base, internal);
return R_NilValue;
END_RCPP
}
// countRows
int countRows(std::string base, int sheet);
RcppExport SEXP readxl_countRows(SEXP baseSEXP, SEXP sheetSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type base(baseSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
rcpp_result_gen = Rcpp::wrap(countRows(base, sheet));
return rcpp_result_gen;
END_RCPP
}
// xls_formats
std::map<int,std::string> xls_formats(std::string path);
RcppExport SEXP readxl_xls_formats(SEXP pathSEXP) {
Expand Down Expand Up @@ -64,18 +41,18 @@ BEGIN_RCPP
END_RCPP
}
// xls_col_types
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet, int nskip, int n, bool has_col_names);
RcppExport SEXP readxl_xls_col_types(SEXP pathSEXP, SEXP naSEXP, SEXP sheetSEXP, SEXP nskipSEXP, SEXP nSEXP, SEXP has_col_namesSEXP) {
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet, int nskip, int guess_max, bool has_col_names);
RcppExport SEXP readxl_xls_col_types(SEXP pathSEXP, SEXP naSEXP, SEXP sheetSEXP, SEXP nskipSEXP, SEXP guess_maxSEXP, SEXP has_col_namesSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
Rcpp::traits::input_parameter< bool >::type has_col_names(has_col_namesSEXP);
rcpp_result_gen = Rcpp::wrap(xls_col_types(path, na, sheet, nskip, n, has_col_names));
rcpp_result_gen = Rcpp::wrap(xls_col_types(path, na, sheet, nskip, guess_max, has_col_names));
return rcpp_result_gen;
END_RCPP
}
Expand Down Expand Up @@ -153,17 +130,17 @@ BEGIN_RCPP
END_RCPP
}
// xlsx_col_types
CharacterVector xlsx_col_types(std::string path, int sheet, CharacterVector na, int nskip, int n);
RcppExport SEXP readxl_xlsx_col_types(SEXP pathSEXP, SEXP sheetSEXP, SEXP naSEXP, SEXP nskipSEXP, SEXP nSEXP) {
CharacterVector xlsx_col_types(std::string path, int sheet, CharacterVector na, int nskip, int guess_max);
RcppExport SEXP readxl_xlsx_col_types(SEXP pathSEXP, SEXP sheetSEXP, SEXP naSEXP, SEXP nskipSEXP, SEXP guess_maxSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
Rcpp::traits::input_parameter< CharacterVector >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
Rcpp::traits::input_parameter< int >::type n(nSEXP);
rcpp_result_gen = Rcpp::wrap(xlsx_col_types(path, sheet, na, nskip, n));
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
rcpp_result_gen = Rcpp::wrap(xlsx_col_types(path, sheet, na, nskip, guess_max));
return rcpp_result_gen;
END_RCPP
}
Expand All @@ -181,8 +158,8 @@ BEGIN_RCPP
END_RCPP
}
// read_xlsx_
List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::vector<std::string> na, int nskip);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheetSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP) {
List read_xlsx_(std::string path, int sheet, RObject col_names, RObject col_types, std::vector<std::string> na, int nskip, int guess_max);
RcppExport SEXP readxl_read_xlsx_(SEXP pathSEXP, SEXP sheetSEXP, SEXP col_namesSEXP, SEXP col_typesSEXP, SEXP naSEXP, SEXP nskipSEXP, SEXP guess_maxSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Expand All @@ -192,7 +169,31 @@ BEGIN_RCPP
Rcpp::traits::input_parameter< RObject >::type col_types(col_typesSEXP);
Rcpp::traits::input_parameter< std::vector<std::string> >::type na(naSEXP);
Rcpp::traits::input_parameter< int >::type nskip(nskipSEXP);
rcpp_result_gen = Rcpp::wrap(read_xlsx_(path, sheet, col_names, col_types, na, nskip));
Rcpp::traits::input_parameter< int >::type guess_max(guess_maxSEXP);
rcpp_result_gen = Rcpp::wrap(read_xlsx_(path, sheet, col_names, col_types, na, nskip, guess_max));
return rcpp_result_gen;
END_RCPP
}
// parseXml
void parseXml(std::string base, std::string internal);
RcppExport SEXP readxl_parseXml(SEXP baseSEXP, SEXP internalSEXP) {
BEGIN_RCPP
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type base(baseSEXP);
Rcpp::traits::input_parameter< std::string >::type internal(internalSEXP);
parseXml(base, internal);
return R_NilValue;
END_RCPP
}
// countRows
int countRows(std::string base, int sheet);
RcppExport SEXP readxl_countRows(SEXP baseSEXP, SEXP sheetSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type base(baseSEXP);
Rcpp::traits::input_parameter< int >::type sheet(sheetSEXP);
rcpp_result_gen = Rcpp::wrap(countRows(base, sheet));
return rcpp_result_gen;
END_RCPP
}
Expand Down
7 changes: 4 additions & 3 deletions src/XlsWorkSheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ CharacterVector xls_col_names(std::string path, int i = 0, int nskip = 0) {
}

// [[Rcpp::export]]
CharacterVector xls_col_types(std::string path, std::vector<std::string> na, int sheet = 0,
int nskip = 0, int n = 100, bool has_col_names = false) {
CharacterVector xls_col_types(std::string path, std::vector<std::string> na,
int sheet = 0, int nskip = 0,
int guess_max = 1000, bool has_col_names = false) {
XlsWorkBook wb = XlsWorkBook(path);
std::vector<CellType> types = wb.sheet(sheet).colTypes(na, nskip + has_col_names, n);
std::vector<CellType> types = wb.sheet(sheet).colTypes(na, nskip + has_col_names, guess_max);

CharacterVector out(types.size());
for (size_t i = 0; i < types.size(); ++i) {
Expand Down
5 changes: 3 additions & 2 deletions src/XlsWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,11 @@ class XlsWorkSheet {
return out;
}

std::vector<CellType> colTypes(const StringSet &na, int nskip = 0, int n_max = 100) {
std::vector<CellType> colTypes(const StringSet &na, int nskip = 0,
int guess_max = 1000) {
std::vector<CellType> types(ncol_);

for (int i = nskip; i < nrow_ && i < n_max; ++i) {
for (int i = nskip; i < nrow_ && i < nskip + guess_max; ++i) {
if ((i + 1) % 10000 == 0)
Rcpp::checkUserInterrupt();

Expand Down
11 changes: 6 additions & 5 deletions src/XlsxWorkSheet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ IntegerVector parse_ref(std::string ref) {

// [[Rcpp::export]]
CharacterVector xlsx_col_types(std::string path, int sheet = 0,
CharacterVector na = CharacterVector(), int nskip = 0,
int n = 100) {
CharacterVector na = CharacterVector(),
int nskip = 0, int guess_max = 1000) {

XlsxWorkSheet ws(path, sheet, nskip);
std::vector<CellType> types = ws.colTypes(na, nskip, n);
std::vector<CellType> types = ws.colTypes(na, nskip, guess_max);

CharacterVector out(types.size());
for (size_t i = 0; i < types.size(); ++i) {
Expand All @@ -38,7 +38,8 @@ CharacterVector xlsx_col_names(std::string path, int sheet = 0, int nskip = 0) {

// [[Rcpp::export]]
List read_xlsx_(std::string path, int sheet, RObject col_names,
RObject col_types, std::vector<std::string> na, int nskip = 0) {
RObject col_types, std::vector<std::string> na,
int nskip = 0, int guess_max = 1000) {

XlsxWorkSheet ws(path, sheet, nskip);

Expand Down Expand Up @@ -67,7 +68,7 @@ List read_xlsx_(std::string path, int sheet, RObject col_names,
std::vector<CellType> colTypes;
switch(TYPEOF(col_types)) {
case NILSXP:
colTypes = ws.colTypes(na, 100, sheetHasColumnNames);
colTypes = ws.colTypes(na, guess_max, sheetHasColumnNames);
break;
case STRSXP:
colTypes = cellTypes(as<CharacterVector>(col_types));
Expand Down
5 changes: 3 additions & 2 deletions src/XlsxWorkSheet.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ class XlsxWorkSheet {
// moved out of here, so we don't read column names again inside this fxn.
// More comments near end of fxn.
std::vector<CellType> colTypes(const StringSet& na,
int n_max = 100, bool has_col_names = false) {
int guess_max = 1000,
bool has_col_names = false) {
std::vector<CellType> types;
types.resize(ncol_);

Expand All @@ -93,7 +94,7 @@ class XlsxWorkSheet {
// account for any empty rows between column headers and data start
i = it->row() - base;

while (i < n_max && it != cells_.end()) {
while (i < guess_max && it != cells_.end()) {
// find the end of current row
row_end = it;
while(row_end != cells_.end() && row_end->row() == it->row()) {
Expand Down
Binary file added tests/testthat/sheets/types.xls
Binary file not shown.
Binary file modified tests/testthat/sheets/types.xlsx
Binary file not shown.
Loading

0 comments on commit 12dad81

Please sign in to comment.