diff --git a/.Rbuildignore b/.Rbuildignore index 972db163..8a3bce13 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,5 @@ ^_pkgdown\.yml$ ^docs$ ^clippy$ +^\[MS-XLS\]\.pdf$ +^excelfileformat\.pdf$ diff --git a/.gitignore b/.gitignore index 59723256..5f690587 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ .RData ~* clippy +\[MS-XLS\].pdf +excelfileformat.pdf diff --git a/NEWS.md b/NEWS.md index ab0da5e7..c261642f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,15 @@ # readxl 0.1.1.9000 +*currently much of this applies only to xlsx, but will be extended to xls* + +* A user-specified `col_types` of length one will be replicated to have length equal to the number of columns. (#127, #114, #261 @jennybc) + +* Column type `"blank"` has been deprecated in favor of the more descriptive `"skip"`, which also supports the goal to become more consistent with readr. (#260, #193, #261 @jennybc) + +* User-supplied `col_names` are processed relative to user-supplied `col_types`, if given. Specifically, `col_names` is considered valid if it has the same length as `col_types`, before *or after* removing skipped columns. (#81, #261 @jennybc) + +* Leading or embedded empty columns are no longer dropped, regardless of whether there is a column name. (#157, #261 @jennybc) + * New argument `guess_max` lets user adjust the number of rows used to guess column types, similar to functions in readr. (#223, #257 @tklebel, @jennybc) * Improved handling of empty cells for xlsx. (#248 @jennybc) diff --git a/R/RcppExports.R b/R/RcppExports.R index 14700877..2b1c41e2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -41,8 +41,8 @@ parse_ref <- function(ref) { .Call('readxl_parse_ref', PACKAGE = 'readxl', ref) } -xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, guess_max = 1000L) { - .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, guess_max) +xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, guess_max = 1000L, sheetHasColumnNames = FALSE) { + .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, guess_max, sheetHasColumnNames) } xlsx_col_names <- function(path, sheet = 0L, nskip = 0L) { @@ -53,14 +53,6 @@ read_xlsx_ <- function(path, sheet, col_names, col_types, na, nskip = 0L, guess_ .Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet, col_names, col_types, na, nskip, guess_max) } -parseXml <- function(base, internal) { - invisible(.Call('readxl_parseXml', PACKAGE = 'readxl', base, internal)) -} - -countRows <- function(base, sheet) { - .Call('readxl_countRows', PACKAGE = 'readxl', base, sheet) -} - zip_xml <- function(zip_path, file_path) { invisible(.Call('readxl_zip_xml', PACKAGE = 'readxl', zip_path, file_path)) } diff --git a/R/read_excel.R b/R/read_excel.R index 42a6fe3f..5b9e5c68 100644 --- a/R/read_excel.R +++ b/R/read_excel.R @@ -7,14 +7,19 @@ NULL #' @param path Path to the xls/xlsx file #' @param sheet Sheet to read. Either a string (the name of a sheet), or an #' integer (the position of the sheet). Defaults to the first sheet. -#' @param col_names `TRUE` to use the first row as column names, `FALSE` -#' to get default names, or a character vector giving a name for each column. +#' @param col_names `TRUE` to use the first row as column names, `FALSE` to get +#' default names, or a character vector giving a name for each column. If user +#' provides `col_types` as a vector, `col_names` can have one entry per +#' column, i.e. have the same length as `col_types`, or one entry per +#' unskipped column. #' @param col_types Either `NULL` to guess from the spreadsheet or a character -#' vector containing one entry per column from these options: "blank", -#' "numeric", "date" or "text". -#' @param na Character vector of strings to use for missing values. By default +#' vector containing one entry per column from these options: "skip", +#' "numeric", "date" or "text". The content of a cell in a skipped column is +#' never read and that column will not appear in the data frame output. +#' @param na Character vector of strings to use for missing values. By default, #' readxl treats blank cells as missing data. -#' @param skip Number of rows to skip before reading any data. +#' @param skip Number of rows to skip before reading any data. Leading blank +#' rows are automatically skipped. #' @param guess_max Maximum number of rows to use for guessing column types. #' @export #' @examples @@ -32,6 +37,7 @@ read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL, path <- check_file(path) guess_max <- check_guess_max(guess_max) + col_types <- check_col_types(col_types) switch(excel_format(path), xls = read_xls(path, sheet, col_names, col_types, na, skip, guess_max), @@ -128,6 +134,30 @@ standardise_sheet <- function(sheet, sheet_names) { } } +check_col_types <- function(col_types) { + if (is.null(col_types)) { + return(col_types) + } + stopifnot(is.character(col_types), length(col_types) > 0, !anyNA(col_types)) + + blank <- col_types == "blank" + if (any(blank)) { + message("`col_type = \"blank\"` deprecated. Use \"skip\" instead.") + col_types[blank] <- "skip" + } + + accepted_types <- c("skip", "numeric", "date", "text") + ok <- col_types %in% accepted_types + if (any(!ok)) { + info <- paste( + paste0("'", col_types[!ok], "' [", seq_along(col_types)[!ok], "]"), + collapse = ", " + ) + stop(paste("Illegal column type:", info), call. = FALSE) + } + col_types +} + ## from readr check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) { diff --git a/README.Rmd b/README.Rmd index c1619a93..74acf6dc 100644 --- a/README.Rmd +++ b/README.Rmd @@ -94,6 +94,8 @@ If you are new to the tidyverse conventions for data import, you may want to con * Loads datetimes into POSIXct columns. Both Windows (1900) and Mac (1904) date specifications are processed correctly. -* Blank columns are automatically dropped (*but this is changing!*). Blank rows that appear before the data are automatically dropped; embedded blank rows are not. +* Blank rows that appear before the data are automatically dropped; embedded blank rows are not. User can exert more control of this with `skip`. + +* Column names and types are determined from the data in the sheet, by default, but user can also supply via `col_names` and `col_types`. * It returns a tibble, i.e. a data frame with an additional `tbl_df` class. Among other things, this provide nicer printing. diff --git a/README.md b/README.md index 02f962ed..0a6bef13 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,8 @@ Features - Loads datetimes into POSIXct columns. Both Windows (1900) and Mac (1904) date specifications are processed correctly. -- Blank columns are automatically dropped (*but this is changing!*). Blank rows that appear before the data are automatically dropped; embedded blank rows are not. +- Blank rows that appear before the data are automatically dropped; embedded blank rows are not. User can exert more control of this with `skip`. + +- Column names and types are determined from the data in the sheet, by default, but user can also supply via `col_names` and `col_types`. - It returns a tibble, i.e. a data frame with an additional `tbl_df` class. Among other things, this provide nicer printing. diff --git a/docs/index.html b/docs/index.html index 0f550515..2d30db2d 100644 --- a/docs/index.html +++ b/docs/index.html @@ -142,7 +142,8 @@

diff --git a/docs/news/index.html b/docs/news/index.html index 479d7f42..925a0309 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -82,7 +82,13 @@

Change log All releases

readxl 0.1.1.9000

+

currently much of this applies only to xlsx, but will be extended to xls