Refactor xlsx col type and col name handling (#261)

* add new param n for guessing col_type * Remove all column name processing from colTypes() * Move colName and colType comparison out of readCols() * Stop dropping blank columns; fixes #157 * Move tests into more logical places * More specific error when col_names or col_types has wrong length * Rcpp churn * If no data, call it blank right here * Simplify col_type learning loop * Sketch my grand plans * Check and test col_types * Simplify readCols() loop * Recycle col_types if length 1; fixes #127 * Deprecate `col_types = "blank"`; fixes #260 * Rationalize joint processing of col_names + col_types; fixes #81 * Ignore docs for xls format * Fix/update xlsx_col_types() Even though I'm not sure what it's for. * Delete benchmarks.cpp, home of the shifty, vestigial parseXml()and countRows() * Add bullets to NEWS * README and pkgdown * Groom error messages and tests thereof * Wording in NEWS * Stop for unknown type * Move CELL_SKIP; add comment re: enum order * Recycle length-one col_type with std::fill() * Indenting * Helper function for reconciling col names * Helper function for recycling col types * Simplify col_type learning loop; avoid cell copy * Simplify and avoid cell copy in readCols() too * It's guess_max, not max_guess * Delete comment, test code style * Cleaner with i, j
tidyverse · Feb 13, 2017 · 4a34a17 · 4a34a17
1 parent 12dad81
commit 4a34a17
Show file tree

Hide file tree

Showing 25 changed files with 423 additions and 293 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -13,3 +13,5 @@
 ^_pkgdown\.yml$
 ^docs$
 ^clippy$
+^\[MS-XLS\]\.pdf$
+^excelfileformat\.pdf$
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,5 @@
 .RData
 ~*
 clippy
+\[MS-XLS\].pdf
+excelfileformat.pdf
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,15 @@
 # readxl 0.1.1.9000
 
+*currently much of this applies only to xlsx, but will be extended to xls*
+
+* A user-specified `col_types` of length one will be replicated to have length equal to the number of columns. (#127, #114, #261 @jennybc)
+
+* Column type `"blank"` has been deprecated in favor of the more descriptive `"skip"`, which also supports the goal to become more consistent with readr. (#260, #193, #261 @jennybc)
+
+* User-supplied `col_names` are processed relative to user-supplied `col_types`, if given. Specifically, `col_names` is considered valid if it has the same length as `col_types`, before *or after* removing skipped columns. (#81, #261 @jennybc)
+
+* Leading or embedded empty columns are no longer dropped, regardless of whether there is a column name. (#157, #261 @jennybc)
+
 * New argument `guess_max` lets user adjust the number of rows used to guess column types, similar to functions in readr. (#223, #257 @tklebel, @jennybc)
 
 * Improved handling of empty cells for xlsx. (#248 @jennybc)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -41,8 +41,8 @@ parse_ref <- function(ref) {
     .Call('readxl_parse_ref', PACKAGE = 'readxl', ref)
 }
 
-xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, guess_max = 1000L) {
-    .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, guess_max)
+xlsx_col_types <- function(path, sheet = 0L, na = character(), nskip = 0L, guess_max = 1000L, sheetHasColumnNames = FALSE) {
+    .Call('readxl_xlsx_col_types', PACKAGE = 'readxl', path, sheet, na, nskip, guess_max, sheetHasColumnNames)
 }
 
 xlsx_col_names <- function(path, sheet = 0L, nskip = 0L) {
@@ -53,14 +53,6 @@ read_xlsx_ <- function(path, sheet, col_names, col_types, na, nskip = 0L, guess_
     .Call('readxl_read_xlsx_', PACKAGE = 'readxl', path, sheet, col_names, col_types, na, nskip, guess_max)
 }
 
-parseXml <- function(base, internal) {
-    invisible(.Call('readxl_parseXml', PACKAGE = 'readxl', base, internal))
-}
-
-countRows <- function(base, sheet) {
-    .Call('readxl_countRows', PACKAGE = 'readxl', base, sheet)
-}
-
 zip_xml <- function(zip_path, file_path) {
     invisible(.Call('readxl_zip_xml', PACKAGE = 'readxl', zip_path, file_path))
 }

diff --git a/R/read_excel.R b/R/read_excel.R
@@ -7,14 +7,19 @@ NULL
 #' @param path Path to the xls/xlsx file
 #' @param sheet Sheet to read. Either a string (the name of a sheet), or an
 #'   integer (the position of the sheet). Defaults to the first sheet.
-#' @param col_names `TRUE` to use the first row as column names, `FALSE`
-#'   to get default names, or a character vector giving a name for each column.
+#' @param col_names `TRUE` to use the first row as column names, `FALSE` to get
+#'   default names, or a character vector giving a name for each column. If user
+#'   provides `col_types` as a vector, `col_names` can have one entry per
+#'   column, i.e. have the same length as `col_types`, or one entry per
+#'   unskipped column.
 #' @param col_types Either `NULL` to guess from the spreadsheet or a character
-#'   vector containing one entry per column from these options: "blank",
-#'   "numeric", "date" or "text".
-#' @param na Character vector of strings to use for missing values. By default
+#'   vector containing one entry per column from these options: "skip",
+#'   "numeric", "date" or "text". The content of a cell in a skipped column is
+#'   never read and that column will not appear in the data frame output.
+#' @param na Character vector of strings to use for missing values. By default,
 #'   readxl treats blank cells as missing data.
-#' @param skip Number of rows to skip before reading any data.
+#' @param skip Number of rows to skip before reading any data. Leading blank
+#'   rows are automatically skipped.
 #' @param guess_max Maximum number of rows to use for guessing column types.
 #' @export
 #' @examples
@@ -32,6 +37,7 @@ read_excel <- function(path, sheet = 1L, col_names = TRUE, col_types = NULL,
 
   path <- check_file(path)
   guess_max <- check_guess_max(guess_max)
+  col_types <- check_col_types(col_types)
 
   switch(excel_format(path),
     xls =  read_xls(path, sheet, col_names, col_types, na, skip, guess_max),
@@ -128,6 +134,30 @@ standardise_sheet <- function(sheet, sheet_names) {
   }
 }
 
+check_col_types <- function(col_types) {
+  if (is.null(col_types)) {
+    return(col_types)
+  }
+  stopifnot(is.character(col_types), length(col_types) > 0, !anyNA(col_types))
+
+  blank <- col_types == "blank"
+  if (any(blank)) {
+    message("`col_type = \"blank\"` deprecated. Use \"skip\" instead.")
+    col_types[blank] <- "skip"
+  }
+
+  accepted_types <- c("skip", "numeric", "date", "text")
+  ok <- col_types %in% accepted_types
+  if (any(!ok)) {
+    info <- paste(
+      paste0("'", col_types[!ok], "' [", seq_along(col_types)[!ok], "]"),
+      collapse = ", "
+    )
+    stop(paste("Illegal column type:", info), call. = FALSE)
+  }
+  col_types
+}
+
 ## from readr
 check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) {
 

diff --git a/README.Rmd b/README.Rmd
@@ -94,6 +94,8 @@ If you are new to the tidyverse conventions for data import, you may want to con
 * Loads datetimes into POSIXct columns. Both Windows (1900) and Mac (1904) 
   date specifications are processed correctly.
 
-* Blank columns are automatically dropped (*but this is changing!*). Blank rows that appear before the data are automatically dropped; embedded blank rows are not.
+* Blank rows that appear before the data are automatically dropped; embedded blank rows are not. User can exert more control of this with `skip`.
+
+* Column names and types are determined from the data in the sheet, by default, but user can also supply via `col_names` and `col_types`.
 
 * It returns a tibble, i.e. a data frame with an additional `tbl_df` class. Among other things, this provide nicer printing.
diff --git a/README.md b/README.md
@@ -117,6 +117,8 @@ Features
 
 -   Loads datetimes into POSIXct columns. Both Windows (1900) and Mac (1904) date specifications are processed correctly.
 
--   Blank columns are automatically dropped (*but this is changing!*). Blank rows that appear before the data are automatically dropped; embedded blank rows are not.
+-   Blank rows that appear before the data are automatically dropped; embedded blank rows are not. User can exert more control of this with `skip`.
+
+-   Column names and types are determined from the data in the sheet, by default, but user can also supply via `col_names` and `col_types`.
 
 -   It returns a tibble, i.e. a data frame with an additional `tbl_df` class. Among other things, this provide nicer printing.
diff --git a/docs/index.html b/docs/index.html
diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/docs/reference/read_excel.html b/docs/reference/read_excel.html
diff --git a/man/read_excel.Rd b/man/read_excel.Rd
diff --git a/src/CellType.h b/src/CellType.h
@@ -5,8 +5,12 @@
 #include <libxls/xls.h>
 #include "StringSet.h"
 
+// CELL_BLANK can arise only from an individual cell during type guessing
+// important that it be the first entry = default type, instead of ...
+// CELL_SKIP  which can arise only as a user-specified column type
 enum CellType {
   CELL_BLANK,
+  CELL_SKIP,
   CELL_DATE,
   CELL_NUMERIC,
   CELL_TEXT
@@ -26,11 +30,12 @@ inline std::vector<CellType> cellTypes(Rcpp::CharacterVector x) {
       types.push_back(CELL_DATE);
     } else if (type == "numeric") {
       types.push_back(CELL_NUMERIC);
+    } else if (type == "skip") {
+      types.push_back(CELL_SKIP);
     } else if (type == "text") {
       types.push_back(CELL_TEXT);
     } else {
-      Rcpp::warning("Unknown type '%s' at position %i. Using text instead.",
-        type, i + 1);
+      Rcpp::stop("Unknown type '%s' at position %i", type, i + 1);
     }
   }
 
@@ -42,6 +47,7 @@ inline std::string cellTypeDesc(CellType type) {
   case CELL_BLANK:   return "blank";
   case CELL_DATE:    return "date";
   case CELL_NUMERIC: return "numeric";
+  case CELL_SKIP:    return "skip";
   case CELL_TEXT:    return "text";
   }
   return "???";
@@ -153,6 +159,9 @@ inline Rcpp::RObject makeCol(CellType type, int n) {
   case CELL_NUMERIC:
     return Rcpp::NumericVector(n, NA_REAL);
     break;
+  case CELL_SKIP:
+    return R_NilValue;
+    break;
   case CELL_TEXT:
     return Rcpp::CharacterVector(n, NA_STRING);
     break;
@@ -161,24 +170,24 @@ inline Rcpp::RObject makeCol(CellType type, int n) {
   return R_NilValue;
 }
 
-// Drop blanks from list of columns
-inline Rcpp::List removeBlankColumns(Rcpp::List cols,
-                                     Rcpp::CharacterVector names,
-                                     std::vector<CellType> types) {
+inline Rcpp::List removeSkippedColumns(Rcpp::List cols,
+                                       Rcpp::CharacterVector names,
+                                       std::vector<CellType> types) {
   int p = cols.size();
 
   int p_out = 0;
   for (int j = 0; j < p; ++j) {
-    if (types[j] != CELL_BLANK)
+    if (types[j] != CELL_SKIP)
       p_out++;
   }
 
   Rcpp::List out(p_out);
   Rcpp::CharacterVector names_out(p_out);
   int j_out = 0;
   for (int j = 0; j < p; ++j) {
-    if (types[j] == CELL_BLANK)
+    if (types[j] == CELL_SKIP) {
       continue;
+    }
 
     out[j_out] = cols[j];
     names_out[j_out] = names[j];

diff --git a/src/ColSpec.h b/src/ColSpec.h
@@ -0,0 +1,49 @@
+#ifndef READXL_COLSPEC_
+#define READXL_COLSPEC_
+
+#include <Rcpp.h>
+#include "CellType.h"
+
+inline std::vector<CellType> recycleTypes(std::vector<CellType> types,
+                                          int ncol) {
+  if (types.size() == 1) {
+    types.resize(ncol);
+    std::fill(types.begin(), types.end(), types[0]);
+  }
+  return types;
+}
+
+inline Rcpp::CharacterVector reconcileNames(Rcpp::CharacterVector names,
+                                            const std::vector<CellType>& types,
+                                            int sheet) {
+  size_t ncol_names = names.size();
+  size_t ncol_types = types.size();
+
+  if (ncol_names == ncol_types) {
+    return names;
+  }
+
+  size_t ncol_noskip = 0;
+  for (size_t i = 0; i < types.size(); i++) {
+    if (types[i] != CELL_SKIP) {
+      ncol_noskip++;
+    }
+  }
+  if (ncol_names != ncol_noskip) {
+    Rcpp::stop("Sheet %d has %d columns (%d unskipped), but `col_names` has length %d.",
+               sheet + 1, ncol_types, ncol_noskip, ncol_names);
+  }
+
+  Rcpp::CharacterVector newNames(ncol_types, "");
+  size_t j_short = 0;
+  for (size_t j_long = 0; j_long < ncol_types; ++j_long) {
+    if (types[j_long] == CELL_SKIP) {
+      continue;
+    }
+    newNames[j_long] = names[j_short];
+    j_short++;
+  }
+  return newNames;
+}
+
+#endif