Merge pull request #340 from billdenney/clean_names-rewrite

clean_names rewrite
sfirke · Mar 12, 2020 · f0425ee · f0425ee
2 parents c01e368 + a6d9af7
commit f0425ee
Show file tree

Hide file tree

Showing 8 changed files with 576 additions and 182 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -27,6 +27,8 @@ Imports:
     magrittr,
     purrr,
     rlang,
+    stringi,
+    stringr,
     snakecase (>= 0.9.2),
     tidyselect (>= 1.0.0),
     tidyr (>= 0.7.0)
@@ -38,6 +40,7 @@ Suggests:
     knitr,
     rmarkdown,
     sf,
-    tibble
+    tibble,
+    tidygraph
 VignetteBuilder: knitr
 Encoding: UTF-8
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,8 +2,10 @@
 
 S3method(chisq.test,default)
 S3method(chisq.test,tabyl)
+S3method(clean_names,data.frame)
 S3method(clean_names,default)
 S3method(clean_names,sf)
+S3method(clean_names,tbl_graph)
 S3method(crosstab,data.frame)
 S3method(crosstab,default)
 S3method(describe_class,default)
@@ -49,11 +51,16 @@ export(tabyl)
 export(top_levels)
 export(untabyl)
 export(use_first_valid_of)
+importFrom(dplyr,rename_all)
 importFrom(lubridate,ymd)
 importFrom(lubridate,ymd_hms)
 importFrom(magrittr,"%>%")
 importFrom(rlang,dots_n)
 importFrom(rlang,expr)
 importFrom(rlang,syms)
+importFrom(snakecase,to_any_case)
 importFrom(stats,na.omit)
+importFrom(stringi,stri_trans_general)
+importFrom(stringr,str_replace)
+importFrom(stringr,str_replace_all)
 importFrom(tidyselect,eval_select)
diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +1,15 @@
 # janitor 1.2.1.9000 (unreleased), will be v 1.3.0
 
+## Breaking Changes
+
+* `clean_names()` and `make_clean_names()` now work significantly harder to be locale-independent, and translation to ASCII is simpler (i.e. in many cases, Unicode is removed and the Greek delta character becomes a "d"). You may also now control how substitutions occur and add your own substitutions (like "%" becoming "percent").  These changes alter the new names provided by these functions in some cases. (Fix #331, thanks to @billdenney)
+
 ## Major features
 
+* `clean_names()` and `make_clean_names()` have a more generic interface where all arguments from `make_clean_names()` are accessible from `clean_names()` (Fix #339, thanks to @ari-nz and @billdenney).
+
+* `make_clean_names()` now allows the user to specify parts of names to be replaced (Fix #316, thanks to @woodwards for reporting and @woodwards and @billdenney for implementing)
+
 * The variables considered by the function `get_dupes()` can be specified using the select helper functions from `tidyselect`.  This includes `-column_name` to omit a variable as well as the matching functions `starts_with()`, `ends_with()`, `contains()`, and `matches()`.  See `?tidyselect::select_helpers` for more (#326, thanks to **@jzadra** for suggesting and implementing).
 
 * The new function `signif_half_up()` rounds a numeric vector to the specified number of significant digits with halves rounded up (#314, thanks to **@khueyama** for suggesting and implementing).
@@ -14,6 +22,8 @@
 
 * `row_to_names()` will now work on matrix input (#320, thanks to **@billdenney** for suggesting and implementing).
 
+* `clean_names()` can now be called on *tbl_graph* objects from the `tidygraph` package. (#252, thanks to @gvdr for bringing up the issue and thanks to @Tazinho for proposing a solution).
+
 ## Bug fixes
 
 * `adorn_ns()` doesn't append anything to character columns when called on a data.frame resulting from a call to `adorn_percentages()`.  (#195).

diff --git a/R/clean_names.R b/R/clean_names.R
@@ -1,30 +1,27 @@
-#' @title Cleans names of a data.frame.
+#' @title Cleans names of an object (usually a data.frame).
 #'
 #' @description
 #' Resulting names are unique and consist only of the \code{_} character, numbers, and letters.
 #' Capitalization preferences can be specified using the \code{case} parameter.
 #'
-#' Accented characters are
-#' transliterated to ASCII.  For example, an "o" with a German umlaut over it becomes "o", and the Spanish character "enye" becomes "n".
+#' Accented characters are transliterated to ASCII.  For example, an "o" with a
+#' German umlaut over it becomes "o", and the Spanish character "enye" becomes
+#' "n".
 #' 
-#' This function takes and returns a data.frame, for ease of piping with  \code{`\%>\%`}.  
-#' For the underlying function that works on a character vector of names,
-#' see \code{\link[janitor]{make_clean_names}}. 
+#' This function takes and returns a data.frame, for ease of piping with
+#' \code{`\%>\%`}. For the underlying function that works on a character vector
+#' of names, see \code{\link[janitor]{make_clean_names}}.
 #'
 #' @param dat the input data.frame.
-#' @param case The desired target case (default is \code{"snake"}), indicated by these possible values:
-#' \itemize{
-#'  \item{\code{"snake"} produces snake_case}
-#'  \item{\code{"lower_camel"} or \code{"small_camel"} produces lowerCamel}
-#'  \item{\code{"upper_camel"} or \code{"big_camel"} produces UpperCamel}
-#'  \item{\code{"screaming_snake"} or \code{"all_caps"} produces ALL_CAPS}
-#'  \item{\code{"lower_upper"} produces lowerUPPER}
-#'  \item{\code{"upper_lower"} produces UPPERlower}
-#'  \item{\code{old_janitor}: legacy compatibility option to preserve behavior of \code{clean_names} prior to addition of the "case" argument(janitor versions <= 0.3.1 )}.  Provided as a quick fix for old scripts broken by the changes to \code{clean_names} in janitor v1.0.
-#'  \item{\code{"parsed"}, \code{"mixed"}, \code{"none"}: less-common cases offered by \code{snakecase::to_any_case}.  See \code{\link[snakecase]{to_any_case}} for details.}
-#'  }
-#'
+#' @inheritDotParams make_clean_names -string
 #' @return Returns the data.frame with clean names.
+#' 
+#' @details \code{clean_names()} is intended to be used on \code{data.frame}s
+#'   and \code{data.frame}-like objects. For this reason there are methods to
+#'   support using \code{clean_names()} on \code{sf} and \code{tbl_graph} (from
+#'   \code{tidygraph}) objects. For cleaning named lists and vectors, consider
+#'   applying \code{make_clean_names()} to their names.
+#' 
 #' @export
 #' @examples
 #' # not run:
@@ -39,48 +36,55 @@
 #' # not run:
 #' # library(readxl)
 #' # read_excel("messy_excel_file.xlsx") %>% clean_names()
-
-# create new clean_names method
-clean_names <- function(dat, case) {
+clean_names <- function(dat, ...) {
   UseMethod("clean_names")
 }
 
-
 #' @rdname clean_names
 #' @export
+clean_names.data.frame <- function(dat, ...) {
+  stats::setNames(dat, make_clean_names(names(dat), ...))
+}
 
-
-# create a default method, Will only dispatch on a dataframe 
-clean_names.default <- function(dat, case = c(
-  "snake", "lower_camel", "upper_camel", "screaming_snake",
-  "lower_upper", "upper_lower", "all_caps", "small_camel",
-  "big_camel", "old_janitor", "parsed", "mixed", "none"
-)) {
-  if(!is.data.frame(dat)){ 
-    stop( "clean_names() must be called on a data.frame.  Consider janitor::make_clean_names() for other cases of manipulating vectors of names.") 
-  }
-  stats::setNames(dat, make_clean_names(names(dat), case = case))
+#' @rdname clean_names
+#' @export
+clean_names.default <- function(dat, ...) {
+  stop(
+    "No `clean_names()` method exists for the class ", paste(class(dat), collapse=", "),
+    "\nConsider janitor::make_clean_names() for other cases of manipulating vectors of names."
+  )
 }
 
+#' @rdname clean_names
 #' @export
-# create method for sf object
-clean_names.sf <- function(dat, case = c(
-  "snake", "lower_camel", "upper_camel", "screaming_snake",
-  "lower_upper", "upper_lower", "all_caps", "small_camel",
-  "big_camel", "old_janitor", "parsed", "mixed", "none"
-)) {
+clean_names.sf <- function(dat, ...) {
   if (!requireNamespace("sf", quietly = TRUE)) { # nocov start
-    stop("Package \"sf\" needed for this function to work. Please install it.",
-         call. = FALSE)
+    stop(
+      "Package 'sf' needed for this function to work. Please install it.",
+      call. = FALSE
+    )
   } # nocov end
   # get old names
   sf_names <- names(dat) 
   # identify ending column index to clean
   n_cols <- length(dat)-1 
   # clean all but last column
-  sf_cleaned <- make_clean_names(sf_names[1:n_cols], case) 
+  sf_cleaned <- make_clean_names(sf_names[1:n_cols], ...) 
   # rename original df
   names(dat)[1:n_cols] <- sf_cleaned 
 
   return(dat)
 }
+
+#' @rdname clean_names
+#' @export
+#' @importFrom dplyr rename_all
+clean_names.tbl_graph <- function(dat, ...) {
+  if (!requireNamespace("tidygraph", quietly = TRUE)) { # nocov start
+    stop(
+      "Package 'tidygraph' needed for this function to work. Please install it.", 
+      call. = FALSE
+    )
+  } # nocov end
+  dplyr::rename_all(dat, .funs=make_clean_names, ...)
+}
diff --git a/R/make_clean_names.R b/R/make_clean_names.R
@@ -1,81 +1,152 @@
 #' @title Cleans a vector of text, typically containing the names of an object.
 #'
 #' @description
-#' Resulting strings are unique and consist only of the \code{_} character, numbers, and letters.
-#' Capitalization preferences can be specified using the \code{case} parameter.
+#' Resulting strings are unique and consist only of the \code{_} character,
+#' numbers, and letters. By default, the resulting strings will only consist of
+#' ASCII characters, but non-ASCII (e.g. Unicode) may be allowed by setting
+#' `ascii=FALSE`.  Capitalization preferences can be specified using the
+#' \code{case} parameter.
 #'
 #' For use on the names of a data.frame, e.g., in a \code{`\%>\%`} pipeline,
 #' call the convenience function \code{\link[janitor]{clean_names}}.
 #' 
-#' Accented characters are transliterated to ASCII.  For example, an "o" 
-#' with a German umlaut over it becomes "o", and the Spanish character "enye" becomes "n".
+#' When `ascii=TRUE` (the default), accented characters are transliterated to
+#' ASCII.  For example, an "o" with a German umlaut over it becomes "o", and the
+#' Spanish character "enye" becomes "n".
+#' 
+#' The order of operations is: apply the `replace` substitutions, (optionally)
+#' convert to ASCII, remove initial spaces and punctuation, apply
+#' `base::make.names()` (if `use_make_names = TRUE`), apply `snakecase::to_any_case()`, and add numeric suffixes to duplicates.
+#' 
+#' `case = "old_janitor"` is a legacy compatibility option that preserves the
+#' behavior of `clean_names()` prior to the addition of the "case" argument
+#' (janitor versions <= 0.3.1).  It is provided as a quick fix for old
+#' scripts broken by the changes to `clean_names()` in janitor v1.0, and it
+#' should not be used for new code.
 #'
 #' @param string A character vector of names to clean.
-#'
-#' @inheritParams clean_names
+#' @param case The desired target case (default is \code{"snake"}), which is
+#'   passed to `snakecase::to_any_case()`, with the exception of "old_janitor"
+#'   (see details).
+#' @param replace A named character vector where the name is replaced by the
+#'   value.
+#' @param ascii Convert the names to ASCII (\code{TRUE}, default) or not
+#'   (\code{FALSE}).
+#' @param use_make_names Should `make.names()` be applied to ensure that the
+#'   output is usable as a name without quoting?  (Avoiding `make.names()`
+#'   ensures that the output is locale-independent but quoting may be required.)
+#' @inheritParams snakecase::to_any_case
+#' @inheritDotParams snakecase::to_any_case
 #'
 #' @return Returns the "cleaned" character vector.
 #' @export
-#' @seealso \code{\link[snakecase]{to_any_case}}
+#' @seealso \code{\link[snakecase]{to_any_case}()}
 #' @examples
 #' 
 #' # cleaning the names of a vector:
 #' x <- structure(1:3, names = c("name with space", "TwoWords", "total $ (2009)"))
 #' x
 #' names(x) <- make_clean_names(names(x))
 #' x # now has cleaned names
-
+#'
 #' # if you prefer camelCase variable names:
 #' make_clean_names(names(x), "small_camel")
 #'
 #' # similar to janitor::clean_names(poorly_named_df):
 #' # not run:
 #' # make_clean_names(names(poorly_named_df))
-#' 
-make_clean_names <- function(string, case = c(
-  "snake", "lower_camel", "upper_camel", "screaming_snake",
-  "lower_upper", "upper_lower", "all_caps", "small_camel",
-  "big_camel", "old_janitor", "parsed", "mixed", "none"
-)) {
+#'
+#' @importFrom stringi stri_trans_general
+#' @importFrom stringr str_replace str_replace_all
+#' @importFrom snakecase to_any_case
+make_clean_names <- function(string,
+                             case = "snake",
+                             replace=
+                               c(
+                                 "'"="",
+                                 "\""="",
+                                 "%"="_percent_",
+                                 "#"="_number_"
+                               ),
+                             ascii=TRUE,
+                             use_make_names=TRUE,
+                             # default arguments for snakecase::to_any_case
+                             sep_in = "\\.",
+                             transliterations = "Latin-ASCII",
+                             parsing_option = 1,
+                             numerals = "asis",
+                             ...) {
 
-  # old behavior, to provide easy fix for people whose code breaks with the snakecase integration
-  case <- match.arg(case)
+  # Handling "old_janitor" case for backward compatibility
   if (case == "old_janitor") {
     return(old_make_clean_names(string))
   }
-
-  ### new behaviour with snakecase integration
-  # Takes a data.frame, returns the same data frame with cleaned names
-  old_names <- string
-  new_names <- old_names %>%
-    gsub("'", "", .) %>% # remove single quotation marks
-    gsub("\"", "", .) %>% # remove double quotation marks
-    gsub("%", ".percent_", .) %>% # starting with "." as a workaround, to make
-    # ".percent" a valid name. The "." will be replaced in the call to to_any_case
-    # via the preprocess argument anyway.
-    gsub("#", ".number_", .) %>%
-    gsub("^[[:space:][:punct:]]+", "", .) %>% # remove leading spaces & punctuation
-    make.names(.) %>%
-    # Handle dots, multiple underscores, case conversion, string transliteration
-    # Parsing option 4 removes underscores around numbers, #153
-    snakecase::to_any_case(.,
-      case = case, sep_in = "\\.",
-      transliterations = c("Latin-ASCII"), parsing_option = 1,
-      numerals = "asis"
+
+  replaced_names <-
+    stringr::str_replace_all(
+      str=string,
+      pattern=replace
+    )
+  transliterated_names <-
+    if (ascii) {
+      stringi::stri_trans_general(
+        replaced_names,
+        id="Greek-Latin;Latin-ASCII;Accents-Any;Any-ASCII"
+      )
+    } else {
+      replaced_names
+    }
+  # Remove starting spaces and punctuation
+  good_start <-
+    stringr::str_replace(
+      str=transliterated_names,
+      # Description of this regexp:
+      # \A: beginning of the string (rather than beginning of the line as ^ would indicate)
+      # \h: any horizontal whitespace character (spaces, tabs, and anything else that is a Unicode whitespace)
+      # \s: non-unicode whitespace matching (it may overlap with \h)
+      # \p{}: indicates a unicode class of characters, so these will also match punctuation, symbols, separators, and "other" characters
+      # * means all of the above zero or more times (not + so that the capturing part of the regexp works)
+      # (.*)$: captures everything else in the string for the replacement
+      pattern="\\A[\\h\\s\\p{Punctuation}\\p{Symbol}\\p{Separator}\\p{Other}]*(.*)$",
+      replacement="\\1"
+    )
+  # make.names() is dependent on the locale and therefore will return different
+  # system-dependent values (e.g. as in issue #268 with Japanese characters).
+  made_names <-
+    if (use_make_names) {
+      make.names(good_start)
+    } else {
+      good_start
+    }
+
+  cased_names <-
+    snakecase::to_any_case(
+      made_names,
+      case = case,
+      sep_in = sep_in,
+      transliterations = transliterations,
+      parsing_option = parsing_option,
+      numerals = numerals,
+      ...
     )
 
-  # Handle duplicated names - they mess up dplyr pipelines
-  # This appends the column number to repeated instances of duplicate variable names
-  dupe_count <- vapply(seq_along(new_names), function(i) {
-    sum(new_names[i] == new_names[1:i])
-  }, integer(1))
+  # Handle duplicated names - they mess up dplyr pipelines.  This appends the
+  # occurrence count (_2, _3, ...) to repeated instances of a duplicated name.
+  dupe_count <-
+    vapply(
+      seq_along(cased_names), function(i) {
+        sum(cased_names[i] == cased_names[1:i])
+      },
+      1L
+    )
 
-  new_names[dupe_count > 1] <- paste(
-    new_names[dupe_count > 1],
-    dupe_count[dupe_count > 1],
-    sep = "_"
-  )
-  new_names
+  cased_names[dupe_count > 1] <-
+    paste(
+      cased_names[dupe_count > 1],
+      dupe_count[dupe_count > 1],
+      sep = "_"
+    )
+  cased_names
 }
 
 # copy of clean_names from janitor v0.3 on CRAN, to preserve old behavior