diff --git a/DESCRIPTION b/DESCRIPTION index dcfd71f4..0e374e14 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,6 +27,8 @@ Imports: magrittr, purrr, rlang, + stringi, + stringr, snakecase (>= 0.9.2), tidyselect (>= 1.0.0), tidyr (>= 0.7.0) @@ -38,6 +40,7 @@ Suggests: knitr, rmarkdown, sf, - tibble + tibble, + tidygraph VignetteBuilder: knitr Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index 5d9f3509..3911a74e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,8 +2,10 @@ S3method(chisq.test,default) S3method(chisq.test,tabyl) +S3method(clean_names,data.frame) S3method(clean_names,default) S3method(clean_names,sf) +S3method(clean_names,tbl_graph) S3method(crosstab,data.frame) S3method(crosstab,default) S3method(describe_class,default) @@ -49,11 +51,16 @@ export(tabyl) export(top_levels) export(untabyl) export(use_first_valid_of) +importFrom(dplyr,rename_all) importFrom(lubridate,ymd) importFrom(lubridate,ymd_hms) importFrom(magrittr,"%>%") importFrom(rlang,dots_n) importFrom(rlang,expr) importFrom(rlang,syms) +importFrom(snakecase,to_any_case) importFrom(stats,na.omit) +importFrom(stringi,stri_trans_general) +importFrom(stringr,str_replace) +importFrom(stringr,str_replace_all) importFrom(tidyselect,eval_select) diff --git a/NEWS.md b/NEWS.md index 538b240d..020d1d84 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,15 @@ # janitor 1.2.1.9000 (unreleased), will be v 1.3.0 +## Breaking Changes + +* `clean_names()` and `make_clean_names()` now work significantly harder to be locale-independent, and translation to ASCII is simpler (i.e. in many cases, Unicode is removed and the Greek delta character becomes a "d"). You may also now control how substitutions occur and add your own substitutions (like "%" becoming "percent"). These changes alter the new names provided by these functions in some cases. (Fix #331, thanks to @billdenney) + ## Major features +* `clean_names()` and `make_clean_names()` have a more generic interface where all arguments from `make_clean_names()` are accessible from `clean_names()` (Fix #339, thanks to @ari-nz and @billdenney). + +* `make_clean_names()` now allows the user to specify parts of names to be replaced (Fix #316, thanks to @woodwards for reporting and @woodwards and @billdenney for implementing) + * The variables considered by the function `get_dupes()` can be specified using the select helper functions from `tidyselect`. This includes `-column_name` to omit a variable as well as the matching functions `starts_with()`, `ends_with()`, `contains()`, and `matches()`. See `?tidyselect::select_helpers` for more (#326, thanks to **@jzadra** for suggesting and implementing). * The new function `signif_half_up()` rounds a numeric vector to the specified number of significant digits with halves rounded up (#314, thanks to **@khueyama** for suggesting and implementing). @@ -14,6 +22,8 @@ * `row_to_names()` will now work on matrix input (#320, thanks to **@billdenney** for suggesting and implementing +* `clean_names()` can now be called on *tbl_graph* objects from the `tidygraph` package. (#252, thanks to @gvdr for bringing up the issue and thanks to @Tazinho for proposing solution). + ## Bug fixes * `adorn_ns()` doesn't append anything to character columns when called on a data.frame resulting from a call to `adorn_percentages()`. (#195). diff --git a/R/clean_names.R b/R/clean_names.R index 7669ae23..51beaa45 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -1,30 +1,27 @@ -#' @title Cleans names of a data.frame. 
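The NEWS entries above describe the reworked substitution behavior at a high level; as a concrete sketch, the calls below show the default and user-supplied substitutions (the outputs are the ones asserted in the updated tests later in this diff; the input strings themselves are invented for illustration):

library(janitor)
make_clean_names("average # of days")                # "average_number_of_days"  (default "#" -> "number")
make_clean_names("%", replace = c("%" = "foo"))      # "foo"      (default "%" -> "percent" overridden)
make_clean_names("a/b", replace = c("/" = "_per_"))  # "a_per_b"  (user-supplied substitution added)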
+#' @title Cleans names of an object (usually a data.frame). #' #' @description #' Resulting names are unique and consist only of the \code{_} character, numbers, and letters. #' Capitalization preferences can be specified using the \code{case} parameter. #' -#' Accented characters are -#' transliterated to ASCII. For example, an "o" with a German umlaut over it becomes "o", and the Spanish character "enye" becomes "n". +#' Accented characters are transliterated to ASCII. For example, an "o" with a +#' German umlaut over it becomes "o", and the Spanish character "enye" becomes +#' "n". #' -#' This function takes and returns a data.frame, for ease of piping with \code{`\%>\%`}. -#' For the underlying function that works on a character vector of names, -#' see \code{\link[janitor]{make_clean_names}}. +#' This function takes and returns a data.frame, for ease of piping with +#' \code{`\%>\%`}. For the underlying function that works on a character vector +#' of names, see \code{\link[janitor]{make_clean_names}}. #' #' @param dat the input data.frame. -#' @param case The desired target case (default is \code{"snake"}), indicated by these possible values: -#' \itemize{ -#' \item{\code{"snake"} produces snake_case} -#' \item{\code{"lower_camel"} or \code{"small_camel"} produces lowerCamel} -#' \item{\code{"upper_camel"} or \code{"big_camel"} produces UpperCamel} -#' \item{\code{"screaming_snake"} or \code{"all_caps"} produces ALL_CAPS} -#' \item{\code{"lower_upper"} produces lowerUPPER} -#' \item{\code{"upper_lower"} produces UPPERlower} -#' \item{\code{old_janitor}: legacy compatibility option to preserve behavior of \code{clean_names} prior to addition of the "case" argument(janitor versions <= 0.3.1 )}. Provided as a quick fix for old scripts broken by the changes to \code{clean_names} in janitor v1.0. -#' \item{\code{"parsed"}, \code{"mixed"}, \code{"none"}: less-common cases offered by \code{snakecase::to_any_case}. See \code{\link[snakecase]{to_any_case}} for details.} -#' } -#' +#' @inheritDotParams make_clean_names -string #' @return Returns the data.frame with clean names. +#' +#' @details \code{clean_names()} is intended to be used on \code{data.frames} +#' and \code{data.frame} like objects. For this reason there are methods to +#' support using \code{clean_names()} on \code{sf} and \code{tbl_graph} (from +#' \code{tidygraph}) objects. For cleaning named lists and vectors, consider +#' using \code{make_clean_names()}. +#' #' @export #' @examples #' # not run: @@ -39,48 +36,55 @@ #' # not run: #' # library(readxl) #' # read_excel("messy_excel_file.xlsx") %>% clean_names() - -# create new clean_names method -clean_names <- function(dat, case) { +clean_names <- function(dat, ...) { UseMethod("clean_names") } - #' @rdname clean_names #' @export +clean_names.data.frame <- function(dat, ...) { + stats::setNames(dat, make_clean_names(names(dat), ...)) +} - -# create a default method, Will only dispatch on a dataframe -clean_names.default <- function(dat, case = c( - "snake", "lower_camel", "upper_camel", "screaming_snake", - "lower_upper", "upper_lower", "all_caps", "small_camel", - "big_camel", "old_janitor", "parsed", "mixed", "none" -)) { - if(!is.data.frame(dat)){ - stop( "clean_names() must be called on a data.frame. Consider janitor::make_clean_names() for other cases of manipulating vectors of names.") - } - stats::setNames(dat, make_clean_names(names(dat), case = case)) +#' @rdname clean_names +#' @export +clean_names.default <- function(dat, ...) 
{
+  stop(
+    "No `clean_names()` method exists for the class ", paste(class(dat), collapse=", "),
+    "\nConsider janitor::make_clean_names() for other cases of manipulating vectors of names."
+  )
 }
+#' @rdname clean_names
 #' @export
-# create method for sf object
-clean_names.sf <- function(dat, case = c(
-  "snake", "lower_camel", "upper_camel", "screaming_snake",
-  "lower_upper", "upper_lower", "all_caps", "small_camel",
-  "big_camel", "old_janitor", "parsed", "mixed", "none"
-)) {
+clean_names.sf <- function(dat, ...) {
   if (!requireNamespace("sf", quietly = TRUE)) { # nocov start
-    stop("Package \"sf\" needed for this function to work. Please install it.",
-         call. = FALSE)
+    stop(
+      "Package 'sf' needed for this function to work. Please install it.",
+      call. = FALSE
+    )
   } # nocov end
   # get old names
   sf_names <- names(dat)
   # identify ending column index to clean
   n_cols <- length(dat)-1
   # clean all but last column
-  sf_cleaned <- make_clean_names(sf_names[1:n_cols], case)
+  sf_cleaned <- make_clean_names(sf_names[1:n_cols], ...)
   # rename original df
   names(dat)[1:n_cols] <- sf_cleaned
   return(dat)
 }
+
+#' @rdname clean_names
+#' @export
+#' @importFrom dplyr rename_all
+clean_names.tbl_graph <- function(dat, ...) {
+  if (!requireNamespace("tidygraph", quietly = TRUE)) { # nocov start
+    stop(
+      "Package 'tidygraph' needed for this function to work. Please install it.",
+      call. = FALSE
+    )
+  } # nocov end
+  dplyr::rename_all(dat, .funs=make_clean_names, ...)
+}
diff --git a/R/make_clean_names.R b/R/make_clean_names.R
index b42ba933..c16f36b8 100644
--- a/R/make_clean_names.R
+++ b/R/make_clean_names.R
@@ -1,22 +1,46 @@
 #' @title Cleans a vector of text, typically containing the names of an object.
 #'
 #' @description
-#' Resulting strings are unique and consist only of the \code{_} character, numbers, and letters.
-#' Capitalization preferences can be specified using the \code{case} parameter.
+#' Resulting strings are unique and consist only of the \code{_} character,
+#' numbers, and letters. By default, the resulting strings will only consist of
+#' ASCII characters, but non-ASCII (e.g. Unicode) may be allowed by setting
+#' `ascii=FALSE`. Capitalization preferences can be specified using the
+#' \code{case} parameter.
 #'
 #' For use on the names of a data.frame, e.g., in a \code{`\%>\%`} pipeline,
 #' call the convenience function \code{\link[janitor]{clean_names}}.
 #'
-#' Accented characters are transliterated to ASCII. For example, an "o"
-#' with a German umlaut over it becomes "o", and the Spanish character "enye" becomes "n".
+#' When `ascii=TRUE` (the default), accented characters are transliterated to
+#' ASCII. For example, an "o" with a German umlaut over it becomes "o", and the
+#' Spanish character "enye" becomes "n".
+#'
+#' The order of operations is: applying `replace`, (optionally) converting to
+#' ASCII, removing initial spaces and punctuation, applying `base::make.names()`,
+#' applying `snakecase::to_any_case()`, and adding numeric suffixes to duplicates.
+#'
+#' `case = "old_janitor"` is a legacy compatibility option that preserves the
+#' behavior of `clean_names()` prior to the addition of the "case" argument
+#' (janitor versions <= 0.3.1). It is provided as a quick fix for old scripts
+#' broken by the changes to `clean_names()` in janitor v1.0 and should not be used for new code.
 #'
 #' @param string A character vector of names to clean.
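# A quick illustration of the pipeline described above, for orientation only;
# the expected outputs in the comments are the ones asserted in this PR's
# test suite (the input strings are invented):
make_clean_names("Farœ")                     # "faroe": ASCII transliteration
make_clean_names("€")                        # "x": no valid name characters remain, so a placeholder is generated
make_clean_names(c("repeated", "REPEATED"))  # "repeated" "repeated_2": numeric suffix added to the duplicate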
-#' -#' @inheritParams clean_names +#' @param case The desired target case (default is \code{"snake"}) will be +#' passed to `snakecase::to_any_case()` with the exception of "old_janitor" +#' (see details). +#' @param replace A named character vector where the name is replaced by the +#' value. +#' @param ascii Convert the names to ASCII (\code{TRUE}, default) or not +#' (\code{FALSE}). +#' @param use_make_names Should `make.names()` be applied to ensure that the +#' output is usable as a name without quoting? (Avoiding `make.names()` +#' ensures that the output is locale-independent but quoting may be required.) +#' @inheritParams snakecase::to_any_case +#' @inheritDotParams snakecase::to_any_case #' #' @return Returns the "cleaned" character vector. #' @export -#' @seealso \code{\link[snakecase]{to_any_case}} +#' @seealso \code{\link[snakecase]{to_any_case}()} #' @examples #' #' # cleaning the names of a vector: @@ -24,58 +48,105 @@ #' x #' names(x) <- make_clean_names(names(x)) #' x # now has cleaned names - +#' #' # if you prefer camelCase variable names: #' make_clean_names(names(x), "small_camel") #' #' # similar to janitor::clean_names(poorly_named_df): #' # not run: #' # make_clean_names(names(poorly_named_df)) -#' -make_clean_names <- function(string, case = c( - "snake", "lower_camel", "upper_camel", "screaming_snake", - "lower_upper", "upper_lower", "all_caps", "small_camel", - "big_camel", "old_janitor", "parsed", "mixed", "none" -)) { +#' +#' @importFrom stringi stri_trans_general +#' @importFrom stringr str_replace str_replace_all +#' @importFrom snakecase to_any_case +make_clean_names <- function(string, + case = "snake", + replace= + c( + "'"="", + "\""="", + "%"="_percent_", + "#"="_number_" + ), + ascii=TRUE, + use_make_names=TRUE, + # default arguments for snake_case::to_any_case + sep_in = "\\.", + transliterations = "Latin-ASCII", + parsing_option = 1, + numerals = "asis", + ...) { - # old behavior, to provide easy fix for people whose code breaks with the snakecase integration - case <- match.arg(case) + # Handling "old_janitor" case for backward compatibility if (case == "old_janitor") { return(old_make_clean_names(string)) } - - ### new behaviour with snakecase integration - # Takes a data.frame, returns the same data frame with cleaned names - old_names <- string - new_names <- old_names %>% - gsub("'", "", .) %>% # remove single quotation marks - gsub("\"", "", .) %>% # remove double quotation marks - gsub("%", ".percent_", .) %>% # starting with "." as a workaround, to make - # ".percent" a valid name. The "." will be replaced in the call to to_any_case - # via the preprocess argument anyway. - gsub("#", ".number_", .) %>% - gsub("^[[:space:][:punct:]]+", "", .) %>% # remove leading spaces & punctuation - make.names(.) 
%>% - # Handle dots, multiple underscores, case conversion, string transliteration - # Parsing option 4 removes underscores around numbers, #153 - snakecase::to_any_case(., - case = case, sep_in = "\\.", - transliterations = c("Latin-ASCII"), parsing_option = 1, - numerals = "asis" + + replaced_names <- + stringr::str_replace_all( + str=string, + pattern=replace + ) + transliterated_names <- + if (ascii) { + stringi::stri_trans_general( + replaced_names, + id="Greek-Latin;Latin-ASCII;Accents-Any;Any-ASCII" + ) + } else { + replaced_names + } + # Remove starting spaces and punctuation + good_start <- + stringr::str_replace( + str=transliterated_names, + # Description of this regexp: + # \A: beginning of the string (rather than beginning of the line as ^ would indicate) + # \h: any horizontal whitespace character (spaces, tabs, and anything else that is a Unicode whitespace) + # \s: non-unicode whitespace matching (it may overlap with \h) + # \p{}: indicates a unicode class of characters, so these will also match punctuation, symbols, separators, and "other" characters + # * means all of the above zero or more times (not + so that the capturing part of the regexp works) + # (.*)$: captures everything else in the string for the replacement + pattern="\\A[\\h\\s\\p{Punctuation}\\p{Symbol}\\p{Separator}\\p{Other}]*(.*)$", + replacement="\\1" + ) + # make.names() is dependent on the locale and therefore will return different + # system-dependent values (e.g. as in issue #268 with Japanese characters). + made_names <- + if (use_make_names) { + make.names(good_start) + } else { + good_start + } + + cased_names <- + snakecase::to_any_case( + made_names, + case = case, + sep_in = sep_in, + transliterations = transliterations, + parsing_option = parsing_option, + numerals = numerals, + ... ) - # Handle duplicated names - they mess up dplyr pipelines - # This appends the column number to repeated instances of duplicate variable names - dupe_count <- vapply(seq_along(new_names), function(i) { - sum(new_names[i] == new_names[1:i]) - }, integer(1)) + # Handle duplicated names - they mess up dplyr pipelines. This appends the + # column number to repeated instances of duplicate variable names. + dupe_count <- + vapply( + seq_along(cased_names), function(i) { + sum(cased_names[i] == cased_names[1:i]) + }, + 1L + ) - new_names[dupe_count > 1] <- paste( - new_names[dupe_count > 1], - dupe_count[dupe_count > 1], - sep = "_" - ) - new_names + cased_names[dupe_count > 1] <- + paste( + cased_names[dupe_count > 1], + dupe_count[dupe_count > 1], + sep = "_" + ) + cased_names } # copy of clean_names from janitor v0.3 on CRAN, to preserve old behavior diff --git a/man/clean_names.Rd b/man/clean_names.Rd index 65103063..a9f8c40e 100644 --- a/man/clean_names.Rd +++ b/man/clean_names.Rd @@ -2,32 +2,63 @@ % Please edit documentation in R/clean_names.R \name{clean_names} \alias{clean_names} +\alias{clean_names.data.frame} \alias{clean_names.default} -\title{Cleans names of a data.frame.} +\alias{clean_names.sf} +\alias{clean_names.tbl_graph} +\title{Cleans names of an object (usually a data.frame).} \usage{ -clean_names(dat, case) - -\method{clean_names}{default}( - dat, - case = c("snake", "lower_camel", "upper_camel", "screaming_snake", "lower_upper", - "upper_lower", "all_caps", "small_camel", "big_camel", "old_janitor", "parsed", - "mixed", "none") -) +clean_names(dat, ...) + +\method{clean_names}{data.frame}(dat, ...) + +\method{clean_names}{default}(dat, ...) + +\method{clean_names}{sf}(dat, ...) 
+ +\method{clean_names}{tbl_graph}(dat, ...) } \arguments{ \item{dat}{the input data.frame.} -\item{case}{The desired target case (default is \code{"snake"}), indicated by these possible values: +\item{...}{ + Arguments passed on to \code{\link[=make_clean_names]{make_clean_names}} + \describe{ + \item{\code{case}}{The desired target case (default is \code{"snake"}) will be +passed to `snakecase::to_any_case()` with the exception of "old_janitor" +(see details).} + \item{\code{replace}}{A named character vector where the name is replaced by the +value.} + \item{\code{ascii}}{Convert the names to ASCII (\code{TRUE}, default) or not +(\code{FALSE}).} + \item{\code{use_make_names}}{Should `make.names()` be applied to ensure that the +output is usable as a name without quoting? (Avoiding `make.names()` +ensures that the output is locale-independent but quoting may be required.)} + \item{\code{sep_in}}{(short for separator input) if character, is interpreted as a +regular expression (wrapped internally into \code{stringr::regex()}). +The default value is a regular expression that matches any sequence of +non-alphanumeric values. All matches will be replaced by underscores +(additionally to \code{"_"} and \code{" "}, for which this is always true, even +if \code{NULL} is supplied). These underscores are used internally to split +the strings into substrings and specify the word boundaries.} + \item{\code{transliterations}}{A character vector (if not \code{NULL}). The entries of this argument +need to be elements of \code{stringi::stri_trans_list()} (like "Latin-ASCII", which is often useful) or names of lookup tables (currently only "german" is supported). In the order of the entries the letters of the input + string will be transliterated via \code{stringi::stri_trans_general()} or replaced via the + matches of the lookup table. When named character elements are supplied as part of `transliterations`, anything that matches the names is replaced by the corresponding value. +You should use this feature with care in case of \code{case = "parsed"}, \code{case = "internal_parsing"} and +\code{case = "none"}, since for upper case letters, which have transliterations/replacements + of length 2, the second letter will be transliterated to lowercase, for example Oe, Ae, Ss, which + might not always be what is intended. In this case you can make usage of the option to supply named elements and specify the transliterations yourself.} + \item{\code{parsing_option}}{An integer that will determine the parsing_option. \itemize{ - \item{\code{"snake"} produces snake_case} - \item{\code{"lower_camel"} or \code{"small_camel"} produces lowerCamel} - \item{\code{"upper_camel"} or \code{"big_camel"} produces UpperCamel} - \item{\code{"screaming_snake"} or \code{"all_caps"} produces ALL_CAPS} - \item{\code{"lower_upper"} produces lowerUPPER} - \item{\code{"upper_lower"} produces UPPERlower} - \item{\code{old_janitor}: legacy compatibility option to preserve behavior of \code{clean_names} prior to addition of the "case" argument(janitor versions <= 0.3.1 )}. Provided as a quick fix for old scripts broken by the changes to \code{clean_names} in janitor v1.0. - \item{\code{"parsed"}, \code{"mixed"}, \code{"none"}: less-common cases offered by \code{snakecase::to_any_case}. See \code{\link[snakecase]{to_any_case}} for details.} + \item{1: \code{"RRRStudio" -> "RRR_Studio"}} + \item{2: \code{"RRRStudio" -> "RRRS_tudio"}} + \item{3: \code{"RRRStudio" -> "RRRSStudio"}. 
This will become for example \code{"Rrrstudio"} when we convert to lower camel case.} + \item{-1, -2, -3: These \code{parsing_options}'s will suppress the conversion after non-alphanumeric values.} + \item{0: no parsing} }} + \item{\code{numerals}}{A character specifying the alignment of numerals (\code{"middle"}, \code{left}, \code{right}, \code{asis} or \code{tight}). I.e. \code{numerals = "left"} ensures that no output separator is in front of a digit.} + }} } \value{ Returns the data.frame with clean names. @@ -36,12 +67,20 @@ Returns the data.frame with clean names. Resulting names are unique and consist only of the \code{_} character, numbers, and letters. Capitalization preferences can be specified using the \code{case} parameter. -Accented characters are -transliterated to ASCII. For example, an "o" with a German umlaut over it becomes "o", and the Spanish character "enye" becomes "n". +Accented characters are transliterated to ASCII. For example, an "o" with a +German umlaut over it becomes "o", and the Spanish character "enye" becomes +"n". -This function takes and returns a data.frame, for ease of piping with \code{`\%>\%`}. -For the underlying function that works on a character vector of names, -see \code{\link[janitor]{make_clean_names}}. +This function takes and returns a data.frame, for ease of piping with +\code{`\%>\%`}. For the underlying function that works on a character vector +of names, see \code{\link[janitor]{make_clean_names}}. +} +\details{ +\code{clean_names()} is intended to be used on \code{data.frames} + and \code{data.frame} like objects. For this reason there are methods to + support using \code{clean_names()} on \code{sf} and \code{tbl_graph} (from + \code{tidygraph}) objects. For cleaning named lists and vectors, consider + using \code{make_clean_names()}. } \examples{ # not run: diff --git a/man/make_clean_names.Rd b/man/make_clean_names.Rd index afe13297..1cf59eac 100644 --- a/man/make_clean_names.Rd +++ b/man/make_clean_names.Rd @@ -6,38 +6,106 @@ \usage{ make_clean_names( string, - case = c("snake", "lower_camel", "upper_camel", "screaming_snake", "lower_upper", - "upper_lower", "all_caps", "small_camel", "big_camel", "old_janitor", "parsed", - "mixed", "none") + case = "snake", + replace = c(`'` = "", `"` = "", `\%` = "_percent_", `#` = "_number_"), + ascii = TRUE, + use_make_names = TRUE, + sep_in = "\\\\.", + transliterations = "Latin-ASCII", + parsing_option = 1, + numerals = "asis", + ... ) } \arguments{ \item{string}{A character vector of names to clean.} -\item{case}{The desired target case (default is \code{"snake"}), indicated by these possible values: +\item{case}{The desired target case (default is \code{"snake"}) will be +passed to `snakecase::to_any_case()` with the exception of "old_janitor" +(see details).} + +\item{replace}{A named character vector where the name is replaced by the +value.} + +\item{ascii}{Convert the names to ASCII (\code{TRUE}, default) or not +(\code{FALSE}).} + +\item{use_make_names}{Should `make.names()` be applied to ensure that the +output is usable as a name without quoting? (Avoiding `make.names()` +ensures that the output is locale-independent but quoting may be required.)} + +\item{sep_in}{(short for separator input) if character, is interpreted as a +regular expression (wrapped internally into \code{stringr::regex()}). +The default value is a regular expression that matches any sequence of +non-alphanumeric values. 
All matches will be replaced by underscores +(additionally to \code{"_"} and \code{" "}, for which this is always true, even +if \code{NULL} is supplied). These underscores are used internally to split +the strings into substrings and specify the word boundaries.} + +\item{transliterations}{A character vector (if not \code{NULL}). The entries of this argument +need to be elements of \code{stringi::stri_trans_list()} (like "Latin-ASCII", which is often useful) or names of lookup tables (currently only "german" is supported). In the order of the entries the letters of the input + string will be transliterated via \code{stringi::stri_trans_general()} or replaced via the + matches of the lookup table. When named character elements are supplied as part of `transliterations`, anything that matches the names is replaced by the corresponding value. +You should use this feature with care in case of \code{case = "parsed"}, \code{case = "internal_parsing"} and +\code{case = "none"}, since for upper case letters, which have transliterations/replacements + of length 2, the second letter will be transliterated to lowercase, for example Oe, Ae, Ss, which + might not always be what is intended. In this case you can make usage of the option to supply named elements and specify the transliterations yourself.} + +\item{parsing_option}{An integer that will determine the parsing_option. \itemize{ - \item{\code{"snake"} produces snake_case} - \item{\code{"lower_camel"} or \code{"small_camel"} produces lowerCamel} - \item{\code{"upper_camel"} or \code{"big_camel"} produces UpperCamel} - \item{\code{"screaming_snake"} or \code{"all_caps"} produces ALL_CAPS} - \item{\code{"lower_upper"} produces lowerUPPER} - \item{\code{"upper_lower"} produces UPPERlower} - \item{\code{old_janitor}: legacy compatibility option to preserve behavior of \code{clean_names} prior to addition of the "case" argument(janitor versions <= 0.3.1 )}. Provided as a quick fix for old scripts broken by the changes to \code{clean_names} in janitor v1.0. - \item{\code{"parsed"}, \code{"mixed"}, \code{"none"}: less-common cases offered by \code{snakecase::to_any_case}. See \code{\link[snakecase]{to_any_case}} for details.} + \item{1: \code{"RRRStudio" -> "RRR_Studio"}} + \item{2: \code{"RRRStudio" -> "RRRS_tudio"}} + \item{3: \code{"RRRStudio" -> "RRRSStudio"}. This will become for example \code{"Rrrstudio"} when we convert to lower camel case.} + \item{-1, -2, -3: These \code{parsing_options}'s will suppress the conversion after non-alphanumeric values.} + \item{0: no parsing} }} + +\item{numerals}{A character specifying the alignment of numerals (\code{"middle"}, \code{left}, \code{right}, \code{asis} or \code{tight}). I.e. \code{numerals = "left"} ensures that no output separator is in front of a digit.} + +\item{...}{ + Arguments passed on to \code{\link[snakecase:to_any_case]{snakecase::to_any_case}} + \describe{ + \item{\code{abbreviations}}{character. (Case insensitive) matched abbreviations are surrounded by underscores. In this way, they can get recognized by the parser. This is useful when e.g. \code{parsing_option} 1 is needed for the use case, but some abbreviations but some substrings would require \code{parsing_option} 2. Furthermore, this argument also specifies the formatting of abbreviations in the output for the cases title, mixed, lower and upper camel. E.g. for upper camel the first letter is always in upper case, but when the abbreviation is supplied in upper case, this will also be visible in the output. 
+ +Use this feature with care: One letter abbreviations and abbreviations next to each other are hard to read and also not easy to parse for further processing.} + \item{\code{sep_out}}{(short for separator output) String that will be used as separator. The defaults are \code{"_"} +and \code{""}, regarding the specified \code{case}. When \code{length(sep_out) > 1}, the last element of \code{sep_out} gets recycled and separators are incorporated per string according to their order.} + \item{\code{unique_sep}}{A string. If not \code{NULL}, then duplicated names will get +a suffix integer +in the order of their appearance. The suffix is separated by the supplied string + to this argument.} + \item{\code{empty_fill}}{A string. If it is supplied, then each entry that matches "" will be replaced +by the supplied string to this argument.} + \item{\code{prefix}}{prefix (string).} + \item{\code{postfix}}{postfix (string).} + }} } \value{ Returns the "cleaned" character vector. } \description{ -Resulting strings are unique and consist only of the \code{_} character, numbers, and letters. -Capitalization preferences can be specified using the \code{case} parameter. +Resulting strings are unique and consist only of the \code{_} character, +numbers, and letters. By default, the resulting strings will only consist of +ASCII characters, but non-ASCII (e.g. Unicode) may be allowed by setting +`ascii=FALSE`. Capitalization preferences can be specified using the +\code{case} parameter. For use on the names of a data.frame, e.g., in a \code{`\%>\%`} pipeline, call the convenience function \code{\link[janitor]{clean_names}}. -Accented characters are transliterated to ASCII. For example, an "o" -with a German umlaut over it becomes "o", and the Spanish character "enye" becomes "n". +When `ascii=TRUE` (the default), accented characters are transliterated to +ASCII. For example, an "o" with a German umlaut over it becomes "o", and the +Spanish character "enye" becomes "n". + +The order of operations is: `replace`, (optional) ASCII conversion, removing +initial spaces and punctuation, apply `base::make.names()`, apply +`snakecase::to_any_case()`, and add numeric suffixes to duplicates. + +If `case = "old_janitor"` legacy compatibility option to preserve behavior of +`clean_names()` prior to addition of the "case" argument(janitor versions <= +0.3.1). The `"old_janitor"` option is provided as a quick fix for old +scripts broken by the changes to `clean_names()` in janitor v1.0, and it +should not be used for new code. 
} \examples{ @@ -46,6 +114,7 @@ x <- structure(1:3, names = c("name with space", "TwoWords", "total $ (2009)")) x names(x) <- make_clean_names(names(x)) x # now has cleaned names + # if you prefer camelCase variable names: make_clean_names(names(x), "small_camel") @@ -55,5 +124,5 @@ make_clean_names(names(x), "small_camel") } \seealso{ -\code{\link[snakecase]{to_any_case}} +\code{\link[snakecase]{to_any_case}()} } diff --git a/tests/testthat/test-clean-names.R b/tests/testthat/test-clean-names.R index dd3bba2f..33a7b7a4 100644 --- a/tests/testthat/test-clean-names.R +++ b/tests/testthat/test-clean-names.R @@ -1,42 +1,175 @@ -# Tests for data.frame renaming function - -library(janitor) context("clean_names") -test_df <- data.frame(matrix(ncol = 20) %>% as.data.frame()) -names(test_df) <- c( - "sp ace", "repeated", "a**^@", "%", "*", "!", - "d(!)9", "REPEATED", "can\"'t", "hi_`there`", " leading spaces", - "€", "ação", "Farœ", "a b c d e f", "testCamelCase", "!leadingpunct", - "average # of days", "jan2009sales", "jan 2009 sales" -) +# Tests for make_clean_names #### -clean <- clean_names(test_df, "snake") +test_that("All scenarios for make_clean_names", { + expect_equal( + make_clean_names("sp ace"), + "sp_ace" + ) + expect_equal( + make_clean_names(c("repeated", "repeated", "REPEATED")), + paste0("repeated", c("", "_2", "_3")) + ) + expect_equal( + make_clean_names("a**^@"), + "a" + ) + expect_equal( + make_clean_names("%"), + "percent" + ) + expect_equal( + make_clean_names("%", replace=c("%"="foo")), + "foo", + info="Verify that `replace` works." + ) + expect_equal( + make_clean_names("*"), + "x" + ) + expect_equal( + make_clean_names("!"), + "x" + ) + expect_equal( + make_clean_names(c("*", "!")), + c("x", "x_2") + ) + expect_equal( + make_clean_names("d(!)9"), + "d_9" + ) + expect_equal( + make_clean_names("can\"'t"), + "cant" + ) + expect_equal( + make_clean_names("hi_`there`"), + "hi_there" + ) + expect_equal( + make_clean_names(" leading spaces"), + "leading_spaces" + ) + expect_equal( + make_clean_names("€"), + "x" + ) + expect_equal( + make_clean_names("ação", ascii=FALSE), + "acao" + ) + expect_equal( + make_clean_names("ação"), + "acao" + ) + expect_equal( + make_clean_names("Farœ"), + "faroe" + ) + expect_equal( + make_clean_names("a b c d e f"), + "a_b_c_d_e_f" + ) + expect_equal( + make_clean_names("testCamelCase"), + "test_camel_case" + ) + expect_equal( + make_clean_names("!leadingpunct"), + "leadingpunct" + ) + expect_equal( + make_clean_names("average # of days"), + "average_number_of_days" + ) + expect_equal( + make_clean_names("jan2009sales"), + "jan2009sales" + ) + expect_equal( + make_clean_names("jan 2009 sales"), + "jan_2009_sales" + ) + expect_equal( + make_clean_names("not_first_unicode_µ"), + "not_first_unicode_m" + ) + expect_equal( + make_clean_names("µ_first_unicode"), + "m_first_unicode" + ) + expect_equal( + make_clean_names("a/b"), + "a_b", + info="No custom replacement" + ) + expect_equal( + make_clean_names("a/b", replace=c("/"="_per_")), + "a_per_b", + info="Custom replacement" + ) +}) -test_that("Names are cleaned appropriately", { - expect_equal(names(clean)[1], "sp_ace") # spaces - expect_equal(names(clean)[2], "repeated") # first instance of repeat - expect_equal(names(clean)[3], "a") # multiple special chars, trailing special chars - expect_equal(names(clean)[4], "percent") # converting % to percent - expect_equal(names(clean)[5], "x") # 100% invalid name - expect_equal(names(clean)[6], "x_2") # repeat of invalid name - expect_equal(names(clean)[7], 
"d_9") # multiple special characters - expect_equal(names(clean)[8], "repeated_2") # uppercase, 2nd instance of repeat - expect_equal(names(clean)[9], "cant") # uppercase, 2nd instance of repeat - expect_equal(names(clean)[10], "hi_there") # double-underscores to single - expect_equal(names(clean)[11], "leading_spaces") # leading spaces - expect_equal(names(clean)[12], "x_3") # euro sign, invalid - expect_equal(names(clean)[13], "acao") # accented word, transliterated to latin, - expect_equal(names(clean)[14], "faroe") # œ character was failing to convert on Windows, should work universally for stringi 1.1.6 or higher - # https://github.com/sfirke/janitor/issues/120#issuecomment-303385418 - expect_equal(names(clean)[15], "a_b_c_d_e_f") # for testing alternating cases below with e.g., case = "upper_lower" - expect_equal(names(clean)[16], "test_camel_case") # for testing alternating cases below with e.g., case = "upper_lower" - expect_equal(names(clean)[17], "leadingpunct") # for testing alternating cases below with e.g., case = "upper_lower" - expect_equal(names(clean)[18], "average_number_of_days") # for testing alternating cases below with e.g., case = "upper_lower" - expect_equal(names(clean)[19], "jan2009sales") # no separator around number-word boundary if not existing already - expect_equal(names(clean)[20], "jan_2009_sales") # yes separator around number-word boundary if it existed +test_that("locale-specific make_clean_names tests", { + orig_locale <- Sys.getlocale(category="LC_CTYPE") + Sys.setlocale(locale="C") + expect_equal( + make_clean_names("介護_看護_女"), + "x_u_4ecb_u_8b77_u_770b_u_8b77_u_5973", + info="Unicode transliteration happens with make.names()" + ) + expect_equal( + make_clean_names("介護_看護_女", use_make_names=FALSE), + "介護_看護_女", + info="Unicode transliteration does not happen without make.names()" + ) + expect_equal( + make_clean_names("μ"), + "m", + info="lower-case mu is transliterated to an 'm'" + ) + expect_equal( + make_clean_names("µ", ascii=FALSE, use_make_names=FALSE), + "µ", + info="lower-case mu is not transliterated to an 'm' and uses the " + ) + Sys.setlocale(locale=orig_locale) }) +testing_vector <- + c( + "sp ace", + "repeated", + "a**^@", + "%", + "*", + "!", + "d(!)9", + "REPEATED", + "can\"'t", + "hi_`there`", + " leading spaces", + "€", + "ação", + "Farœ", + "a b c d e f", + "testCamelCase", + "!leadingpunct", + "average # of days", + "jan2009sales", + "jan 2009 sales", + "not_first_unicode_µ", + "µ_first_unicode" + ) + +# Tests for clean_names #### +test_df <- as.data.frame(matrix(ncol = 22)) +names(test_df) <- testing_vector +clean <- clean_names(test_df, "snake", ascii=TRUE) +clean_noascii <- clean_names(test_df, "snake", ascii=FALSE) + test_that("Returns a data.frame", { expect_is(clean, "data.frame") }) @@ -46,35 +179,46 @@ test_that("Tests for cases beyond default snake", { names(clean_names(test_df, "small_camel")), c( "spAce", "repeated", "a", "percent", "x", "x_2", "d9", "repeated_2", - "cant", "hiThere", "leadingSpaces", "x_3", "acao", "faroe", "aBCDEF", "testCamelCase", "leadingpunct", "averageNumberOfDays", "jan2009Sales", "jan2009Sales_2" + "cant", "hiThere", "leadingSpaces", "x_3", "acao", "faroe", "aBCDEF", + "testCamelCase", "leadingpunct", "averageNumberOfDays", + "jan2009Sales", "jan2009Sales_2", "notFirstUnicodeM", "mFirstUnicode" ) ) expect_equal( names(clean_names(test_df, "big_camel")), c( "SpAce", "Repeated", "A", "Percent", "X", "X_2", "D9", "Repeated_2", - "Cant", "HiThere", "LeadingSpaces", "X_3", "Acao", "Faroe", "ABCDEF", 
"TestCamelCase", "Leadingpunct", "AverageNumberOfDays", "Jan2009Sales", "Jan2009Sales_2" + "Cant", "HiThere", "LeadingSpaces", "X_3", "Acao", "Faroe", "ABCDEF", "TestCamelCase", + "Leadingpunct", "AverageNumberOfDays", "Jan2009Sales", "Jan2009Sales_2", + "NotFirstUnicodeM", "MFirstUnicode" ) ) expect_equal( names(clean_names(test_df, "all_caps")), c( "SP_ACE", "REPEATED", "A", "PERCENT", "X", "X_2", "D_9", "REPEATED_2", - "CANT", "HI_THERE", "LEADING_SPACES", "X_3", "ACAO", "FAROE", "A_B_C_D_E_F", "TEST_CAMEL_CASE", "LEADINGPUNCT", "AVERAGE_NUMBER_OF_DAYS", "JAN2009SALES", "JAN_2009_SALES" + "CANT", "HI_THERE", "LEADING_SPACES", "X_3", "ACAO", "FAROE", "A_B_C_D_E_F", + "TEST_CAMEL_CASE", "LEADINGPUNCT", "AVERAGE_NUMBER_OF_DAYS", "JAN2009SALES", + "JAN_2009_SALES", + "NOT_FIRST_UNICODE_M", "M_FIRST_UNICODE" ) ) expect_equal( names(clean_names(test_df, "lower_upper")), c( "spACE", "repeated", "a", "percent", "x", "x_2", "d9", "repeated_2", - "cant", "hiTHERE", "leadingSPACES", "x_3", "acao", "faroe", "aBcDeF", "testCAMELcase", "leadingpunct", "averageNUMBERofDAYS", "jan2009SALES", "jan2009SALES_2" + "cant", "hiTHERE", "leadingSPACES", "x_3", "acao", "faroe", "aBcDeF", + "testCAMELcase", "leadingpunct", "averageNUMBERofDAYS", "jan2009SALES", + "jan2009SALES_2", "notFIRSTunicodeM", "mFIRSTunicode" ) ) expect_equal( names(clean_names(test_df, "upper_lower")), c( "SPace", "REPEATED", "A", "PERCENT", "X", "X_2", "D9", "REPEATED_2", - "CANT", "HIthere", "LEADINGspaces", "X_3", "ACAO", "FAROE", "AbCdEf", "TESTcamelCASE", "LEADINGPUNCT", "AVERAGEnumberOFdays", "JAN2009sales", "JAN2009sales_2" + "CANT", "HIthere", "LEADINGspaces", "X_3", "ACAO", "FAROE", "AbCdEf", + "TESTcamelCASE", "LEADINGPUNCT", "AVERAGEnumberOFdays", "JAN2009sales", + "JAN2009sales_2", "NOTfirstUNICODEm", "MfirstUNICODE" ) ) expect_equal( @@ -83,7 +227,7 @@ test_that("Tests for cases beyond default snake", { "sp_ace", "repeated", "a", "percent", "X", "X_2", "d_9", "REPEATED", "cant", "hi_there", "leading_spaces", "X_3", "acao", "Faroe", "a_b_c_d_e_f", "testCamelCase", "leadingpunct", "average_number_of_days", - "jan2009sales", "jan_2009_sales" + "jan2009sales", "jan_2009_sales", "not_first_unicode_m", "m_first_unicode" ) ) expect_equal( @@ -91,7 +235,8 @@ test_that("Tests for cases beyond default snake", { c( "sp_ace", "repeated", "a", "percent", "x", "x_2", "d_9", "repeated_2", "cant", "hi_there", "leading_spaces", "x_3", "ação", "farœ", - "a_b_c_d_e_f", "testcamelcase", "x_leadingpunct", "average_of_days", "jan2009sales", "jan_2009_sales" + "a_b_c_d_e_f", "testcamelcase", "x_leadingpunct", "average_of_days", + "jan2009sales", "jan_2009_sales", "not_first_unicode_µ", "µ_first_unicode" ) ) # check that alias arguments yield identical outputs @@ -101,12 +246,16 @@ test_that("Tests for cases beyond default snake", { }) test_that("errors if not called on a data.frame", { - expect_error(clean_names(1:3), "clean_names() must be called on a data.frame. 
Consider janitor::make_clean_names() for other cases of manipulating vectors of names.", fixed = TRUE) + expect_error( + clean_names(1:3), + regexp="No `clean_names()` method exists for the class integer", + fixed=TRUE + ) }) #------------------------------------------------------------------------------# -#---------------------------- Tests for sf method -----------------------------# +#---------------------------- Tests for sf method -----------------------------#### #------------------------------------------------------------------------------# context("clean_names.sf") @@ -117,11 +266,11 @@ test_that("Names are cleaned appropriately without attaching sf", { clean <- clean_names(nc, "snake") expect_equal(names(clean)[4], "cnty_id") + expect_is(clean, "sf") }) test_that("Names are cleaned appropriately", { skip_if_not_installed("sf") - library(sf) test_df <- data.frame(matrix(ncol = 22) %>% as.data.frame()) names(test_df) <- c( @@ -134,9 +283,9 @@ test_that("Names are cleaned appropriately", { test_df["long"] <- -80 test_df["lat"] <- 40 - test_df <- st_as_sf(test_df, coords = c("long", "lat")) + test_df <- sf::st_as_sf(test_df, coords = c("long", "lat")) names(test_df)[21] <- "Geometry" - st_geometry(test_df) <- "Geometry" + sf::st_geometry(test_df) <- "Geometry" clean <- clean_names(test_df, "snake") @@ -161,14 +310,10 @@ test_that("Names are cleaned appropriately", { expect_equal(names(clean)[18], "average_number_of_days") # for testing alternating cases below with e.g., case = "upper_lower" expect_equal(names(clean)[19], "jan2009sales") # no separator around number-word boundary if not existing already expect_equal(names(clean)[20], "jan_2009_sales") # yes separator around number-word boundary if it existed - - expect_is(clean, "sf", info="Returns a sf data.frame") }) - test_that("Tests for cases beyond default snake for sf objects", { - test_df <- data.frame(matrix(ncol = 22) %>% as.data.frame()) names(test_df) <- c( @@ -181,11 +326,9 @@ test_that("Tests for cases beyond default snake for sf objects", { test_df["long"] <- -80 test_df["lat"] <- 40 - test_df <- st_as_sf(test_df, coords = c("long", "lat")) + test_df <- sf::st_as_sf(test_df, coords = c("long", "lat")) names(test_df)[21] <- "geometry" - st_geometry(test_df) <- "geometry" - - + sf::st_geometry(test_df) <- "geometry" expect_equal( names(clean_names(test_df, "small_camel")), @@ -240,3 +383,51 @@ test_that("Tests for cases beyond default snake for sf objects", { ) ) }) + +#------------------------------------------------------------------------------# +#------------------------ Tests for tbl_graph method --------------------------##### +#------------------------------------------------------------------------------# + +context("clean_names.tbl_graph") + +test_that("tbl_graph/tidygraph", { + skip_if_not_installed("tidygraph") + # create test graph to test clean_names + test_graph <- + tidygraph::play_erdos_renyi(10, 0.5) %>% + # create nodes wi + tidygraph::bind_nodes(test_df) %>% + tidygraph::mutate_all(tidyr::replace_na, 1) + + # create a graph with clean names + clean_graph <- clean_names(test_graph, case = "snake") + + # get clean names + clean <- names(tibble::as_tibble(clean_graph)) + expect_is( + clean_graph, "tbl_graph", + info="Returns a tbl_graph object" + ) + + expect_equal(clean[1], "sp_ace") # spaces + expect_equal(clean[2], "repeated") # first instance of repeat + expect_equal(clean[3], "a") # multiple special chars, trailing special chars + expect_equal(clean[4], "percent") # converting % to percent + 
expect_equal(clean[5], "x") # 100% invalid name
+  expect_equal(clean[6], "x_2") # repeat of invalid name
+  expect_equal(clean[7], "d_9") # multiple special characters
+  expect_equal(clean[8], "repeated_2") # uppercase, 2nd instance of repeat
+  expect_equal(clean[9], "cant") # quotation marks removed
+  expect_equal(clean[10], "hi_there") # double-underscores to single
+  expect_equal(clean[11], "leading_spaces") # leading spaces
+  expect_equal(clean[12], "x_3") # euro sign, invalid
+  expect_equal(clean[13], "acao") # accented word, transliterated to latin
+  expect_equal(clean[14], "faroe") # œ character was failing to convert on Windows, should work universally for stringi 1.1.6 or higher
+  # https://github.com/sfirke/janitor/issues/120#issuecomment-303385418
+  expect_equal(clean[15], "a_b_c_d_e_f") # for testing alternating cases below with e.g., case = "upper_lower"
+  expect_equal(clean[16], "test_camel_case") # for testing alternating cases below with e.g., case = "upper_lower"
+  expect_equal(clean[17], "leadingpunct") # for testing alternating cases below with e.g., case = "upper_lower"
+  expect_equal(clean[18], "average_number_of_days") # for testing alternating cases below with e.g., case = "upper_lower"
+  expect_equal(clean[19], "jan2009sales") # no separator around number-word boundary if not existing already
+  expect_equal(clean[20], "jan_2009_sales") # yes separator around number-word boundary if it existed
+})
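The tbl_graph method exercised by the tests above behaves like the data.frame method: it cleans the column names of the graph's active tibble. A minimal usage sketch (assumes this branch of janitor plus the tidygraph package; the graph and its node attributes are invented for illustration):

library(janitor)
# build a small random graph and give its nodes some messy attribute names;
# tidygraph registers the dplyr verbs used below for tbl_graph objects
g <- tidygraph::play_erdos_renyi(5, 0.3)
g <- dplyr::mutate(g, `Node ID` = 1:5, `% active` = runif(5))

g_clean <- clean_names(g)            # dispatches to the new clean_names.tbl_graph() method
names(tibble::as_tibble(g_clean))    # "node_id" "percent_active"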