draft "data_transform()" (#418)

* draft "data_transform()" * update pkgdown * add note * data_transform -> data_modify * rename file * ad tests * styler * recycling works * message about recycling * allow strings in variable * make it work inside functuions * add test * more improvements, docs, tests * docs * docs * add comment * make it work for grouped df, too * add tests * comment * styler * Update R/data_modify.R Co-authored-by: Etienne Bacher <[email protected]> * Update R/data_modify.R Co-authored-by: Etienne Bacher <[email protected]> * Update R/data_modify.R Co-authored-by: Etienne Bacher <[email protected]> * allow string as variable * new handling for RHS strings * remove redundant code * allow sequence of strings * remove list-option * update tests * docs * recycling msg only once * allow NULL * alert about modified variables, update tests * news * add more tests * styler * remove messages * informative error for misspelled variables * remove redundant code * tell which expression had an erorr * update snapshots * add tests * simplify * satisfy lintr * remove duplicated code for grouped method * Update R/data_modify.R Co-authored-by: Etienne Bacher <[email protected]> * Update R/data_modify.R Co-authored-by: Etienne Bacher <[email protected]> * update RD * suggest msg * msg * msg * temporary disable class-attr * preserve class attr * fix, so grouped_df preserves labels * add test * revert changes to grouped_df method * statisfy lintr * msg --------- Co-authored-by: Etienne Bacher <[email protected]>
easystats · May 30, 2023 · 6436e15 · 6436e15
1 parent 8d3bef6
commit 6436e15
Show file tree

Hide file tree

Showing 10 changed files with 944 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.7.1.8
+Version: 0.7.1.9
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -48,6 +48,9 @@ S3method(data_filter,data.frame)
 S3method(data_filter,grouped_df)
 S3method(data_merge,data.frame)
 S3method(data_merge,list)
+S3method(data_modify,data.frame)
+S3method(data_modify,default)
+S3method(data_modify,grouped_df)
 S3method(data_peek,data.frame)
 S3method(data_tabulate,data.frame)
 S3method(data_tabulate,default)
@@ -213,6 +216,7 @@ export(data_group)
 export(data_join)
 export(data_match)
 export(data_merge)
+export(data_modify)
 export(data_partition)
 export(data_peek)
 export(data_read)

diff --git a/NEWS.md b/NEWS.md
@@ -30,6 +30,9 @@ NEW FUNCTIONS
 
 * `data_unite()`, to merge values of multiple variables into one new variable.
 
+* `data_modify()`, to create new variables, or modify or remove existing
+  variables in a data frame.
+
 BUG FIXES
 
 * `standardize_parameters()` now works when the package namespace is in the model

diff --git a/R/data_modify.R b/R/data_modify.R
@@ -0,0 +1,255 @@
+#' Create new variables in a data frame
+#'
+#' Create new variables or modify existing variables in a data frame. Unlike `base::transform()`, `data_modify()`
+#' can be used on grouped data frames, and newly created variables can be directly
+#' used.
+#'
+#' @param data A data frame
+#' @param ... One or more expressions that define the new variable name and the
+#' values or recoding of those new variables. These expressions can be one of:
+#' - A sequence of named, literal expressions, where the left-hand side refers
+#'   to the name of the new variable, while the right-hand side represent the
+#'   values of the new variable. Example: `Sepal.Width = center(Sepal.Width)`.
+#' - A sequence of string values, representing expressions.
+#' - A variable that contains a string representation of the expression. Example:
+#'   ```r
+#'   a <- "2 * Sepal.Width"
+#'   data_modify(iris, a)
+#'   ```
+#' - A character vector of expressions. Example:
+#'   `c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10")`. This
+#'   type of expression cannot be mixed with other expressions, i.e. if a
+#'   character vector is provided, you may not add further elements to `...`.
+#' - Using `NULL` as right-hand side removes a variable from the data frame.
+#'   Example: `Petal.Width = NULL`.
+#'
+#' Note that newly created variables can be used in subsequent expressions.
+#' See also 'Examples'.
+#'
+#' @note `data_modify()` can also be used inside functions. However, it is
+#' recommended to pass the recode-expression as character vector or list of
+#' characters.
+#'
+#' @examples
+#' data(efc)
+#' new_efc <- data_modify(
+#'   efc,
+#'   c12hour_c = center(c12hour),
+#'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
+#'   c12hour_z2 = standardize(c12hour)
+#' )
+#' head(new_efc)
+#'
+#' # using strings instead of literal expressions
+#' new_efc <- data_modify(
+#'   efc,
+#'   "c12hour_c = center(c12hour)",
+#'   "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)",
+#'   "c12hour_z2 = standardize(c12hour)"
+#' )
+#' head(new_efc)
+#'
+#' # using character strings, provided as variable
+#' stand <- "c12hour_c / sd(c12hour, na.rm = TRUE)"
+#' new_efc <- data_modify(
+#'   efc,
+#'   c12hour_c = center(c12hour),
+#'   c12hour_z = stand
+#' )
+#' head(new_efc)
+#'
+#' # providing expressions as character vector
+#' new_exp <- c(
+#'   "c12hour_c = center(c12hour)",
+#'   "c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE)"
+#' )
+#' new_efc <- data_modify(efc, new_exp)
+#' head(new_efc)
+#'
+#' # attributes - in this case, value and variable labels - are preserved
+#' str(new_efc)
+#'
+#' # overwrite existing variable, remove old variable
+#' out <- data_modify(iris, Petal.Length = 1 / Sepal.Length, Sepal.Length = NULL)
+#' head(out)
+#'
+#' # works on grouped data
+#' grouped_efc <- data_group(efc, "c172code")
+#' new_efc <- data_modify(
+#'   grouped_efc,
+#'   c12hour_c = center(c12hour),
+#'   c12hour_z = c12hour_c / sd(c12hour, na.rm = TRUE),
+#'   c12hour_z2 = standardize(c12hour)
+#' )
+#' head(new_efc)
+#'
+#' # works from inside functions
+#' foo <- function(data, z) {
+#'   head(data_modify(data, z))
+#' }
+#' foo(iris, "var_a = Sepal.Width / 10")
+#'
+#' new_exp <- c("SW_double = 2 * Sepal.Width", "SW_fraction = SW_double / 10")
+#' foo(iris, new_exp)
+#' @export
+data_modify <- function(data, ...) {
+  UseMethod("data_modify")
+}
+
+#' @export
+data_modify.default <- function(data, ...) {
+  insight::format_error("`data` must be a data frame.")
+}
+
+#' @rdname data_modify
+#' @export
+data_modify.data.frame <- function(data, ...) {
+  dots <- eval(substitute(alist(...)))
+  column_names <- colnames(data)
+
+  # we check for character vector of expressions, in which case
+  # "dots" should be unnamed
+  if (is.null(names(dots))) {
+    # if we have multiple strings, concatenate them to a character vector
+    # and put it into a list...
+    if (length(dots) > 1) {
+      if (all(vapply(dots, is.character, logical(1)))) {
+        dots <- list(unlist(dots))
+      } else {
+        insight::format_error("You cannot mix string and literal representation of expressions.")
+      }
+    }
+    # expression is given as character string, e.g.
+    # a <- "double_SepWidth = 2 * Sepal.Width"
+    # data_modify(iris, a)
+    # or as character vector, e.g.
+    # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10"))
+    character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
+    # do we have a character vector? Then we can proceed
+    if (is.character(character_symbol)) {
+      dots <- lapply(character_symbol, function(s) {
+        # turn value from character vector into expression
+        str2lang(.dynEval(s))
+      })
+      names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
+    }
+  }
+
+  for (i in seq_along(dots)) {
+    # iterate expressions for new variables
+    symbol <- dots[[i]]
+
+    # expression is given as character string in a variable, but named, e.g.
+    # a <- "2 * Sepal.Width"
+    # data_modify(iris, double_SepWidth = a)
+    # we reconstruct the symbol as if it were provided as literal expression.
+    # However, we need to check that we don't have a character vector,
+    # like: data_modify(iris, new_var = "a")
+    # this one should be recycled instead.
+    if (!is.character(symbol)) {
+      eval_symbol <- .dynEval(symbol, ifnotfound = NULL)
+      if (is.character(eval_symbol)) {
+        symbol <- str2lang(paste0(names(dots)[i], " = ", eval_symbol))
+      }
+    }
+
+    # finally, we can evaluate expression and get values for new variables
+    new_variable <- try(with(data, eval(symbol)), silent = TRUE)
+
+    # successful, or any errors, like misspelled variable name?
+    if (inherits(new_variable, "try-error")) {
+      # in which step did error happen?
+      step_number <- switch(as.character(i),
+        "1" = "the first expression",
+        "2" = "the second expression",
+        "3" = "the third expression",
+        paste("expression", i)
+      )
+      step_msg <- paste0("There was an error in ", step_number, ".")
+      # try to find out which variable was the cause for the error
+      error_msg <- attributes(new_variable)$condition$message
+      if (grepl("object '(.*)' not found", error_msg)) {
+        error_var <- gsub("object '(.*)' not found", "\\1", error_msg)
+        insight::format_error(
+          paste0(step_msg, " Variable \"", error_var, "\" was not found in the dataset or in the environment."),
+          .misspelled_string(colnames(data), error_var, "Possibly misspelled or not yet defined?")
+        )
+      } else {
+        insight::format_error(paste0(
+          step_msg, " ", insight::format_capitalize(error_msg),
+          ". Possibly misspelled or not yet defined?"
+        ))
+      }
+    }
+
+    # give informative error when new variable doesn't match number of rows
+    if (!is.null(new_variable) && length(new_variable) != nrow(data) && (nrow(data) %% length(new_variable)) != 0) {
+      insight::format_error(
+        "New variable has not the same length as the other variables in the data frame and cannot be recycled."
+      )
+    }
+
+    data[[names(dots)[i]]] <- new_variable
+  }
+
+  data
+}
+
+#' @export
+data_modify.grouped_df <- function(data, ...) {
+  # we need to evaluate dots here, and pass them with "do.call" to
+  # the data.frame method later...
+  dots <- match.call(expand.dots = FALSE)$`...`
+
+  # works only for dplyr >= 0.8.0
+  grps <- attr(data, "groups", exact = TRUE)
+  grps <- grps[[".rows"]]
+  attr_data <- attributes(data)
+
+  # remove conflicting class attributes
+  class_attr <- class(data)
+  data <- as.data.frame(data)
+
+  # we check for character vector of expressions, in which case
+  # "dots" should be unnamed
+  if (is.null(names(dots))) {
+    # if we have multiple strings, concatenate them to a character vector
+    # and put it into a list...
+    if (length(dots) > 1) {
+      if (all(vapply(dots, is.character, logical(1)))) {
+        dots <- list(unlist(dots))
+      } else {
+        insight::format_error("You cannot mix string and literal representation of expressions.")
+      }
+    }
+    # expression is given as character string, e.g.
+    # a <- "double_SepWidth = 2 * Sepal.Width"
+    # data_modify(iris, a)
+    # or as character vector, e.g.
+    # data_modify(iris, c("var_a = Sepal.Width / 10", "var_b = Sepal.Width * 10"))
+    character_symbol <- tryCatch(.dynEval(dots[[1]]), error = function(e) NULL)
+    # do we have a character vector? Then we can proceed
+    if (is.character(character_symbol)) {
+      dots <- lapply(character_symbol, function(s) {
+        # turn value from character vector into expression
+        str2lang(.dynEval(s))
+      })
+      names(dots) <- vapply(dots, function(n) insight::safe_deparse(n[[2]]), character(1))
+    }
+  }
+
+  # create new variables as dummys, do for-loop works
+  for (i in names(dots)) {
+    data[[i]] <- NA
+  }
+
+  # create new variables per group
+  for (rows in grps) {
+    data[rows, ] <- data_modify.data.frame(data[rows, ], ...)
+  }
+
+  # set back attributes and class
+  data <- .replace_attrs(data, attr_data)
+  class(data) <- class_attr
+  data
+}
diff --git a/R/describe_distribution.R b/R/describe_distribution.R
@@ -277,18 +277,18 @@ describe_distribution.factor <- function(x,
   }
 
 
-  dot.arguments <- list(...)
+  dot_args <- list(...)
 
-  if (is.null(dot.arguments[["ci"]])) {
+  if (is.null(dot_args[["ci"]])) {
     out$CI_low <- NULL
     out$CI_high <- NULL
   }
 
-  if (is.null(dot.arguments[["iqr"]]) || isFALSE(dot.arguments[["iqr"]])) {
+  if (is.null(dot_args[["iqr"]]) || isFALSE(dot_args[["iqr"]])) {
     out$IQR <- NULL
   }
 
-  if (is.null(dot.arguments[["quartiles"]]) || isFALSE(dot.arguments[["quartiles"]])) {
+  if (is.null(dot_args[["quartiles"]]) || isFALSE(dot_args[["quartiles"]])) {
     out$Q1 <- NULL
     out$Q3 <- NULL
   }
@@ -338,17 +338,17 @@ describe_distribution.character <- function(x,
   }
 
 
-  dot.arguments <- list(...)
-  if (is.null(dot.arguments[["ci"]])) {
+  dot_args <- list(...)
+  if (is.null(dot_args[["ci"]])) {
     out$CI_low <- NULL
     out$CI_high <- NULL
   }
 
-  if (is.null(dot.arguments[["iqr"]]) || isFALSE(dot.arguments[["iqr"]])) {
+  if (is.null(dot_args[["iqr"]]) || isFALSE(dot_args[["iqr"]])) {
     out$IQR <- NULL
   }
 
-  if (is.null(dot.arguments[["quartiles"]]) || isFALSE(dot.arguments[["quartiles"]])) {
+  if (is.null(dot_args[["quartiles"]]) || isFALSE(dot_args[["quartiles"]])) {
     out$Q1 <- NULL
     out$Q3 <- NULL
   }
@@ -408,7 +408,9 @@ describe_distribution.data.frame <- function(x,
     }
   }))
 
-  if (is.null(out)) return(NULL)
+  if (is.null(out)) {
+    return(NULL)
+  }
 
   out$Variable <- row.names(out)
   row.names(out) <- NULL

diff --git a/R/utils-rows.R b/R/utils-rows.R
@@ -116,7 +116,6 @@ rowid_as_column.default <- function(x, var = "rowid") {
 
 #' @export
 rowid_as_column.grouped_df <- function(x, var = "rowid") {
-
   if (!is.character(var)) {
     insight::format_error("Argument 'var' must be of type character.")
   }

diff --git a/_pkgdown.yaml b/_pkgdown.yaml
@@ -27,6 +27,7 @@ reference:
     desc: |
       Functions for transforming variables
     contents:
+      - data_modify
       - data_unite
       - categorize
       - recode_into