From 411436d0930512a2f9ddef82c0fdd971ae3e398a Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Tue, 15 Mar 2022 05:31:48 +0000 Subject: [PATCH] Add tests and method for list in describe_distribution (#105) --- NAMESPACE | 1 + NEWS.md | 2 + R/describe_distribution.R | 101 ++++++++++++- man/describe_distribution.Rd | 41 ++++- tests/testthat/test-describe_distribution.R | 157 +++++++++++++++++++- 5 files changed, 292 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 16d04d9d8..61fbbc8ce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,6 +41,7 @@ S3method(describe_distribution,character) S3method(describe_distribution,data.frame) S3method(describe_distribution,factor) S3method(describe_distribution,grouped_df) +S3method(describe_distribution,list) S3method(describe_distribution,numeric) S3method(kurtosis,data.frame) S3method(kurtosis,default) diff --git a/NEWS.md b/NEWS.md index d48097e06..556ddc910 100644 --- a/NEWS.md +++ b/NEWS.md @@ -35,6 +35,8 @@ MINOR CHANGES data frame will preserve value and variable label attributes, where possible and applicable. + * `describe_distribution()` now works with lists (@etiennebacher, #105). + * `data_rename()` doesn't use `pattern` anymore to rename the columns if `replacement` is not provided (@etiennebacher, #103). diff --git a/R/describe_distribution.R b/R/describe_distribution.R index ca37b1d65..f676cd01a 100644 --- a/R/describe_distribution.R +++ b/R/describe_distribution.R @@ -3,7 +3,8 @@ #' This function describes a distribution by a set of indices (e.g., measures of #' centrality, dispersion, range, skewness, kurtosis). #' -#' @param x A numeric vector. +#' @param x A numeric vector, a character vector, a dataframe, or a list. See +#' `Details`. #' @param range Return the range (min and max). #' @param quartiles Return the first and third quartiles (25th and 75pth #' percentiles). @@ -22,6 +23,15 @@ #' @param verbose Toggle warnings and messages. #' @inheritParams bayestestR::point_estimate #' +#' @details If `x` is a dataframe, only numeric variables are kept and will be displayed in the summary. +#' +#' If `x` is a list, the behavior is different whether `x` is a stored list. If +#' `x` is stored (for example, `describe_distribution(mylist)` where `mylist` +#' was created before), artificial variable names are used in the summary +#' (`Var_1`, `Var_2`, etc.). If `x` is an unstored list (for example, +#' `describe_distribution(list(mtcars$mpg))`), then `"mtcars$mpg"` is used as +#' variable name. +#' #' @note There is also a #' [`plot()`-method](https://easystats.github.io/see/articles/parameters.html) #' implemented in the @@ -34,14 +44,96 @@ #' data(iris) #' describe_distribution(iris) #' describe_distribution(iris, include_factors = TRUE, quartiles = TRUE) +#' describe_distribution(list(mtcars$mpg, mtcars$cyl)) #' @export describe_distribution <- function(x, ...) { UseMethod("describe_distribution") } +#' @rdname describe_distribution +#' @export + +describe_distribution.list <- function(x, + centrality = "mean", + dispersion = TRUE, + iqr = TRUE, + range = TRUE, + quartiles = FALSE, + ci = NULL, + include_factors = FALSE, + iterations = 100, + threshold = .1, + verbose = TRUE, + ...) { + + factor_el <- which(sapply(x, is.factor)) + num_el <- which(sapply(x, is.numeric)) + + # get elements names as is + # ex: list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl") + nm <- sapply(sys.call()[[2]], .safe_deparse)[-1] + + if (!isTRUE(include_factors)) { + x <- x[num_el] + if (length(nm) != 0) { + nm <- nm[num_el] + } + } else { + x <- x[c(num_el, factor_el)] + if (length(nm) != 0) { + nm <- nm[c(num_el, factor_el)] + } + } + + # Not possible to obtain elements names if they are stored in + # an object + if (length(nm) == 0) { + nm <- paste0("Var_", seq_along(x)) + } + + # The function currently doesn't support descriptive summaries for character + # or factor types. + out <- do.call(rbind, lapply(x, function(i) { + if ((include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))) { + describe_distribution( + i, + centrality = centrality, + dispersion = dispersion, + iqr = iqr, + range = range, + quartiles = quartiles, + ci = ci, + iterations = iterations, + threshold = threshold, + verbose = verbose + ) + } + })) + + + if (!is.null(names(x))) { + empty_names <- which(names(x) == "") + new_names <- names(x) + new_names[empty_names] <- nm[empty_names] + } else { + new_names <- nm + } + + out$Variable <- new_names + row.names(out) <- NULL + out <- out[c("Variable", setdiff(colnames(out), "Variable"))] + + class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out))) + attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500) + attr(out, "ci") <- ci + attr(out, "threshold") <- threshold + if (centrality == "all") attr(out, "first_centrality") <- colnames(out)[2] + out +} #' @rdname describe_distribution #' @export + describe_distribution.numeric <- function(x, centrality = "mean", dispersion = TRUE, @@ -53,6 +145,8 @@ describe_distribution.numeric <- function(x, threshold = .1, verbose = TRUE, ...) { + + out <- data.frame(.temp = 0) # Missing @@ -198,7 +292,7 @@ describe_distribution.factor <- function(x, } - +#' @rdname describe_distribution #' @export describe_distribution.character <- function(x, dispersion = TRUE, @@ -275,6 +369,9 @@ describe_distribution.data.frame <- function(x, threshold = .1, verbose = TRUE, ...) { + + # The function currently doesn't support descriptive summaries for character + # or factor types. out <- do.call(rbind, lapply(x, function(i) { if ((include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))) { describe_distribution( diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd index b5ba3f14b..bb27272de 100644 --- a/man/describe_distribution.Rd +++ b/man/describe_distribution.Rd @@ -2,13 +2,30 @@ % Please edit documentation in R/describe_distribution.R \name{describe_distribution} \alias{describe_distribution} +\alias{describe_distribution.list} \alias{describe_distribution.numeric} \alias{describe_distribution.factor} +\alias{describe_distribution.character} \alias{describe_distribution.data.frame} \title{Describe a distribution} \usage{ describe_distribution(x, ...) +\method{describe_distribution}{list}( + x, + centrality = "mean", + dispersion = TRUE, + iqr = TRUE, + range = TRUE, + quartiles = FALSE, + ci = NULL, + include_factors = FALSE, + iterations = 100, + threshold = 0.1, + verbose = TRUE, + ... +) + \method{describe_distribution}{numeric}( x, centrality = "mean", @@ -25,6 +42,8 @@ describe_distribution(x, ...) \method{describe_distribution}{factor}(x, dispersion = TRUE, range = TRUE, verbose = TRUE, ...) +\method{describe_distribution}{character}(x, dispersion = TRUE, range = TRUE, verbose = TRUE, ...) + \method{describe_distribution}{data.frame}( x, centrality = "mean", @@ -41,7 +60,8 @@ describe_distribution(x, ...) ) } \arguments{ -\item{x}{A numeric vector.} +\item{x}{A numeric vector, a character vector, a dataframe, or a list. See +\code{Details}.} \item{...}{Additional arguments to be passed to or from methods.} @@ -63,16 +83,16 @@ are based on bootstrap replicates (see \code{iterations}). If \code{centrality = "all"}, the bootstrapped confidence interval refers to the first centrality index (which is typically the median).} +\item{include_factors}{Logical, if \code{TRUE}, factors are included in the +output, however, only columns for range (first and last factor levels) as +well as n and missing will contain information.} + \item{iterations}{The number of bootstrap replicates for computing confidence intervals. Only applies when \code{ci} is not \code{NULL}.} \item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates the fraction (0 to 0.5) of observations to be trimmed from each end of the vector before the mean is computed.} \item{verbose}{Toggle warnings and messages.} - -\item{include_factors}{Logical, if \code{TRUE}, factors are included in the -output, however, only columns for range (first and last factor levels) as -well as n and missing will contain information.} } \value{ A data frame with columns that describe the properties of the variables. @@ -81,6 +101,16 @@ A data frame with columns that describe the properties of the variables. This function describes a distribution by a set of indices (e.g., measures of centrality, dispersion, range, skewness, kurtosis). } +\details{ +If \code{x} is a dataframe, only numeric variables are kept and will be displayed in the summary. + +If \code{x} is a list, the behavior is different whether \code{x} is a stored list. If +\code{x} is stored (for example, \code{describe_distribution(mylist)} where \code{mylist} +was created before), artificial variable names are used in the summary +(\code{Var_1}, \code{Var_2}, etc.). If \code{x} is an unstored list (for example, +\code{describe_distribution(list(mtcars$mpg))}), then \code{"mtcars$mpg"} is used as +variable name. +} \note{ There is also a \href{https://easystats.github.io/see/articles/parameters.html}{\code{plot()}-method} @@ -93,4 +123,5 @@ describe_distribution(rnorm(100)) data(iris) describe_distribution(iris) describe_distribution(iris, include_factors = TRUE, quartiles = TRUE) +describe_distribution(list(mtcars$mpg, mtcars$cyl)) } diff --git a/tests/testthat/test-describe_distribution.R b/tests/testthat/test-describe_distribution.R index 576c6455e..92fe0eb9f 100644 --- a/tests/testthat/test-describe_distribution.R +++ b/tests/testthat/test-describe_distribution.R @@ -1,12 +1,163 @@ -test_that("describe_distribution - numeric", { - x <- describe_distribution(rnorm(100)) - expect_equal(c(nrow(x), ncol(x)), c(1, 9)) +# numeric --------------------------------------- + +test_that("describe_distribution - numeric: works with basic numeric vector", { + x <- describe_distribution(mtcars$mpg) + expect_equal(dim(x), c(1, 9)) + expect_equal(round(x$Mean), 20) +}) + +test_that("describe_distribution - numeric: correctly handles missing values", { + no_missing <- describe_distribution(mtcars$mpg) + test <- mtcars$mpg + test[1] <- NA + with_missing <- describe_distribution(test) + expect_equal(with_missing$n, 31) + expect_equal(with_missing$n_Missing, 1) + expect_false(with_missing$Mean == no_missing$Mean) +}) + +test_that("describe_distribution - numeric: works with quartiles", { + x <- describe_distribution(mtcars$mpg, quartiles = TRUE) + expect_equal(dim(x), c(1, 11)) + expect_true("Q1" %in% names(x)) + expect_true("Q3" %in% names(x)) +}) + +test_that("describe_distribution - numeric: works with range", { + x <- describe_distribution(mtcars$mpg, range = FALSE) + expect_equal(dim(x), c(1, 7)) + expect_false("min" %in% names(x)) + expect_false("max" %in% names(x)) +}) + + + +# dataframe --------------------------------------- + +test_that("describe_distribution - dataframe: works with basic dataframe", { + x <- describe_distribution(mtcars) + expect_equal(dim(x), c(11, 10)) + expect_equal(round(x[1, "Mean"]), 20) }) +test_that("describe_distribution - dataframe: correctly handles missing values", { + no_missing <- describe_distribution(mtcars) + test <- mtcars + test[1, ] <- NA + with_missing <- describe_distribution(test) + expect_equal(unique(with_missing$n), 31) + expect_equal(unique(with_missing$n_Missing), 1) + expect_false(unique(with_missing$Mean == no_missing$Mean)) +}) + +test_that("describe_distribution - dataframe: works with quartiles", { + x <- describe_distribution(mtcars, quartiles = TRUE) + expect_equal(dim(x), c(11, 12)) + expect_true("Q1" %in% names(x)) + expect_true("Q3" %in% names(x)) +}) + +test_that("describe_distribution - dataframe: works with range", { + x <- describe_distribution(mtcars, range = FALSE) + expect_equal(dim(x), c(11, 8)) + expect_false("min" %in% names(x)) + expect_false("max" %in% names(x)) +}) + + + +# factor --------------------------------------- + test_that("describe_distribution - factor", { expect_snapshot(describe_distribution(factor(substring("statistics", 1:10, 1:10)))) }) + + +# character --------------------------------------- + test_that("describe_distribution - character", { expect_snapshot(describe_distribution(as.character(ToothGrowth$supp))) }) + + + +# list --------------------------------------- + +test_that("describe_distribution - list: works with basic list", { + x <- list(mtcars$mpg, mtcars$cyl) + stored <- describe_distribution(x) + unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl)) + named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) + mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) + + expect_equal(dim(stored), c(2, 10)) + expect_equal(round(stored$Mean), c(20, 6)) + expect_equal(dim(unnamed), c(2, 10)) + expect_equal(round(unnamed$Mean), c(20, 6)) + expect_equal(dim(named), c(2, 10)) + expect_equal(round(named$Mean), c(20, 6)) + expect_equal(dim(mix), c(2, 10)) + expect_equal(round(mix$Mean), c(20, 6)) +}) + +test_that("describe_distribution - list: works with include_factors", { + x1 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl))) + y <- describe_distribution(list(mtcars$mpg)) + expect_identical(x1, y) + + x2 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl)), + include_factors = TRUE) + expect_equal(dim(x2), c(2, 10)) + expect_equal(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)")) + + x3 <- describe_distribution(list(mtcars$mpg, foo = factor(mtcars$cyl)), + include_factors = TRUE) + expect_equal(dim(x3), c(2, 10)) + expect_equal(x3$Variable, c("mtcars$mpg", "foo")) +}) + +test_that("describe_distribution - list: correctly removes character elements", { + x <- describe_distribution(list(mtcars$mpg, "something")) + y <- describe_distribution(list(mtcars$mpg)) + expect_identical(x, y) +}) + +test_that("describe_distribution - list: correctly handles variable names", { + x <- list(mtcars$mpg, mtcars$cyl) + stored <- describe_distribution(x) + unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl)) + named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl)) + mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl)) + + expect_equal(stored$Variable, c("Var_1", "Var_2")) + expect_equal(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl")) + expect_equal(named$Variable, c("foo", "foo2")) + expect_equal(mix$Variable, c("foo", "mtcars$cyl")) +}) + +test_that("describe_distribution - list: correctly handles missing values", { + no_missing <- describe_distribution(list(mtcars$mpg, mtcars$cyl)) + test <- mtcars$mpg + test2 <- mtcars$cyl + test[1] <- NA + test2[1] <- NA + with_missing <- describe_distribution(list(test, test2)) + expect_equal(unique(with_missing$n), 31) + expect_equal(unique(with_missing$n_Missing), 1) + expect_false(unique(with_missing$Mean == no_missing$Mean)) +}) + +test_that("describe_distribution - list: works with quartiles", { + x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE) + expect_equal(dim(x), c(2, 12)) + expect_true("Q1" %in% names(x)) + expect_true("Q3" %in% names(x)) +}) + +test_that("describe_distribution - list: works with range", { + x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE) + expect_equal(dim(x), c(2, 8)) + expect_false("min" %in% names(x)) + expect_false("max" %in% names(x)) +})