From 411436d0930512a2f9ddef82c0fdd971ae3e398a Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Tue, 15 Mar 2022 05:31:48 +0000
Subject: [PATCH] Add tests and method for list in describe_distribution (#105)

---
 NAMESPACE                                   |   1 +
 NEWS.md                                     |   2 +
 R/describe_distribution.R                   | 101 ++++++++++++-
 man/describe_distribution.Rd                |  41 ++++-
 tests/testthat/test-describe_distribution.R | 157 +++++++++++++++++++-
 5 files changed, 292 insertions(+), 10 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 16d04d9d8..61fbbc8ce 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -41,6 +41,7 @@ S3method(describe_distribution,character)
 S3method(describe_distribution,data.frame)
 S3method(describe_distribution,factor)
 S3method(describe_distribution,grouped_df)
+S3method(describe_distribution,list)
 S3method(describe_distribution,numeric)
 S3method(kurtosis,data.frame)
 S3method(kurtosis,default)
diff --git a/NEWS.md b/NEWS.md
index d48097e06..556ddc910 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -35,6 +35,8 @@ MINOR CHANGES
     data frame will preserve value and variable label attributes, where
     possible and applicable.
     
+  * `describe_distribution()` now works with lists (@etiennebacher, #105).
+
   * `data_rename()` doesn't use `pattern` anymore to rename the columns if
   `replacement` is not provided (@etiennebacher, #103).
   
diff --git a/R/describe_distribution.R b/R/describe_distribution.R
index ca37b1d65..f676cd01a 100644
--- a/R/describe_distribution.R
+++ b/R/describe_distribution.R
@@ -3,7 +3,8 @@
 #' This function describes a distribution by a set of indices (e.g., measures of
 #' centrality, dispersion, range, skewness, kurtosis).
 #'
-#' @param x A numeric vector.
+#' @param x A numeric vector, a character vector, a dataframe, or a list. See
+#' `Details`.
 #' @param range Return the range (min and max).
 #' @param quartiles Return the first and third quartiles (25th and 75pth
 #'   percentiles).
@@ -22,6 +23,15 @@
 #' @param verbose Toggle warnings and messages.
 #' @inheritParams bayestestR::point_estimate
 #'
+#' @details If `x` is a dataframe, only numeric variables are kept and will be displayed in the summary.
+#'
+#' If `x` is a list, the behavior is different whether `x` is a stored list. If
+#' `x` is stored (for example, `describe_distribution(mylist)` where `mylist`
+#' was created before), artificial variable names are used in the summary
+#' (`Var_1`, `Var_2`, etc.). If `x` is an unstored list (for example,
+#' `describe_distribution(list(mtcars$mpg))`), then `"mtcars$mpg"` is used as
+#' variable name.
+#'
 #' @note There is also a
 #'   [`plot()`-method](https://easystats.github.io/see/articles/parameters.html)
 #'   implemented in the
@@ -34,14 +44,96 @@
 #' data(iris)
 #' describe_distribution(iris)
 #' describe_distribution(iris, include_factors = TRUE, quartiles = TRUE)
+#' describe_distribution(list(mtcars$mpg, mtcars$cyl))
 #' @export
 describe_distribution <- function(x, ...) {
   UseMethod("describe_distribution")
 }
 
+#' @rdname describe_distribution
+#' @export
+
+describe_distribution.list <- function(x,
+                                       centrality = "mean",
+                                       dispersion = TRUE,
+                                       iqr = TRUE,
+                                       range = TRUE,
+                                       quartiles = FALSE,
+                                       ci = NULL,
+                                       include_factors = FALSE,
+                                       iterations = 100,
+                                       threshold = .1,
+                                       verbose = TRUE,
+                                       ...) {
+
+  factor_el <- which(sapply(x, is.factor))
+  num_el <- which(sapply(x, is.numeric))
+
+  # get elements names as is
+  # ex: list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl")
+  nm <- sapply(sys.call()[[2]], .safe_deparse)[-1]
+
+  if (!isTRUE(include_factors)) {
+    x <- x[num_el]
+    if (length(nm) != 0) {
+      nm <- nm[num_el]
+    }
+  } else {
+    x <- x[c(num_el, factor_el)]
+    if (length(nm) != 0) {
+      nm <- nm[c(num_el, factor_el)]
+    }
+  }
+
+  # Not possible to obtain elements names if they are stored in
+  # an object
+  if (length(nm) == 0) {
+    nm <- paste0("Var_", seq_along(x))
+  }
+
+  # The function currently doesn't support descriptive summaries for character
+  # or factor types.
+  out <- do.call(rbind, lapply(x, function(i) {
+    if ((include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))) {
+      describe_distribution(
+        i,
+        centrality = centrality,
+        dispersion = dispersion,
+        iqr = iqr,
+        range = range,
+        quartiles = quartiles,
+        ci = ci,
+        iterations = iterations,
+        threshold = threshold,
+        verbose = verbose
+      )
+    }
+  }))
+
+
+  if (!is.null(names(x))) {
+    empty_names <- which(names(x) == "")
+    new_names <- names(x)
+    new_names[empty_names] <- nm[empty_names]
+  } else {
+    new_names <- nm
+  }
+
+  out$Variable <- new_names
+  row.names(out) <- NULL
+  out <- out[c("Variable", setdiff(colnames(out), "Variable"))]
+
+  class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
+  attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
+  attr(out, "ci") <- ci
+  attr(out, "threshold") <- threshold
+  if (centrality == "all") attr(out, "first_centrality") <- colnames(out)[2]
+  out
+}
 
 #' @rdname describe_distribution
 #' @export
+
 describe_distribution.numeric <- function(x,
                                           centrality = "mean",
                                           dispersion = TRUE,
@@ -53,6 +145,8 @@ describe_distribution.numeric <- function(x,
                                           threshold = .1,
                                           verbose = TRUE,
                                           ...) {
+
+
   out <- data.frame(.temp = 0)
 
   # Missing
@@ -198,7 +292,7 @@ describe_distribution.factor <- function(x,
 }
 
 
-
+#' @rdname describe_distribution
 #' @export
 describe_distribution.character <- function(x,
                                             dispersion = TRUE,
@@ -275,6 +369,9 @@ describe_distribution.data.frame <- function(x,
                                              threshold = .1,
                                              verbose = TRUE,
                                              ...) {
+
+  # The function currently doesn't support descriptive summaries for character
+  # or factor types.
   out <- do.call(rbind, lapply(x, function(i) {
     if ((include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))) {
       describe_distribution(
diff --git a/man/describe_distribution.Rd b/man/describe_distribution.Rd
index b5ba3f14b..bb27272de 100644
--- a/man/describe_distribution.Rd
+++ b/man/describe_distribution.Rd
@@ -2,13 +2,30 @@
 % Please edit documentation in R/describe_distribution.R
 \name{describe_distribution}
 \alias{describe_distribution}
+\alias{describe_distribution.list}
 \alias{describe_distribution.numeric}
 \alias{describe_distribution.factor}
+\alias{describe_distribution.character}
 \alias{describe_distribution.data.frame}
 \title{Describe a distribution}
 \usage{
 describe_distribution(x, ...)
 
+\method{describe_distribution}{list}(
+  x,
+  centrality = "mean",
+  dispersion = TRUE,
+  iqr = TRUE,
+  range = TRUE,
+  quartiles = FALSE,
+  ci = NULL,
+  include_factors = FALSE,
+  iterations = 100,
+  threshold = 0.1,
+  verbose = TRUE,
+  ...
+)
+
 \method{describe_distribution}{numeric}(
   x,
   centrality = "mean",
@@ -25,6 +42,8 @@ describe_distribution(x, ...)
 
 \method{describe_distribution}{factor}(x, dispersion = TRUE, range = TRUE, verbose = TRUE, ...)
 
+\method{describe_distribution}{character}(x, dispersion = TRUE, range = TRUE, verbose = TRUE, ...)
+
 \method{describe_distribution}{data.frame}(
   x,
   centrality = "mean",
@@ -41,7 +60,8 @@ describe_distribution(x, ...)
 )
 }
 \arguments{
-\item{x}{A numeric vector.}
+\item{x}{A numeric vector, a character vector, a dataframe, or a list. See
+\code{Details}.}
 
 \item{...}{Additional arguments to be passed to or from methods.}
 
@@ -63,16 +83,16 @@ are based on bootstrap replicates (see \code{iterations}). If
 \code{centrality = "all"}, the bootstrapped confidence interval refers to
 the first centrality index (which is typically the median).}
 
+\item{include_factors}{Logical, if \code{TRUE}, factors are included in the
+output, however, only columns for range (first and last factor levels) as
+well as n and missing will contain information.}
+
 \item{iterations}{The number of bootstrap replicates for computing confidence
 intervals. Only applies when \code{ci} is not \code{NULL}.}
 
 \item{threshold}{For \code{centrality = "trimmed"} (i.e. trimmed mean), indicates the fraction (0 to 0.5) of observations to be trimmed from each end of the vector before the mean is computed.}
 
 \item{verbose}{Toggle warnings and messages.}
-
-\item{include_factors}{Logical, if \code{TRUE}, factors are included in the
-output, however, only columns for range (first and last factor levels) as
-well as n and missing will contain information.}
 }
 \value{
 A data frame with columns that describe the properties of the variables.
@@ -81,6 +101,16 @@ A data frame with columns that describe the properties of the variables.
 This function describes a distribution by a set of indices (e.g., measures of
 centrality, dispersion, range, skewness, kurtosis).
 }
+\details{
+If \code{x} is a dataframe, only numeric variables are kept and will be displayed in the summary.
+
+If \code{x} is a list, the behavior is different whether \code{x} is a stored list. If
+\code{x} is stored (for example, \code{describe_distribution(mylist)} where \code{mylist}
+was created before), artificial variable names are used in the summary
+(\code{Var_1}, \code{Var_2}, etc.). If \code{x} is an unstored list (for example,
+\code{describe_distribution(list(mtcars$mpg))}), then \code{"mtcars$mpg"} is used as
+variable name.
+}
 \note{
 There is also a
 \href{https://easystats.github.io/see/articles/parameters.html}{\code{plot()}-method}
@@ -93,4 +123,5 @@ describe_distribution(rnorm(100))
 data(iris)
 describe_distribution(iris)
 describe_distribution(iris, include_factors = TRUE, quartiles = TRUE)
+describe_distribution(list(mtcars$mpg, mtcars$cyl))
 }
diff --git a/tests/testthat/test-describe_distribution.R b/tests/testthat/test-describe_distribution.R
index 576c6455e..92fe0eb9f 100644
--- a/tests/testthat/test-describe_distribution.R
+++ b/tests/testthat/test-describe_distribution.R
@@ -1,12 +1,163 @@
-test_that("describe_distribution - numeric", {
-  x <- describe_distribution(rnorm(100))
-  expect_equal(c(nrow(x), ncol(x)), c(1, 9))
+# numeric ---------------------------------------
+
+test_that("describe_distribution - numeric: works with basic numeric vector", {
+  x <- describe_distribution(mtcars$mpg)
+  expect_equal(dim(x), c(1, 9))
+  expect_equal(round(x$Mean), 20)
+})
+
+test_that("describe_distribution - numeric: correctly handles missing values", {
+  no_missing <- describe_distribution(mtcars$mpg)
+  test <- mtcars$mpg
+  test[1] <- NA
+  with_missing <- describe_distribution(test)
+  expect_equal(with_missing$n, 31)
+  expect_equal(with_missing$n_Missing, 1)
+  expect_false(with_missing$Mean == no_missing$Mean)
+})
+
+test_that("describe_distribution - numeric: works with quartiles", {
+  x <- describe_distribution(mtcars$mpg, quartiles = TRUE)
+  expect_equal(dim(x), c(1, 11))
+  expect_true("Q1" %in% names(x))
+  expect_true("Q3" %in% names(x))
+})
+
+test_that("describe_distribution - numeric: works with range", {
+  x <- describe_distribution(mtcars$mpg, range = FALSE)
+  expect_equal(dim(x), c(1, 7))
+  expect_false("min" %in% names(x))
+  expect_false("max" %in% names(x))
+})
+
+
+
+# dataframe ---------------------------------------
+
+test_that("describe_distribution - dataframe: works with basic dataframe", {
+  x <- describe_distribution(mtcars)
+  expect_equal(dim(x), c(11, 10))
+  expect_equal(round(x[1, "Mean"]), 20)
 })
 
+test_that("describe_distribution - dataframe: correctly handles missing values", {
+  no_missing <- describe_distribution(mtcars)
+  test <- mtcars
+  test[1, ] <- NA
+  with_missing <- describe_distribution(test)
+  expect_equal(unique(with_missing$n), 31)
+  expect_equal(unique(with_missing$n_Missing), 1)
+  expect_false(unique(with_missing$Mean == no_missing$Mean))
+})
+
+test_that("describe_distribution - dataframe: works with quartiles", {
+  x <- describe_distribution(mtcars, quartiles = TRUE)
+  expect_equal(dim(x), c(11, 12))
+  expect_true("Q1" %in% names(x))
+  expect_true("Q3" %in% names(x))
+})
+
+test_that("describe_distribution - dataframe: works with range", {
+  x <- describe_distribution(mtcars, range = FALSE)
+  expect_equal(dim(x), c(11, 8))
+  expect_false("min" %in% names(x))
+  expect_false("max" %in% names(x))
+})
+
+
+
+# factor ---------------------------------------
+
 test_that("describe_distribution - factor", {
   expect_snapshot(describe_distribution(factor(substring("statistics", 1:10, 1:10))))
 })
 
+
+
+# character ---------------------------------------
+
 test_that("describe_distribution - character", {
   expect_snapshot(describe_distribution(as.character(ToothGrowth$supp)))
 })
+
+
+
+# list ---------------------------------------
+
+test_that("describe_distribution - list: works with basic list", {
+  x <- list(mtcars$mpg, mtcars$cyl)
+  stored <- describe_distribution(x)
+  unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
+  named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl))
+  mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl))
+
+  expect_equal(dim(stored), c(2, 10))
+  expect_equal(round(stored$Mean), c(20, 6))
+  expect_equal(dim(unnamed), c(2, 10))
+  expect_equal(round(unnamed$Mean), c(20, 6))
+  expect_equal(dim(named), c(2, 10))
+  expect_equal(round(named$Mean), c(20, 6))
+  expect_equal(dim(mix), c(2, 10))
+  expect_equal(round(mix$Mean), c(20, 6))
+})
+
+test_that("describe_distribution - list: works with include_factors", {
+  x1 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl)))
+  y <- describe_distribution(list(mtcars$mpg))
+  expect_identical(x1, y)
+
+  x2 <- describe_distribution(list(mtcars$mpg, factor(mtcars$cyl)),
+                              include_factors = TRUE)
+  expect_equal(dim(x2), c(2, 10))
+  expect_equal(x2$Variable, c("mtcars$mpg", "factor(mtcars$cyl)"))
+
+  x3 <- describe_distribution(list(mtcars$mpg, foo = factor(mtcars$cyl)),
+                              include_factors = TRUE)
+  expect_equal(dim(x3), c(2, 10))
+  expect_equal(x3$Variable, c("mtcars$mpg", "foo"))
+})
+
+test_that("describe_distribution - list: correctly removes character elements", {
+  x <- describe_distribution(list(mtcars$mpg, "something"))
+  y <- describe_distribution(list(mtcars$mpg))
+  expect_identical(x, y)
+})
+
+test_that("describe_distribution - list: correctly handles variable names", {
+  x <- list(mtcars$mpg, mtcars$cyl)
+  stored <- describe_distribution(x)
+  unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
+  named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl))
+  mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl))
+
+  expect_equal(stored$Variable, c("Var_1", "Var_2"))
+  expect_equal(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl"))
+  expect_equal(named$Variable, c("foo", "foo2"))
+  expect_equal(mix$Variable, c("foo", "mtcars$cyl"))
+})
+
+test_that("describe_distribution - list: correctly handles missing values", {
+  no_missing <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
+  test <- mtcars$mpg
+  test2 <- mtcars$cyl
+  test[1] <- NA
+  test2[1] <- NA
+  with_missing <- describe_distribution(list(test, test2))
+  expect_equal(unique(with_missing$n), 31)
+  expect_equal(unique(with_missing$n_Missing), 1)
+  expect_false(unique(with_missing$Mean == no_missing$Mean))
+})
+
+test_that("describe_distribution - list: works with quartiles", {
+  x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE)
+  expect_equal(dim(x), c(2, 12))
+  expect_true("Q1" %in% names(x))
+  expect_true("Q3" %in% names(x))
+})
+
+test_that("describe_distribution - list: works with range", {
+  x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE)
+  expect_equal(dim(x), c(2, 8))
+  expect_false("min" %in% names(x))
+  expect_false("max" %in% names(x))
+})