Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests and method for list in describe_distribution #105

Merged
merged 11 commits into from
Mar 15, 2022
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ S3method(describe_distribution,character)
S3method(describe_distribution,data.frame)
S3method(describe_distribution,factor)
S3method(describe_distribution,grouped_df)
S3method(describe_distribution,list)
S3method(describe_distribution,numeric)
S3method(kurtosis,data.frame)
S3method(kurtosis,default)
Expand Down
81 changes: 79 additions & 2 deletions R/describe_distribution.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
#' This function describes a distribution by a set of indices (e.g., measures of
#' centrality, dispersion, range, skewness, kurtosis).
#'
#' @param x A numeric vector.
#' @param x A numeric vector, a character vector, a dataframe, or a list. See
#' `Details`.
#' @param range Return the range (min and max).
#' @param quartiles Return the first and third quartiles (25th and 75pth
#' percentiles).
Expand All @@ -22,6 +23,15 @@
#' @param verbose Toggle warnings and messages.
#' @inheritParams bayestestR::point_estimate
#'
#' @details If `x` is a dataframe, only numeric variables are kept and will be displayed in the summary.
#'
#' If `x` is a list, the behavior is different whether `x` is a stored list. If
#' `x` is stored (for example, `describe_distribution(mylist)` where `mylist`
#' was created before), artificial variable names are used in the summary
#' (`Var_1`, `Var_2`, etc.). If `x` is an unstored list (for example,
#' `describe_distribution(list(mtcars$mpg))`), then `"mtcars$mpg"` is used as
#' variable name.
#'
#' @note There is also a
#' [`plot()`-method](https://easystats.github.io/see/articles/parameters.html)
#' implemented in the
Expand All @@ -34,14 +44,79 @@
#' data(iris)
#' describe_distribution(iris)
#' describe_distribution(iris, include_factors = TRUE, quartiles = TRUE)
#' describe_distribution(list(mtcars$mpg, mtcars$cyl))
#' @export
describe_distribution <- function(x, ...) {
UseMethod("describe_distribution")
}

#' @rdname describe_distribution
#' @export

describe_distribution.list <- function(x,
centrality = "mean",
dispersion = TRUE,
iqr = TRUE,
range = TRUE,
quartiles = FALSE,
ci = NULL,
include_factors = FALSE,
iterations = 100,
threshold = .1,
verbose = TRUE,
...) {

# get elements names as is
# ex: list(mtcars$mpg, mtcars$cyl) -> c("mtcars$mpg", "mtcars$cyl")
nm <- sapply(sys.call()[[2]], deparse)[-1]
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved

# Not possible to obtain elements names if they are stored in
# an object
if (length(nm) == 0) {
nm <- paste0("Var_", 1:length(x))
}

out <- do.call(rbind, lapply(x, function(i) {
if ((include_factors && is.factor(i)) || (!is.character(i) && !is.factor(i))) {
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
describe_distribution(
i,
centrality = centrality,
dispersion = dispersion,
iqr = iqr,
range = range,
quartiles = quartiles,
ci = ci,
iterations = iterations,
threshold = threshold,
verbose = verbose
)
}
}))


if (!is.null(names(x))) {
empty_names <- which(names(x) == "")
new_names <- names(x)
new_names[empty_names] <- nm[empty_names]
} else {
new_names <- nm
}

out$Variable <- new_names
row.names(out) <- NULL
out <- out[c("Variable", setdiff(colnames(out), "Variable"))]

class(out) <- unique(c("parameters_distribution", "see_parameters_distribution", class(out)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tried the plotting method in {see} and checked if it works with outputs from list input?

Maybe you can post an example in the PR thread.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't know there was a plotting method for that. I just tried plot(describe_distribution(list(mtcars$mpg, mtcars$cyl))) and it doesn't work. I don't have any experience with ggplot2 programming so not sure how to handle this. I'll take a look at see source code later, maybe it's easier than I think

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can leave this in for now and create an issue in {see} repository so that we don't forget about it.

attr(out, "object_name") <- deparse(substitute(x), width.cutoff = 500)
attr(out, "ci") <- ci
attr(out, "threshold") <- threshold
if (centrality == "all") attr(out, "first_centrality") <- colnames(out)[2]
out
}

#' @rdname describe_distribution
#' @export

describe_distribution.numeric <- function(x,
centrality = "mean",
dispersion = TRUE,
Expand All @@ -53,6 +128,8 @@ describe_distribution.numeric <- function(x,
threshold = .1,
verbose = TRUE,
...) {


out <- data.frame(.temp = 0)

# Missing
Expand Down Expand Up @@ -198,7 +275,7 @@ describe_distribution.factor <- function(x,
}



#' @rdname describe_distribution
#' @export
describe_distribution.character <- function(x,
dispersion = TRUE,
Expand Down
41 changes: 36 additions & 5 deletions man/describe_distribution.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

135 changes: 132 additions & 3 deletions tests/testthat/test-describe_distribution.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,141 @@
test_that("describe_distribution - numeric", {
x <- describe_distribution(rnorm(100))
expect_equal(c(nrow(x), ncol(x)), c(1, 9))
# numeric ---------------------------------------

test_that("describe_distribution - numeric: works with basic numeric vector", {
x <- describe_distribution(mtcars$mpg)
expect_equal(dim(x), c(1, 9))
expect_equal(round(x$Mean), 20)
})

test_that("describe_distribution - numeric: correctly handles missing values", {
no_missing <- describe_distribution(mtcars$mpg)
test <- mtcars$mpg
test[1] <- NA
with_missing <- describe_distribution(test)
expect_equal(with_missing$n, 31)
expect_equal(with_missing$n_Missing, 1)
expect_false(with_missing$Mean == no_missing$Mean)
})

test_that("describe_distribution - numeric: works with quartiles", {
x <- describe_distribution(mtcars$mpg, quartiles = TRUE)
expect_equal(dim(x), c(1, 11))
expect_true("Q1" %in% names(x))
expect_true("Q3" %in% names(x))
})

test_that("describe_distribution - numeric: works with range", {
x <- describe_distribution(mtcars$mpg, range = FALSE)
expect_equal(dim(x), c(1, 7))
expect_false("min" %in% names(x))
expect_false("max" %in% names(x))
})



# dataframe ---------------------------------------

test_that("describe_distribution - dataframe: works with basic dataframe", {
x <- describe_distribution(mtcars)
expect_equal(dim(x), c(11, 10))
expect_equal(round(x[1, "Mean"]), 20)
})

test_that("describe_distribution - dataframe: correctly handles missing values", {
no_missing <- describe_distribution(mtcars)
test <- mtcars
test[1, ] <- NA
with_missing <- describe_distribution(test)
expect_equal(unique(with_missing$n), 31)
expect_equal(unique(with_missing$n_Missing), 1)
expect_false(unique(with_missing$Mean == no_missing$Mean))
})

test_that("describe_distribution - dataframe: works with quartiles", {
x <- describe_distribution(mtcars, quartiles = TRUE)
expect_equal(dim(x), c(11, 12))
expect_true("Q1" %in% names(x))
expect_true("Q3" %in% names(x))
})

test_that("describe_distribution - dataframe: works with range", {
x <- describe_distribution(mtcars, range = FALSE)
expect_equal(dim(x), c(11, 8))
expect_false("min" %in% names(x))
expect_false("max" %in% names(x))
})



# factor ---------------------------------------

test_that("describe_distribution - factor", {
expect_snapshot(describe_distribution(factor(substring("statistics", 1:10, 1:10))))
})



# character ---------------------------------------

test_that("describe_distribution - character", {
expect_snapshot(describe_distribution(as.character(ToothGrowth$supp)))
})



# list ---------------------------------------

test_that("describe_distribution - list: works with basic list", {
x <- list(mtcars$mpg, mtcars$cyl)
stored <- describe_distribution(x)
unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl))
mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl))

expect_equal(dim(stored), c(2, 10))
expect_equal(round(stored$Mean), c(20, 6))
expect_equal(dim(unnamed), c(2, 10))
expect_equal(round(unnamed$Mean), c(20, 6))
expect_equal(dim(named), c(2, 10))
expect_equal(round(named$Mean), c(20, 6))
expect_equal(dim(mix), c(2, 10))
expect_equal(round(mix$Mean), c(20, 6))
})

test_that("describe_distribution - list: correctly handles variable names", {
x <- list(mtcars$mpg, mtcars$cyl)
stored <- describe_distribution(x)
unnamed <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
named <- describe_distribution(list(foo = mtcars$mpg, foo2 = mtcars$cyl))
mix <- describe_distribution(list(foo = mtcars$mpg, mtcars$cyl))

expect_equal(stored$Variable, c("Var_1", "Var_2"))
expect_equal(unnamed$Variable, c("mtcars$mpg", "mtcars$cyl"))
expect_equal(named$Variable, c("foo", "foo2"))
expect_equal(mix$Variable, c("foo", "mtcars$cyl"))
})

test_that("describe_distribution - list: correctly handles missing values", {
no_missing <- describe_distribution(list(mtcars$mpg, mtcars$cyl))
test <- mtcars$mpg
test2 <- mtcars$cyl
test[1] <- NA
test2[1] <- NA
with_missing <- describe_distribution(list(test, test2))
expect_equal(unique(with_missing$n), 31)
expect_equal(unique(with_missing$n_Missing), 1)
expect_false(unique(with_missing$Mean == no_missing$Mean))
})

test_that("describe_distribution - list: works with quartiles", {
x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), quartiles = TRUE)
expect_equal(dim(x), c(2, 12))
expect_true("Q1" %in% names(x))
expect_true("Q3" %in% names(x))
})

test_that("describe_distribution - list: works with range", {
x <- describe_distribution(list(mtcars$mpg, mtcars$cyl), range = FALSE)
expect_equal(dim(x), c(2, 8))
expect_false("min" %in% names(x))
expect_false("max" %in% names(x))
})