easystats · strengejacke · May 31, 2024 · May 30, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: report
 Type: Package
 Title: Automated Reporting of Results and Statistical Models
-Version: 0.5.8.3
+Version: 0.5.8.4
 Authors@R:
     c(person(given = "Dominique",
              family = "Makowski",
@@ -148,6 +148,7 @@ Collate:
     'report_table.R'
     'utils_error_message.R'
     'utils_grouped_df.R'
+    'utils_misspelled_variables.R'
     'zzz.R'
 Roxygen: list(markdown = TRUE)
 Remotes: easystats/insight, easystats/datawizard, easystats/parameters, easystats/performance, easystats/modelbased
diff --git a/NEWS.md b/NEWS.md
@@ -13,6 +13,7 @@ Minor changes
 
 * `report` now supports reporting of Bayesian model comparison with variables of class `brms::loo_compare`.
 * `report` now supports reporting of BayesFactor objects with variables of class `BFBayesFactor`.
+* `report_sample()` now suggests valid column names for misspelled columns in the `select`, `by`, `weights` and `exclude` arguments.
 
 # report 0.5.8
 

diff --git a/R/report_sample.R b/R/report_sample.R
@@ -114,6 +114,12 @@
     select <- colnames(data)[select]
   }
 
+  # sanity check for existing columns
+  .check_spelling(data, select)
+  .check_spelling(data, exclude)
+  .check_spelling(data, by)
+  .check_spelling(data, weights)
+
   # variables to keep
   if (!is.null(weights)) {
     select <- unique(c(select, weights))
@@ -186,7 +192,7 @@
    # remember values of first columns
    variable <- result[[1]]["Variable"]
    # number of observation, based on weights
    if (!is.null(weights)) {
      n_obs <- round(as.vector(stats::xtabs(data[[weights]] ~ data[[by]])))
    } else {
      n_obs <- as.vector(table(data[by]))
@@ -218,7 +224,7 @@
      final$Total <- NULL
    }
    # define total N, based on weights
    if (!is.null(weights)) {
      total_n <- round(sum(as.vector(table(data[by]))) * mean(data[[weights]], na.rm = TRUE))
    } else {
      total_n <- sum(as.vector(table(data[by])))
@@ -259,7 +265,7 @@
                                        ci = NULL,
                                        ci_method = "wilson",
                                        ci_correct = FALSE) {
  if (!is.null(weights)) {
    w <- x[[weights]]
    columns <- setdiff(colnames(x), weights)
  } else {
@@ -338,7 +344,7 @@
    x <- as.factor(x)
  }

  if (!is.null(weights)) {
    x[is.na(weights)] <- NA
    weights[is.na(x)] <- NA
    weights <- stats::na.omit(weights)
@@ -354,7 +360,7 @@
  }

  # CI for proportions?
  if (!is.null(ci)) {
    ci_low_high <- .ci_proportion(x, table_proportions, weights, ci, ci_method, ci_correct)
    .summary <- sprintf(
      "%.1f [%.1f, %.1f]",

diff --git a/R/utils_misspelled_variables.R b/R/utils_misspelled_variables.R
@@ -0,0 +1,75 @@
+# Check that user-specified column names exist in `data`.
+#
+# `data` is the data frame to look up column names in. `select` holds the
+# column name(s) the user passed; `NULL` and empty strings are ignored. The
+# argument name shown in the error message is taken from the expression that
+# was passed as `select` (via `substitute()`), so call this function with the
+# argument variable itself, e.g. `.check_spelling(data, by)`.
+#
+# Throws a formatted error that lists all names not found in `data`, together
+# with suggestions for possibly intended names. Returns `NULL` invisibly if
+# every name exists.
+.check_spelling <- function(data, select) {
+  wrong_arg <- paste0("specified in `", deparse(substitute(select)), "` ")
+  if (is.null(select)) {
+    return(invisible(NULL))
+  }
+  # validate each element, so that vectors of several names are checked, too
+  # (`isTRUE(nzchar(select))` would be FALSE for any vector of length > 1)
+  candidates <- select[nzchar(select)]
+  not_found <- candidates[!candidates %in% colnames(data)]
+  if (length(not_found) > 0) {
+    insight::format_error(
+      paste0(
+        sprintf("The following column(s) %sdon't exist in the dataset: ", wrong_arg),
+        datawizard::text_concatenate(not_found), "."
+      ),
+      .misspelled_string(colnames(data), not_found, "Possibly misspelled?")
+    )
+  }
+  invisible(NULL)
+}
+
+
+#' Fuzzy grep, matches patterns that are close, but not identical
+#'
+#' @param x Character vector to search in.
+#' @param pattern Single string to look for. It is matched literally, i.e.
+#'   regular expression metacharacters in `pattern` are escaped.
+#' @param precision Number inserted into the `{~n}` approximate-matching
+#'   quantifier of the regular expression. If `NULL`, a third of the number
+#'   of characters of `pattern` (rounded) is used.
+#'
+#' @return Integer vector with the indices of matching elements in `x`, or
+#'   `NULL` if `pattern` is missing or `precision` exceeds the length of
+#'   `pattern`.
+#'
+#' @examples
+#' colnames(iris)
+#' p <- sprintf("(%s){~%i}", "Spela", 2)
+#' grep(pattern = p, x = colnames(iris), ignore.case = FALSE)
+#' @keywords internal
+#' @noRd
+.fuzzy_grep <- function(x, pattern, precision = NULL) {
+  # a missing search term cannot match anything
+  if (anyNA(pattern)) {
+    return(NULL)
+  }
+  n_chars <- nchar(pattern)
+  if (is.null(precision)) {
+    precision <- round(n_chars / 3)
+  }
+  if (precision > n_chars) {
+    return(NULL)
+  }
+  # escape regex metacharacters, so that search terms like "age(" are treated
+  # as literal text and don't produce an invalid regular expression
+  literal <- gsub("([\\\\.|()\\[\\]{}^$*+?])", "\\\\\\1", pattern, perl = TRUE)
+  p <- sprintf("(%s){~%i}", literal, precision)
+  grep(pattern = p, x = x, ignore.case = FALSE)
+}
+
+
+#' Create a message string to tell the user about matches that could possibly
+#' be the string they were looking for
+#'
+#' @param source Character vector of valid strings (e.g. column names) in
+#'   which close matches are searched.
+#' @param searchterm Character vector of strings to find close matches for.
+#' @param default_message Message that is used when `searchterm` is empty or
+#'   no close match was found.
+#'
+#' @return A string of the form `Did you mean ...?`, listing at most five
+#'   suggestions, or `default_message` if there is nothing to suggest.
+#'
+#' @keywords internal
+#' @noRd
+.misspelled_string <- function(source, searchterm, default_message = NULL) {
+  if (is.null(searchterm) || length(searchterm) < 1) {
+    return(default_message)
+  }
+  # used for many matches
+  more_found <- ""
+  # guess the misspelled string. different search terms may match the same
+  # candidate, so each suggestion is kept only once
+  possible_strings <- unique(unlist(lapply(searchterm, function(s) {
+    source[.fuzzy_grep(source, s)] # nolint
+  }), use.names = FALSE))
+  if (length(possible_strings) > 0) {
+    msg <- "Did you mean "
+    if (length(possible_strings) > 1) {
+      # make sure we don't print dozens of alternatives for larger data frames
+      if (length(possible_strings) > 5) {
+        more_found <- sprintf(
+          " We even found %i more possible matches, not shown here.",
+          length(possible_strings) - 5
+        )
+        possible_strings <- possible_strings[seq_len(5)]
+      }
+      msg <- paste0(
+        msg,
+        "one of ",
+        datawizard::text_concatenate(possible_strings, enclose = "\"", last = " or ")
+      )
+    } else {
+      msg <- paste0(msg, "\"", possible_strings, "\"")
+    }
+    msg <- paste0(msg, "?", more_found)
+  } else {
+    msg <- default_message
+  }
+  # no double white space
+  insight::trim_ws(msg)
+}
diff --git a/tests/testthat/test-report_sample.R b/tests/testthat/test-report_sample.R
@@ -51,6 +51,8 @@ test_that("report_sample check input", {
   data(iris)
   expect_error(report_sample(lm(Sepal.Length ~ Species, data = iris)))
   expect_silent(report_sample(iris$Species))
+  expect_error(report_sample(iris, by = "Spedies"), regex = "The following column")
+  expect_error(report_sample(iris, select = "Spedies"), regex = "The following column")
 })
 
 test_that("report_sample default", {