From ff1dc21f28f1c06a7acdfa35ff0041b2b530b948 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 19 Nov 2023 13:53:27 -0800 Subject: [PATCH] New nrow_subset_linter (#2298) * New nrow_subset_linter * examples * remove commented code * documentation feedback * remove 'readability' tag * more tests --- DESCRIPTION | 1 + NAMESPACE | 1 + NEWS.md | 1 + R/nrow_subset_linter.R | 40 ++++++++++++++++++++++++ inst/lintr/linters.csv | 1 + man/best_practices_linters.Rd | 1 + man/consistency_linters.Rd | 1 + man/efficiency_linters.Rd | 1 + man/linters.Rd | 7 +++-- man/nrow_subset_linter.Rd | 36 +++++++++++++++++++++ tests/testthat/test-nrow_subset_linter.R | 31 ++++++++++++++++++ 11 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 R/nrow_subset_linter.R create mode 100644 man/nrow_subset_linter.Rd create mode 100644 tests/testthat/test-nrow_subset_linter.R diff --git a/DESCRIPTION b/DESCRIPTION index 626f65a3e..6e9bcfa02 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -139,6 +139,7 @@ Collate: 'namespace_linter.R' 'nested_ifelse_linter.R' 'nonportable_path_linter.R' + 'nrow_subset_linter.R' 'numeric_leading_zero_linter.R' 'nzchar_linter.R' 'object_length_linter.R' diff --git a/NAMESPACE b/NAMESPACE index b314774e4..604679f23 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -101,6 +101,7 @@ export(namespace_linter) export(nested_ifelse_linter) export(no_tab_linter) export(nonportable_path_linter) +export(nrow_subset_linter) export(numeric_leading_zero_linter) export(nzchar_linter) export(object_length_linter) diff --git a/NEWS.md b/NEWS.md index 25714bc50..ff2572d81 100644 --- a/NEWS.md +++ b/NEWS.md @@ -31,6 +31,7 @@ * `which_grepl_linter()` for discouraging `which(grepl(ptn, x))` in favor of directly using `grep(ptn, x)` (part of #884, @MichaelChirico). * `list_comparison_linter()` for discouraging comparisons on the output of `lapply()`, e.g. `lapply(x, sum) > 10` (part of #884, @MichaelChirico). * `print_linter()` for discouraging usage of `print()` on string literals like `print("Reached here")` or `print(paste("Found", nrow(DF), "rows."))` (#1894, @MichaelChirico). +* `nrow_subset_linter()` for discouraging usage like `nrow(subset(x, conditions))` in favor of something like `with(x, sum(conditions))` which doesn't require a full subset of `x` (part of #884, @MichaelChirico). * `pipe_return_linter()` for discouraging usage of `return()` inside a {magrittr} pipeline (part of #884, @MichaelChirico). ### Lint accuracy fixes: removing false positives diff --git a/R/nrow_subset_linter.R b/R/nrow_subset_linter.R new file mode 100644 index 000000000..eb731e579 --- /dev/null +++ b/R/nrow_subset_linter.R @@ -0,0 +1,40 @@ +#' Block usage of `nrow(subset(x, .))` +#' +#' Using `nrow(subset(x, condition))` to count the instances where `condition` +#' applies inefficiently requires doing a full subset of `x` just to +#' count the number of rows in the resulting subset. +#' There are a number of equivalent expressions that don't require the full +#' subset, e.g. `with(x, sum(condition))` (or, more generically, +#' `with(x, sum(condition, na.rm = TRUE))`). +#' +#' @examples +#' # will produce lints +#' lint( +#' text = "nrow(subset(x, is_treatment))", +#' linters = nrow_subset_linter() +#' ) +#' +#' # okay +#' lint( +#' text = "with(x, sum(is_treatment, na.rm = TRUE))", +#' linters = nrow_subset_linter() +#' ) +#' +#' @evalRd rd_tags("nrow_subset_linter") +#' @seealso [linters] for a complete list of linters available in lintr. +#' @export +nrow_subset_linter <- make_linter_from_xpath( + xpath = " + //SYMBOL_FUNCTION_CALL[text() = 'subset'] + /parent::expr + /parent::expr + /parent::expr[expr/SYMBOL_FUNCTION_CALL[text() = 'nrow']] + ", + lint_message = paste( + "Use arithmetic to count the number of rows satisfying a condition,", + "rather than fully subsetting the data.frame and counting the resulting rows.", + "For example, replace nrow(subset(x, is_treatment))", + "with sum(x$is_treatment). NB: use na.rm = TRUE if `is_treatment` has", + "missing values." + ) +) diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv index aee3578f0..4c1b31300 100644 --- a/inst/lintr/linters.csv +++ b/inst/lintr/linters.csv @@ -58,6 +58,7 @@ namespace_linter,correctness robustness configurable executing nested_ifelse_linter,efficiency readability no_tab_linter,style consistency deprecated nonportable_path_linter,robustness best_practices configurable +nrow_subset_linter,efficiency consistency best_practices numeric_leading_zero_linter,style consistency readability nzchar_linter,efficiency best_practices consistency object_length_linter,style readability default configurable executing diff --git a/man/best_practices_linters.Rd b/man/best_practices_linters.Rd index 7e372d031..a2a0ae49f 100644 --- a/man/best_practices_linters.Rd +++ b/man/best_practices_linters.Rd @@ -46,6 +46,7 @@ The following linters are tagged with 'best_practices': \item{\code{\link{list_comparison_linter}}} \item{\code{\link{literal_coercion_linter}}} \item{\code{\link{nonportable_path_linter}}} +\item{\code{\link{nrow_subset_linter}}} \item{\code{\link{nzchar_linter}}} \item{\code{\link{object_overwrite_linter}}} \item{\code{\link{outer_negation_linter}}} diff --git a/man/consistency_linters.Rd b/man/consistency_linters.Rd index 3e3e0cbdb..24ae08f8d 100644 --- a/man/consistency_linters.Rd +++ b/man/consistency_linters.Rd @@ -26,6 +26,7 @@ The following linters are tagged with 'consistency': \item{\code{\link{keyword_quote_linter}}} \item{\code{\link{length_levels_linter}}} \item{\code{\link{literal_coercion_linter}}} +\item{\code{\link{nrow_subset_linter}}} \item{\code{\link{numeric_leading_zero_linter}}} \item{\code{\link{nzchar_linter}}} \item{\code{\link{object_name_linter}}} diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd index 18e4e8a09..808cf7a10 100644 --- a/man/efficiency_linters.Rd +++ b/man/efficiency_linters.Rd @@ -24,6 +24,7 @@ The following linters are tagged with 'efficiency': \item{\code{\link{literal_coercion_linter}}} \item{\code{\link{matrix_apply_linter}}} \item{\code{\link{nested_ifelse_linter}}} +\item{\code{\link{nrow_subset_linter}}} \item{\code{\link{nzchar_linter}}} \item{\code{\link{outer_negation_linter}}} \item{\code{\link{redundant_equals_linter}}} diff --git a/man/linters.Rd b/man/linters.Rd index f62f20f2a..ba0644116 100644 --- a/man/linters.Rd +++ b/man/linters.Rd @@ -17,14 +17,14 @@ see also \code{\link[=available_tags]{available_tags()}}. \section{Tags}{ The following tags exist: \itemize{ -\item{\link[=best_practices_linters]{best_practices} (61 linters)} +\item{\link[=best_practices_linters]{best_practices} (62 linters)} \item{\link[=common_mistakes_linters]{common_mistakes} (10 linters)} \item{\link[=configurable_linters]{configurable} (35 linters)} -\item{\link[=consistency_linters]{consistency} (27 linters)} +\item{\link[=consistency_linters]{consistency} (28 linters)} \item{\link[=correctness_linters]{correctness} (7 linters)} \item{\link[=default_linters]{default} (25 linters)} \item{\link[=deprecated_linters]{deprecated} (4 linters)} -\item{\link[=efficiency_linters]{efficiency} (29 linters)} +\item{\link[=efficiency_linters]{efficiency} (30 linters)} \item{\link[=executing_linters]{executing} (6 linters)} \item{\link[=package_development_linters]{package_development} (14 linters)} \item{\link[=pkg_testthat_linters]{pkg_testthat} (12 linters)} @@ -93,6 +93,7 @@ The following linters exist: \item{\code{\link{namespace_linter}} (tags: configurable, correctness, executing, robustness)} \item{\code{\link{nested_ifelse_linter}} (tags: efficiency, readability)} \item{\code{\link{nonportable_path_linter}} (tags: best_practices, configurable, robustness)} +\item{\code{\link{nrow_subset_linter}} (tags: best_practices, consistency, efficiency)} \item{\code{\link{numeric_leading_zero_linter}} (tags: consistency, readability, style)} \item{\code{\link{nzchar_linter}} (tags: best_practices, consistency, efficiency)} \item{\code{\link{object_length_linter}} (tags: configurable, default, executing, readability, style)} diff --git a/man/nrow_subset_linter.Rd b/man/nrow_subset_linter.Rd new file mode 100644 index 000000000..3e2edcb2a --- /dev/null +++ b/man/nrow_subset_linter.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nrow_subset_linter.R +\name{nrow_subset_linter} +\alias{nrow_subset_linter} +\title{Block usage of \code{nrow(subset(x, .))}} +\usage{ +nrow_subset_linter() +} +\description{ +Using \code{nrow(subset(x, condition))} to count the instances where \code{condition} +applies inefficiently requires doing a full subset of \code{x} just to +count the number of rows in the resulting subset. +There are a number of equivalent expressions that don't require the full +subset, e.g. \code{with(x, sum(condition))} (or, more generically, +\code{with(x, sum(condition, na.rm = TRUE))}). +} +\examples{ +# will produce lints +lint( + text = "nrow(subset(x, is_treatment))", + linters = nrow_subset_linter() +) + +# okay +lint( + text = "with(x, sum(is_treatment, na.rm = TRUE))", + linters = nrow_subset_linter() +) + +} +\seealso{ +\link{linters} for a complete list of linters available in lintr. +} +\section{Tags}{ +\link[=best_practices_linters]{best_practices}, \link[=consistency_linters]{consistency}, \link[=efficiency_linters]{efficiency} +} diff --git a/tests/testthat/test-nrow_subset_linter.R b/tests/testthat/test-nrow_subset_linter.R new file mode 100644 index 000000000..18a3e45d8 --- /dev/null +++ b/tests/testthat/test-nrow_subset_linter.R @@ -0,0 +1,31 @@ +test_that("nrow_subset_linter skips allowed usage", { + linter <- nrow_subset_linter() + + expect_lint("nrow(foo(subset(x, y == z)))", NULL, linter) + expect_lint("with(x, sum(y == z))", NULL, linter) +}) + +test_that("nrow_subset_linter blocks subset() cases", { + expect_lint( + "nrow(subset(x, y == z))", + rex::rex("Use arithmetic to count the number of rows satisfying a condition"), + nrow_subset_linter() + ) +}) + +test_that("lints vectorize", { + lint_msg <- rex::rex("Use arithmetic to count the number of rows satisfying a condition") + + expect_lint( + trim_some("{ + nrow(subset(x, y == z)) + subset(x) %>% transform(m = 2) + nrow(subset(a, b == c)) + }"), + list( + list(lint_msg, line_number = 2L), + list(lint_msg, line_number = 4L) + ), + nrow_subset_linter() + ) +})