From 2d250250db7ebcd81c66884f3bc58a0720c423d8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 28 Mar 2022 16:20:08 -0700 Subject: [PATCH] New inner_combine_linter (#1012) * New inner_combine_linter * include lubridate calls * remove unusable link; test itself captures the reference well enough --- DESCRIPTION | 1 + NAMESPACE | 1 + NEWS.md | 1 + R/inner_combine_linter.R | 122 +++++++++++++++++++++ inst/lintr/linters.csv | 1 + man/consistency_linters.Rd | 1 + man/efficiency_linters.Rd | 1 + man/inner_combine_linter.Rd | 21 ++++ man/linters.Rd | 5 +- man/readability_linters.Rd | 1 + tests/testthat/test-inner_combine_linter.R | 94 ++++++++++++++++ 11 files changed, 247 insertions(+), 2 deletions(-) create mode 100644 R/inner_combine_linter.R create mode 100644 man/inner_combine_linter.Rd create mode 100644 tests/testthat/test-inner_combine_linter.R diff --git a/DESCRIPTION b/DESCRIPTION index 33528cb87..8aeb2318b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -90,6 +90,7 @@ Collate: 'ifelse_censor_linter.R' 'implicit_integer_linter.R' 'infix_spaces_linter.R' + 'inner_combine_linter.R' 'line_length_linter.R' 'lint.R' 'linter_tag_docs.R' diff --git a/NAMESPACE b/NAMESPACE index 3297f3dd7..5663327b5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ export(if_else_match_braces_linter) export(ifelse_censor_linter) export(implicit_integer_linter) export(infix_spaces_linter) +export(inner_combine_linter) export(line_length_linter) export(lint) export(lint_dir) diff --git a/NEWS.md b/NEWS.md index 4e9c8c43e..755a4af39 100644 --- a/NEWS.md +++ b/NEWS.md @@ -124,6 +124,7 @@ function calls. (#850, #851, @renkun-ken) * `consecutive_stopifnot_linter()` Require consecutive calls to `stopifnot()` to be unified into one * `ifelse_censor_linter()` Require usage of `pmax()` / `pmin()` where appropriate, e.g. `ifelse(x > y, x, y)` is `pmax(x, y)` * `system_file_linter()` Require file paths to be constructed by `system.file()` instead of calling `file.path()` directly + * `inner_combine_linter` Require inputs to vectorized functions to be combined first rather than later, e.g. `as.Date(c(x, y))` over `c(as.Date(x), as.Date(y))` * `assignment_linter()` now lints right assignment (`->` and `->>`) and gains two arguments. `allow_cascading_assign` (`TRUE` by default) toggles whether to lint `<<-` and `->>`; `allow_right_assign` toggles whether to lint `->` and `->>` (#915, @michaelchirico) * `infix_spaces_linter()` gains argument `exclude_operators` to disable lints on selected infix operators. By default, all "low-precedence" operators throw lints; see `?infix_spaces_linter` for an enumeration of these. (#914 @michaelchirico) * `infix_spaces_linter()` now throws a lint on `a~b` and `function(a=1) {}` (#930, @michaelchirico) diff --git a/R/inner_combine_linter.R b/R/inner_combine_linter.R new file mode 100644 index 000000000..81584f3e9 --- /dev/null +++ b/R/inner_combine_linter.R @@ -0,0 +1,122 @@ +#' Require c() to be applied before relatively expensive vectorized functions +#' +#' `as.Date(c(a, b))` is logically equivalent to `c(as.Date(a), as.Date(b))`; +#' ditto for the equivalence of several other vectorized functions like +#' [as.POSIXct()] and math functions like [sin()]. The former is to be +#' preferred so that the most expensive part of the operation ([as.Date()]) +#' is applied only once. +#' +#' @evalRd rd_tags("inner_combine_linter") +#' @seealso [linters] for a complete list of linters available in lintr. +#' @export +inner_combine_linter <- function() { + Linter(function(source_file) { + if (length(source_file$xml_parsed_content) == 0L) { + return(list()) + } + + xml <- source_file$xml_parsed_content + + # these don't take any other arguments (except maybe by non-default + # methods), so don't need to check equality of other arguments + no_arg_vectorized_funs <- c( + "sin", "cos", "tan", "sinpi", "cospi", "tanpi", "asin", "acos", "atan", + "log2", "log10", "log1p", "exp", "expm1", + "sqrt", "abs" + ) + + # TODO(michaelchirico): the need to spell out specific arguments is pretty brittle, + # but writing the xpath for the alternative case was proving too tricky. + # It's messy enough as is -- it may make sense to take another pass at + # writing the xpath from scratch to see if it can't be simplified. + + # See ?as.Date, ?as.POSIXct. tryFormats is not explicitly in any default + # POSIXct method, but it is in as.Date.character and as.POSIXlt.character -- + # the latter is what actually gets invoked when running as.POSIXct + # on a character. So it is indeed an argument by pass-through. + date_args <- c("format", "origin", "tz", "tryFormats") + date_funs <- c("as.Date", "as.POSIXct", "as.POSIXlt") + + # See ?log. Only these two take a 'base' argument. + log_funs <- c("log", "logb") + log_args <- "base" + + # See ?lubridate::ymd and ?lubridate::ymd_hms + lubridate_args <- c("quiet", "tz", "locale", "truncated") + lubridate_funs <- c( + "ymd", "ydm", "mdy", "myd", "dmy", "dym", + "yq", "ym", "my", + "ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm", "dmy_h", + "mdy_hms", "mdy_hm", "mdy_h", "ydm_hms", "ydm_hm", "ydm_h" + ) + + date_args_cond <- build_arg_condition(date_funs, date_args) + log_args_cond <- build_arg_condition(log_funs, log_args) + lubridate_args_cond <- build_arg_condition(lubridate_funs, lubridate_args) + + c_expr_cond <- xp_and( + sprintf( + "expr[SYMBOL_FUNCTION_CALL[%s]]", + xp_text_in_table(c(no_arg_vectorized_funs, date_funs, log_funs, lubridate_funs)) + ), + "not(following-sibling::expr[not(expr[SYMBOL_FUNCTION_CALL])])", + "not(expr/SYMBOL_FUNCTION_CALL != following-sibling::expr/expr/SYMBOL_FUNCTION_CALL)", + date_args_cond, + log_args_cond, + lubridate_args_cond + ) + xpath <- glue::glue("//expr[ + count(expr) > 2 + and expr[ + SYMBOL_FUNCTION_CALL[text() = 'c'] + and following-sibling::expr[1][ {c_expr_cond} ] + ] + ]") + + bad_expr <- xml2::xml_find_all(xml, xpath) + + return(lapply( + bad_expr, + xml_nodes_to_lint, + source_file = source_file, + lint_message = function(expr) { + matched_call <- xml2::xml_text(xml2::xml_find_first(expr, "expr/expr/SYMBOL_FUNCTION_CALL")) + message <- sprintf( + "%1$s(c(x, y)) only runs the more expensive %1$s() once as compared to c(%1$s(x), %1$s(y)).", + matched_call + ) + paste("Combine inputs to vectorized functions first to take full advantage of vectorization, e.g.,", message) + }, + type = "warning" + )) + }) +} + +#' Make the XPath condition ensuring an argument matches across calls +#' +#' @param arg Character scalar naming an argument +#' @noRd +arg_match_condition <- function(arg) { + this_symbol <- sprintf("SYMBOL_SUB[text() = '%s']", arg) + following_symbol <- sprintf("following-sibling::expr/%s", this_symbol) + next_expr <- "following-sibling::expr[1]" + return(xp_or( + sprintf("not(%s) and not(%s)", this_symbol, following_symbol), + xp_and( + this_symbol, + following_symbol, + sprintf( + "not(%1$s/%3$s != %2$s/%3$s)", + this_symbol, following_symbol, next_expr + ) + ) + )) +} + +build_arg_condition <- function(calls, arguments) { + xp_or( + sprintf("not(expr[SYMBOL_FUNCTION_CALL[%s]])", xp_text_in_table(calls)), + "not(SYMBOL_SUB) and not(following-sibling::expr/SYMBOL_SUB)", + xp_and(vapply(arguments, arg_match_condition, character(1L))) + ) +} diff --git a/inst/lintr/linters.csv b/inst/lintr/linters.csv index 711d3d5f1..860c0ae1e 100644 --- a/inst/lintr/linters.csv +++ b/inst/lintr/linters.csv @@ -32,6 +32,7 @@ if_else_match_braces_linter,default style readability ifelse_censor_linter,best_practices efficiency implicit_integer_linter,style consistency best_practices infix_spaces_linter,style readability default +inner_combine_linter,efficiency consistency readability line_length_linter,style readability default configurable literal_coercion_linter,best_practices consistency efficiency missing_argument_linter,correctness common_mistakes configurable diff --git a/man/consistency_linters.Rd b/man/consistency_linters.Rd index ce9f0aea3..78b9eeb17 100644 --- a/man/consistency_linters.Rd +++ b/man/consistency_linters.Rd @@ -18,6 +18,7 @@ The following linters are tagged with 'consistency': \item{\code{\link{condition_message_linter}}} \item{\code{\link{consecutive_stopifnot_linter}}} \item{\code{\link{implicit_integer_linter}}} +\item{\code{\link{inner_combine_linter}}} \item{\code{\link{literal_coercion_linter}}} \item{\code{\link{no_tab_linter}}} \item{\code{\link{numeric_leading_zero_linter}}} diff --git a/man/efficiency_linters.Rd b/man/efficiency_linters.Rd index c377e4d47..fd1a9165c 100644 --- a/man/efficiency_linters.Rd +++ b/man/efficiency_linters.Rd @@ -15,6 +15,7 @@ The following linters are tagged with 'efficiency': \item{\code{\link{any_duplicated_linter}}} \item{\code{\link{any_is_na_linter}}} \item{\code{\link{ifelse_censor_linter}}} +\item{\code{\link{inner_combine_linter}}} \item{\code{\link{literal_coercion_linter}}} \item{\code{\link{nested_ifelse_linter}}} \item{\code{\link{outer_negation_linter}}} diff --git a/man/inner_combine_linter.Rd b/man/inner_combine_linter.Rd new file mode 100644 index 000000000..156da8cee --- /dev/null +++ b/man/inner_combine_linter.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/inner_combine_linter.R +\name{inner_combine_linter} +\alias{inner_combine_linter} +\title{Require c() to be applied before relatively expensive vectorized functions} +\usage{ +inner_combine_linter() +} +\description{ +\code{as.Date(c(a, b))} is logically equivalent to \code{c(as.Date(a), as.Date(b))}; +ditto for the equivalence of several other vectorized functions like +\code{\link[=as.POSIXct]{as.POSIXct()}} and math functions like \code{\link[=sin]{sin()}}. The former is to be +preferred so that the most expensive part of the operation (\code{\link[=as.Date]{as.Date()}}) +is applied only once. +} +\seealso{ +\link{linters} for a complete list of linters available in lintr. +} +\section{Tags}{ +\link[=consistency_linters]{consistency}, \link[=efficiency_linters]{efficiency}, \link[=readability_linters]{readability} +} diff --git a/man/linters.Rd b/man/linters.Rd index c41344c6b..d734e341c 100644 --- a/man/linters.Rd +++ b/man/linters.Rd @@ -23,9 +23,9 @@ The following tags exist: \item{\link[=consistency_linters]{consistency} (15 linters)} \item{\link[=correctness_linters]{correctness} (7 linters)} \item{\link[=default_linters]{default} (28 linters)} -\item{\link[=efficiency_linters]{efficiency} (13 linters)} +\item{\link[=efficiency_linters]{efficiency} (14 linters)} \item{\link[=package_development_linters]{package_development} (14 linters)} -\item{\link[=readability_linters]{readability} (35 linters)} +\item{\link[=readability_linters]{readability} (36 linters)} \item{\link[=robustness_linters]{robustness} (11 linters)} \item{\link[=style_linters]{style} (36 linters)} } @@ -66,6 +66,7 @@ The following linters exist: \item{\code{\link{ifelse_censor_linter}} (tags: best_practices, efficiency)} \item{\code{\link{implicit_integer_linter}} (tags: best_practices, consistency, style)} \item{\code{\link{infix_spaces_linter}} (tags: default, readability, style)} +\item{\code{\link{inner_combine_linter}} (tags: consistency, efficiency, readability)} \item{\code{\link{line_length_linter}} (tags: configurable, default, readability, style)} \item{\code{\link{literal_coercion_linter}} (tags: best_practices, consistency, efficiency)} \item{\code{\link{missing_argument_linter}} (tags: common_mistakes, configurable, correctness)} diff --git a/man/readability_linters.Rd b/man/readability_linters.Rd index 2b2ec92dd..78746898f 100644 --- a/man/readability_linters.Rd +++ b/man/readability_linters.Rd @@ -27,6 +27,7 @@ The following linters are tagged with 'readability': \item{\code{\link{function_left_parentheses_linter}}} \item{\code{\link{if_else_match_braces_linter}}} \item{\code{\link{infix_spaces_linter}}} +\item{\code{\link{inner_combine_linter}}} \item{\code{\link{line_length_linter}}} \item{\code{\link{nested_ifelse_linter}}} \item{\code{\link{numeric_leading_zero_linter}}} diff --git a/tests/testthat/test-inner_combine_linter.R b/tests/testthat/test-inner_combine_linter.R new file mode 100644 index 000000000..db9bed52a --- /dev/null +++ b/tests/testthat/test-inner_combine_linter.R @@ -0,0 +1,94 @@ +test_that("inner_combine_linter lints a false positive-ish usage", { + # By default as.POSIXct.character picks up the format to apply from + # the first element and, since it succeeds, applies that to the remaining + # timestamps. Whereas when run individually, it won't succeed until + # the correct format is matched for each input. Nevertheless, it is + # still preferable to vectorize the call, while being sure to use a + # consistent format for the inputs. In this case, the correct equivalent + # call is as.POSIXct(c("2021-01-01 00:00:00", "2021-01-01 01:00:00")). + expect_lint( + "c(as.POSIXct('2021-01-01'), as.POSIXct('2021-01-01 01:00:00'))", + rex::rex("Combine inputs to vectorized functions first"), + inner_combine_linter() + ) +}) + +skip_if_not_installed("patrick") +local({ + vector_funs <- c( + "as.Date", "as.POSIXct", "as.POSIXlt", + "sin", "cos", "tan", "sinpi", "cospi", "tanpi", "asin", "acos", "atan", + "log", "logb", "log2", "log10", "log1p", "exp", "expm1", + "sqrt", "abs", + "ymd", "ydm", "mdy", "myd", "dmy", "dym", + "yq", "ym", "my", + "ymd_hms", "ymd_hm", "ymd_h", "dmy_hms", "dmy_hm", "dmy_h", + "mdy_hms", "mdy_hm", "mdy_h", "ydm_hms", "ydm_hm", "ydm_h", + NULL + ) + patrick::with_parameters_test_that( + "inner_combine_linter blocks simple vectorized calls:", + expect_lint( + sprintf("c(%1$s(x), %1$s(y))", vector_fun), + rex::rex("Combine inputs to vectorized functions first"), + inner_combine_linter() + ), + .test_name = vector_funs, + vector_fun = vector_funs + ) +}) + +patrick::with_parameters_test_that( + "inner_combine_linter blocks as.Date with identical passed arguments:", + expect_lint( + sprintf("c(as.Date(x, %1$s), as.Date(y, %1$s))", arg), + rex::rex("Combine inputs to vectorized functions first"), + inner_combine_linter() + ), + .test_name = c("format", "origin", "tz", "tryFormats", "non-literal"), + arg = c("format = '%F'", "origin = '1900-01-01'", "tz = 'Asia/Jakarta'", "tryFormats = '%F'", "tz = tz") +) + +patrick::with_parameters_test_that( + "inner_combine_linter blocks as.POSIXct with identical passed arguments:", + expect_lint( + sprintf("c(as.POSIXct(x, %1$s), as.POSIXct(y, %1$s))", arg), + rex::rex("Combine inputs to vectorized functions first"), + inner_combine_linter() + ), + .test_name = c("format", "origin", "tz", "non-literal"), + arg = c("format = '%F'", "origin = '1900-01-01'", "tz = 'UTC'", "tz = tz") +) + +test_that("inner_combine_linter is order-agnostic for matching arguments", { + expect_lint( + "c(as.Date(x, format = f, tz = t), as.Date(y, tz = t, format = f))", + rex::rex("Combine inputs to vectorized functions first"), + inner_combine_linter() + ) +}) + +skip_if_not_installed("tibble") +patrick::with_parameters_test_that( + "inner_combine_linter skips allowed usages:", + expect_lint(expr, NULL, inner_combine_linter()), + .cases = tibble::tribble( + ~.test_name, ~expr, + "simple sin()", "x <- sin(1:10)", + "mixed sin()+cos()", "y <- c(sin(1:10), cos(2:20))", + "present/absent vector function", "c(log(x), 0.5)", + "absent/present vector function", "c(0.5, log(x))", + "mismatched arg (Date)", "c(as.Date(x, format = '%y'), as.Date(y, format = '%m'))", + "present/absent arg (Date)", "c(as.Date(x, format = '%y'), as.Date(y))", + "absent/present arg (Date)", "c(as.Date(x), as.Date(y, format = '%y'))", + "matched value, not arg (Date)", "c(as.Date(x, format = '%y'), as.Date(y, tz = '%y'))", + "mismatched arg (POSIXct)", "c(as.POSIXct(x, format = '%y'), as.POSIXct(y, format = '%m'))", + "present/absent arg (POSIXct)", "c(as.POSIXct(x, format = '%y'), as.POSIXct(y))", + "mismatched arg (log)", "c(log(x, base = 4), log(y, base = 5))", + "present/absent arg (log)", "c(log(x, base = 4), log(y))" + # TODO(michaelchirico): fix the code so these edge cases are covered + # "unknown Date method argument", "c(as.Date(x, zoo = zzz), as.Date(y, zoo = zzz))", + # "known+unknown Date argument", "c(as.Date(x, format = '%y', zoo = zzz), as.Date(y, format = '%y', zoo = zzz))", + # "unknown POSIXct method argument", "c(as.POSIXct(x, zoo = zzz), as.POSIXct(y, zoo = zzz))", + ) +)