From 7e7fce62493d74f70a82841ac0b6766595b1cc60 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 18 Aug 2022 11:08:28 +0000 Subject: [PATCH 01/28] Remove unnecessary variable --- R/verb-joins.R | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index 561beff0b..6b26e30d9 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -97,7 +97,7 @@ left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, auto_index = FALSE, ..., sql_on = NULL, na_matches = c("never", "na"), x_as = NULL, y_as = NULL) { - lazy_query <- add_join( + x$lazy_query <- add_join( x, y, "left", by = by, @@ -111,7 +111,6 @@ left_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, ... ) - x$lazy_query <- lazy_query x } @@ -123,7 +122,7 @@ right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, auto_index = FALSE, ..., sql_on = NULL, na_matches = c("never", "na"), x_as = NULL, y_as = NULL) { - lazy_query <- add_join( + x$lazy_query <- add_join( x, y, "right", by = by, @@ -137,7 +136,6 @@ right_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, ... ) - x$lazy_query <- lazy_query x } @@ -149,7 +147,7 @@ full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, auto_index = FALSE, ..., sql_on = NULL, na_matches = c("never", "na"), x_as = NULL, y_as = NULL) { - lazy_query <- add_join( + x$lazy_query <- add_join( x, y, "full", by = by, @@ -163,7 +161,6 @@ full_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, ... ) - x$lazy_query <- lazy_query x } @@ -174,7 +171,7 @@ semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, auto_index = FALSE, ..., sql_on = NULL, na_matches = c("never", "na"), x_as = NULL, y_as = NULL) { - lazy_query <- add_semi_join( + x$lazy_query <- add_semi_join( x, y, anti = FALSE, by = by, @@ -187,7 +184,6 @@ semi_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, ... ) - x$lazy_query <- lazy_query x } @@ -198,7 +194,7 @@ anti_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, auto_index = FALSE, ..., sql_on = NULL, na_matches = c("never", "na"), x_as = NULL, y_as = NULL) { - lazy_query <- add_semi_join( + x$lazy_query <- add_semi_join( x, y, anti = TRUE, by = by, @@ -211,7 +207,6 @@ anti_join.tbl_lazy <- function(x, y, by = NULL, copy = FALSE, ... ) - x$lazy_query <- lazy_query x } From 42e4071b577b886e64bea328aee993f047538832 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Fri, 19 Aug 2022 08:12:59 +0200 Subject: [PATCH 02/28] Refactor `check_join_as()` --- R/verb-joins.R | 87 ++++++++++++++++++++++++++--- tests/testthat/_snaps/verb-joins.md | 16 ------ tests/testthat/test-verb-joins.R | 23 ++++---- 3 files changed, 91 insertions(+), 35 deletions(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index 6b26e30d9..9fc647da4 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -243,14 +243,15 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, # the table alias can only be determined after `select()` was inlined. # This works even though `by` is used in `inline_select_in_join()` and updated # because this does not touch `by$x_as` and `by$y_as`. - by[c("x_as", "y_as")] <- check_join_as( - x_as, - inlined_select_list$x, - y_as, - inlined_select_list$y, - sql_on = sql_on, - call = call + join_alias <- check_join_alias(x_as, y_as, sql_on, call) + + table_names <- c( + unclass(query_name(inlined_select_list$x)) %||% NA, + unclass(query_name(inlined_select_list$y)) %||% NA ) + by[c("x_as", "y_as")] <- join_simple_table_alias(table_names, join_alias) + by$x_as <- ident(by$x_as) + by$y_as <- ident(by$y_as) lazy_join_query( x = inlined_select_list$x, @@ -345,7 +346,15 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F } # the table alias can only be determined after `select()` was inlined - by[c("x_as", "y_as")] <- check_join_as(x_as, x_lq, y_as, y, sql_on = sql_on, call = call) + join_alias <- check_join_alias(x_as, y_as, sql_on, call) + + table_names <- c( + unclass(query_name(x_lq)) %||% NA, + unclass(query_name(y)) %||% NA + ) + by[c("x_as", "y_as")] <- join_simple_table_alias(table_names, join_alias) + by$x_as <- ident(by$x_as) + by$y_as <- ident(by$y_as) lazy_semi_join_query( x_lq, @@ -361,6 +370,31 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F ) } +check_join_alias <- function(x_as, y_as, sql_on, call) { + x_as <- check_join_as1(x_as, arg = "x_as", sql_on, default = "LHS", call) + y_as <- check_join_as1(y_as, arg = "y_as", sql_on, default = "RHS", call) + + if (identical(x_as, y_as) && !is.na(x_as)) { + cli_abort("{.arg y_as} must be different from {.arg x_as}.", call = call) + } + + list(x = x_as, y = y_as) +} + +check_join_as1 <- function(as, arg, sql_on, default, call) { + if (!is_null(as)) { + vctrs::vec_assert(as, character(), size = 1, arg = arg, call = call) + } + + if (!is_null(sql_on)) { + # for backwards compatibility use "LHS"/"RHS" if `sql_on` is used + # without a table alias + as <- as %||% default + } + + as %||% NA_character_ +} + check_join_as <- function(x_as, x, y_as, y, sql_on, call) { if (!is_null(x_as)) { vctrs::vec_assert(x_as, character(), size = 1, arg = "x_as", call = call) @@ -430,6 +464,43 @@ join_vars <- function(x_names, y_names, type, by, suffix = c(".x", ".y"), call = ) } +join_simple_table_alias <- function(table_names, aliases) { + stopifnot(length(table_names) == 2) + stopifnot(length(aliases) == 2) + aliases <- unlist(aliases) + + x_alias <- aliases[[1]] + y_alias <- aliases[[2]] + x_name <- table_names[[1]] + y_name <- table_names[[2]] + + x_out <- dplyr::coalesce(x_alias, x_name, "LHS") + y_out <- dplyr::coalesce(y_alias, y_name, "RHS") + out <- c(x_out, y_out) + + if (!identical(x_out, y_out)) { + return(out) + } + # -> must rename + + if (is.na(x_alias) && is.na(y_alias)) { + # self join of named table + if (!is.na(x_name) && !is.na(y_name) && identical(x_name, y_name)) { + out <- c( + paste0(x_name, "_LHS"), + paste0(y_name, "_RHS") + ) + return(out) + } + } + + out_repaired <- vctrs::vec_as_names(c(x_out, y_out), repair = "unique", quiet = TRUE) + may_repair <- is.na(c(x_alias, y_alias)) + out[may_repair] <- out_repaired[may_repair] + + out +} + check_suffix <- function(x, call) { vctrs::vec_assert(x, character(), size = 2, arg = "suffix", call = call) list(x = x[1], y = x[2]) diff --git a/tests/testthat/_snaps/verb-joins.md b/tests/testthat/_snaps/verb-joins.md index 474b43afa..ccbff076c 100644 --- a/tests/testthat/_snaps/verb-joins.md +++ b/tests/testthat/_snaps/verb-joins.md @@ -36,22 +36,6 @@ Error in `left_join()`: ! `y_as` must be different from `x_as`. ---- - - Code - left_join(x, y, by = "a", x_as = "y") - Condition - Error in `left_join()`: - ! `y_as` must be different from `x_as`. - ---- - - Code - left_join(x %>% filter(x == 1), x, by = "x", y_as = "LHS") - Condition - Error in `left_join()`: - ! `y_as` must be different from `x_as`. - # select() before join is inlined Code diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 2aa0c6b17..81eb90e13 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -159,22 +159,13 @@ test_that("join check `x_as` and `y_as`", { expect_snapshot(error = TRUE, left_join(x, x, by = "x", y_as = c("A", "B"))) expect_snapshot(error = TRUE, left_join(x, x, by = "x", x_as = "LHS", y_as = "LHS")) - - expect_snapshot(error = TRUE, left_join(x, y, by = "a", x_as = "y")) - expect_snapshot(error = TRUE, { - left_join( - x %>% filter(x == 1), - x, - by = "x", - y_as = "LHS" - ) - }) }) test_that("join uses correct table alias", { x <- lazy_frame(a = 1, x = 1, .name = "x") y <- lazy_frame(a = 1, y = 1, .name = "y") + # self joins by <- left_join(x, x, by = "a")$lazy_query$by expect_equal(by$x_as, ident("x_LHS")) expect_equal(by$y_as, ident("x_RHS")) @@ -191,7 +182,7 @@ test_that("join uses correct table alias", { expect_equal(by$x_as, ident("my_x")) expect_equal(by$y_as, ident("my_y")) - + # x-y joins by <- left_join(x, y, by = "a")$lazy_query$by expect_equal(by$x_as, ident("x")) expect_equal(by$y_as, ident("y")) @@ -208,6 +199,16 @@ test_that("join uses correct table alias", { expect_equal(by$x_as, ident("my_x")) expect_equal(by$y_as, ident("my_y")) + # x_as same name as `y` + by <- left_join(x, y, by = "a", x_as = "y")$lazy_query$by + expect_equal(by$x_as, ident("y")) + expect_equal(by$y_as, ident("y...2")) + + by <- left_join(x %>% filter(x == 1), x, by = "x", y_as = "LHS")$lazy_query$by + expect_equal(by$x_as, ident("LHS...1")) + expect_equal(by$y_as, ident("LHS")) + + # sql_on -> use alias or LHS/RHS by <- left_join(x, y, sql_on = sql("LHS.a = RHS.a"))$lazy_query$by expect_equal(by$x_as, ident("LHS")) expect_equal(by$y_as, ident("RHS")) From c953e0744cad331b24f505b70bcf40af61aa841c Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Fri, 19 Aug 2022 13:21:02 +0000 Subject: [PATCH 03/28] Use `NULL` instead of `NA` --- R/verb-joins.R | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index 9fc647da4..67c78c581 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -245,11 +245,9 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, # because this does not touch `by$x_as` and `by$y_as`. join_alias <- check_join_alias(x_as, y_as, sql_on, call) - table_names <- c( - unclass(query_name(inlined_select_list$x)) %||% NA, - unclass(query_name(inlined_select_list$y)) %||% NA - ) - by[c("x_as", "y_as")] <- join_simple_table_alias(table_names, join_alias) + x_name <- unclass(query_name(inlined_select_list$x)) + y_name <- unclass(query_name(inlined_select_list$y)) + by[c("x_as", "y_as")] <- join_two_table_alias(x_name, y_name, join_alias$x, join_alias$y) by$x_as <- ident(by$x_as) by$y_as <- ident(by$y_as) @@ -348,11 +346,9 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F # the table alias can only be determined after `select()` was inlined join_alias <- check_join_alias(x_as, y_as, sql_on, call) - table_names <- c( - unclass(query_name(x_lq)) %||% NA, - unclass(query_name(y)) %||% NA - ) - by[c("x_as", "y_as")] <- join_simple_table_alias(table_names, join_alias) + x_name <- unclass(query_name(x_lq)) + y_name <- unclass(query_name(y)) + by[c("x_as", "y_as")] <- join_two_table_alias(x_name, y_name, join_alias$x, join_alias$y) by$x_as <- ident(by$x_as) by$y_as <- ident(by$y_as) @@ -374,7 +370,7 @@ check_join_alias <- function(x_as, y_as, sql_on, call) { x_as <- check_join_as1(x_as, arg = "x_as", sql_on, default = "LHS", call) y_as <- check_join_as1(y_as, arg = "y_as", sql_on, default = "RHS", call) - if (identical(x_as, y_as) && !is.na(x_as)) { + if (identical(x_as, y_as) && !is.null(x_as)) { cli_abort("{.arg y_as} must be different from {.arg x_as}.", call = call) } @@ -392,7 +388,7 @@ check_join_as1 <- function(as, arg, sql_on, default, call) { as <- as %||% default } - as %||% NA_character_ + as } check_join_as <- function(x_as, x, y_as, y, sql_on, call) { @@ -464,18 +460,14 @@ join_vars <- function(x_names, y_names, type, by, suffix = c(".x", ".y"), call = ) } -join_simple_table_alias <- function(table_names, aliases) { - stopifnot(length(table_names) == 2) - stopifnot(length(aliases) == 2) - aliases <- unlist(aliases) - - x_alias <- aliases[[1]] - y_alias <- aliases[[2]] - x_name <- table_names[[1]] - y_name <- table_names[[2]] +join_two_table_alias <- function(x_name, y_name, x_alias, y_alias) { + stopifnot(is.null(x_name) || is.character(x_name)) + stopifnot(is.null(y_name) || is.character(y_name)) + stopifnot(is.null(x_alias) || is.character(x_alias)) + stopifnot(is.null(y_alias) || is.character(y_alias)) - x_out <- dplyr::coalesce(x_alias, x_name, "LHS") - y_out <- dplyr::coalesce(y_alias, y_name, "RHS") + x_out <- x_alias %||% x_name %||% "LHS" + y_out <- y_alias %||% y_name %||% "RHS" out <- c(x_out, y_out) if (!identical(x_out, y_out)) { @@ -483,9 +475,9 @@ join_simple_table_alias <- function(table_names, aliases) { } # -> must rename - if (is.na(x_alias) && is.na(y_alias)) { + if (is_null(x_alias) && is_null(y_alias)) { # self join of named table - if (!is.na(x_name) && !is.na(y_name) && identical(x_name, y_name)) { + if (!is_null(x_name) && !is_null(y_name) && identical(x_name, y_name)) { out <- c( paste0(x_name, "_LHS"), paste0(y_name, "_RHS") @@ -495,7 +487,7 @@ join_simple_table_alias <- function(table_names, aliases) { } out_repaired <- vctrs::vec_as_names(c(x_out, y_out), repair = "unique", quiet = TRUE) - may_repair <- is.na(c(x_alias, y_alias)) + may_repair <- c(is_null(x_alias), is_null(y_alias)) out[may_repair] <- out_repaired[may_repair] out From a4fb7c50d555b52fc0835e4b5c13ec20a37f6a51 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Fri, 19 Aug 2022 15:04:01 +0200 Subject: [PATCH 04/28] Refactor semi join (#981) * Transfer attributes while inlining * Change semi-join vars to tibble * Document --- R/lazy-join-query.R | 15 +++++++-------- R/verb-joins.R | 18 +++++++++--------- R/verb-select.R | 3 ++- man/sql_build.Rd | 6 +++--- tests/testthat/test-verb-joins.R | 22 +++++++++++++++++++--- tests/testthat/test-verb-select.R | 5 ++++- 6 files changed, 44 insertions(+), 25 deletions(-) diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index e4a964e7b..76df8ab85 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -81,9 +81,9 @@ lazy_semi_join_query <- function(x, anti, by, na_matches = c("never", "na"), - group_vars = NULL, - order_vars = NULL, - frame = NULL, + group_vars = op_grps(x), + order_vars = op_sort(x), + frame = op_frame(x), call = caller_env()) { stopifnot(inherits(x, "lazy_query")) stopifnot(inherits(y, "lazy_query")) @@ -140,7 +140,7 @@ op_vars.lazy_join_query <- function(op) { } #' @export op_vars.lazy_semi_join_query <- function(op) { - names(op$vars) + op$vars$name } #' @export @@ -158,13 +158,12 @@ sql_build.lazy_join_query <- function(op, con, ...) { #' @export sql_build.lazy_semi_join_query <- function(op, con, ...) { - vars <- op$vars vars_prev <- op_vars(op$x) - if (identical(unname(vars), names(vars)) && - identical(unname(vars), vars_prev)) { + if (identical(op$vars$var, op$vars$name) && + identical(op$vars$var, vars_prev)) { vars <- sql("*") } else { - vars <- ident(vars) + vars <- ident(set_names(op$vars$var, op$vars$name)) } semi_join_query( diff --git a/R/verb-joins.R b/R/verb-joins.R index 67c78c581..21862d6b7 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -327,20 +327,23 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F indexes = if (auto_index) list(by$y) ) - vars <- set_names(op_vars(x)) - group_vars <- op_grps(x) - order_vars <- op_sort(x) - frame <- op_frame(x) + vars <- tibble( + name = op_vars(x), + var = op_vars(x) + ) x_lq <- x$lazy_query if (is_null(sql_on) && is_lazy_select_query_simple(x_lq, select = "projection")) { if (!is_select_identity(x_lq$select, op_vars(x_lq))) { by$x <- update_join_vars(by$x, x_lq$select) - vars <- purrr::map_chr(x_lq$select$expr, as_name) - vars <- purrr::set_names(vars, x_lq$select$name) + vars$var <- purrr::map_chr(x_lq$select$expr, as_name) + vars$name <- x_lq$select$name } x_lq <- x_lq$x + x_lq$group_vars <- op_grps(x) + x_lq$order_vars <- op_sort(x) + x_lq$frame <- op_frame(x) } # the table alias can only be determined after `select()` was inlined @@ -359,9 +362,6 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F anti = anti, by = by, na_matches = na_matches, - group_vars = group_vars, - order_vars = order_vars, - frame = frame, call = call ) } diff --git a/R/verb-select.R b/R/verb-select.R index 9aa8ca32c..56286833a 100644 --- a/R/verb-select.R +++ b/R/verb-select.R @@ -174,7 +174,8 @@ add_select <- function(.data, vars, op = c("select", "mutate")) { } if (inherits(lazy_query, "lazy_semi_join_query")) { - out$vars <- set_names(lazy_query$vars[idx], names(sel_vars)) + out$vars <- vctrs::vec_slice(out$vars, idx) + out$vars$name <- names(sel_vars) return(out) } diff --git a/man/sql_build.Rd b/man/sql_build.Rd index 00e45ddb1..ad569936f 100644 --- a/man/sql_build.Rd +++ b/man/sql_build.Rd @@ -38,9 +38,9 @@ lazy_semi_join_query( anti, by, na_matches = c("never", "na"), - group_vars = NULL, - order_vars = NULL, - frame = NULL, + group_vars = op_grps(x), + order_vars = op_sort(x), + frame = op_frame(x), call = caller_env() ) diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 81eb90e13..5311b05a0 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -354,7 +354,7 @@ test_that("select() before semi_join is inlined", { ) lq <- out_semi$lazy_query expect_equal(op_vars(out_semi), c("a2", "x")) - expect_equal(lq$vars, c(a2 = "a", x = "x1")) + expect_equal(lq$vars, tibble(name = c("a2", "x"), var = c("a", "x1"))) expect_equal(lq$by$x, "x1") expect_snapshot(out_semi) @@ -364,8 +364,24 @@ test_that("select() before semi_join is inlined", { by = "x" ) lq <- out_anti$lazy_query - expect_equal(lq$vars, c(a2 = "a", x = "x1")) + expect_equal(lq$vars, tibble(name = c("a2", "x"), var = c("a", "x1"))) expect_equal(lq$by$x, "x1") + + # attributes like `group`, `sort`, `frame` is kept + lf <- lazy_frame(x = 10, a = 1, b = 1, .name = "lf1") + lf2 <- lazy_frame(x = 10, .name = "lf2") + out_semi <- semi_join( + lf %>% + group_by(a) %>% + arrange(a) %>% + window_frame(0, 1) %>% + select(x, a), + lf2, + by = "x" + ) + expect_equal(op_grps(out_semi), "a") + expect_equal(op_sort(out_semi), list(quo(a)), ignore_formula_env = TRUE) + expect_equal(op_frame(out_semi), list(range = c(0, 1))) }) test_that("select() before join is not inlined when using `sql_on`", { @@ -380,7 +396,7 @@ test_that("select() before join is not inlined when using `sql_on`", { lq <- out$lazy_query expect_s3_class(lq$x, "lazy_select_query") - expect_equal(lq$vars, c(a2 = "a2", x = "x")) + expect_equal(lq$vars, tibble(name = c("a2", "x"), var = c("a2", "x"))) }) # sql_build --------------------------------------------------------------- diff --git a/tests/testthat/test-verb-select.R b/tests/testthat/test-verb-select.R index 2c3b3672b..385fcc94c 100644 --- a/tests/testthat/test-verb-select.R +++ b/tests/testthat/test-verb-select.R @@ -162,7 +162,10 @@ test_that("select() after join handles previous select", { expect_equal(op_vars(lf), c("x2", "y3", "z")) expect_equal( lf$lazy_query$vars, - c(x2 = "x", y3 = "y", z = "z") + tibble( + name = c("x2", "y3", "z"), + var = c("x", "y", "z") + ) ) expect_equal(op_grps(lf), c("x2", "y3", "z")) expect_snapshot(print(lf)) From 09d93d669346fd9520a505f54b948c62c8f3aada Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Fri, 19 Aug 2022 16:23:54 +0200 Subject: [PATCH 05/28] Add `types` to `copy_inline()` (#964) * Add `types` to `copy_inline()` * Remove commented code * Clarify `types` documentation * Remove incorrect `types` argumen * Check `types` argument * Test that `types` argument works * Remove unnecessary code --- NEWS.md | 3 + R/backend-mysql.R | 4 +- R/verb-copy-to.R | 79 +++++++++++++++++---------- man/copy_inline.Rd | 7 ++- tests/testthat/_snaps/verb-copy-to.md | 17 +++++- tests/testthat/test-verb-copy-to.R | 22 +++++++- 6 files changed, 95 insertions(+), 37 deletions(-) diff --git a/NEWS.md b/NEWS.md index 51f606c35..e8e434237 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # dbplyr (development version) +* `copy_inline()` gains a `types` argument to specify the SQL column types + (@mgirlich, #963). + * `summarise()` + `filter()` now uses the `HAVING` clause where possible (@mgirlich, #877). diff --git a/R/backend-mysql.R b/R/backend-mysql.R index d3ab9105f..01ecb35bd 100644 --- a/R/backend-mysql.R +++ b/R/backend-mysql.R @@ -132,8 +132,8 @@ sql_expr_matches.MySQL <- sql_expr_matches.MariaDBConnection sql_expr_matches.MySQLConnection <- sql_expr_matches.MariaDBConnection #' @export -sql_values_subquery.MariaDBConnection <- function(con, df, lvl = 0, ...) { - sql_values_subquery_default(con, df, lvl = lvl, row = TRUE) +sql_values_subquery.MariaDBConnection <- function(con, df, types, lvl = 0, ...) { + sql_values_subquery_default(con, df, types = types, lvl = lvl, row = TRUE) } #' @export diff --git a/R/verb-copy-to.R b/R/verb-copy-to.R index 18d7d2384..804295eb3 100644 --- a/R/verb-copy-to.R +++ b/R/verb-copy-to.R @@ -107,6 +107,10 @@ auto_copy.tbl_sql <- function(x, y, copy = FALSE, ...) { #' @param con A database connection. #' @param df A local data frame. The data is written directly in the SQL query #' so it should be small. +#' @param types A named character vector of SQL data types to use for the columns. +#' The data types are backend specific. For example for Postgres this could +#' be `c(id = "bigint", created_at = "timestamp", values = "integer[]")`. +#' If `NULL`, the default, the types are determined from `df`. #' @return A `tbl_lazy`. #' #' @examples @@ -116,7 +120,7 @@ auto_copy.tbl_sql <- function(x, y, copy = FALSE, ...) { #' copy_inline(con, df) #' #' copy_inline(con, df) %>% dplyr::show_query() -copy_inline <- function(con, df) { +copy_inline <- function(con, df, types = NULL) { if (!inherits(df, "data.frame")) { cli_abort("{.var df} needs to be a data.frame.") } @@ -125,21 +129,30 @@ copy_inline <- function(con, df) { cli_abort("{.var df} needs at least one column.") } + if (!is_null(types)) { + vctrs::vec_assert(types, character()) + + if (!setequal(colnames(df), names(types))) { + cli_abort("Names of {.arg df} and {.arg types} must be the same.") + } + } + # This workaround is needed because `tbl_sql()` applies `as.sql()` on `from` subclass <- class(con)[[1]] # prefix added by dplyr::make_tbl dplyr::make_tbl( c(subclass, "sql", "lazy"), src = src_dbi(con), from = df, - lazy_query = lazy_values_query(df), + lazy_query = lazy_values_query(df, types), vars = colnames(df) ) } -lazy_values_query <- function(df) { +lazy_values_query <- function(df, types) { lazy_query( query_type = "values", x = df, + col_types = types, group_vars = character(), order_vars = NULL, frame = NULL @@ -153,7 +166,7 @@ sql_build.lazy_values_query <- function(op, con, ...) { #' @export sql_render.lazy_values_query <- function(query, con = query$src$con, ..., subquery = FALSE, lvl = 0, cte = FALSE) { - sql_values_subquery(con, query$x, lvl = lvl) + sql_values_subquery(con, query$x, types = query$col_types, lvl = lvl) } #' @export @@ -166,20 +179,20 @@ op_vars.lazy_values_query <- function(op) { colnames(op$x) } -sql_values_subquery <- function(con, df, lvl = 0, ...) { +sql_values_subquery <- function(con, df, types, lvl = 0, ...) { check_dots_empty() UseMethod("sql_values_subquery") } #' @export -sql_values_subquery.DBIConnection <- function(con, df, lvl = 0, ...) { - sql_values_subquery_default(con, df, lvl = lvl, row = FALSE) +sql_values_subquery.DBIConnection <- function(con, df, types, lvl = 0, ...) { + sql_values_subquery_default(con, df, types = types, lvl = lvl, row = FALSE) } -sql_values_subquery_default <- function(con, df, lvl, row) { +sql_values_subquery_default <- function(con, df, types, lvl, row) { df <- values_prepare(con, df) if (nrow(df) == 0L) { - return(sql_values_zero_rows(con, df, lvl)) + return(sql_values_zero_rows(con, df, types, lvl)) } # The query consists of two parts: @@ -212,16 +225,16 @@ sql_values_subquery_default <- function(con, df, lvl, row) { sql_query_select( con, - select = sql_values_select(con, df), + select = sql_values_cast_clauses(con, df, types, na = FALSE), from = sql_subquery(con, subquery, name = "values_table", lvl = lvl), lvl = lvl ) } -sql_values_subquery_column_alias <- function(con, df, lvl) { +sql_values_subquery_column_alias <- function(con, df, types, lvl) { df <- values_prepare(con, df) if (nrow(df) == 0L) { - return(sql_values_zero_rows(con, df, lvl)) + return(sql_values_zero_rows(con, df, types, lvl)) } # The `SELECT` clause converts the values to the correct types. This needs @@ -247,7 +260,7 @@ sql_values_subquery_column_alias <- function(con, df, lvl) { sql_query_select( con, - select = sql_values_select(con, df), + select = sql_values_cast_clauses(con, df, types, na = FALSE), from = rows_query, lvl = lvl ) @@ -261,38 +274,44 @@ sql_values_clause <- function(con, df, row = FALSE) { list(sql_clause("VALUES", rows_sql)) } -sql_values_zero_rows <- function(con, df, lvl) { +sql_values_zero_rows <- function(con, df, types, lvl) { if (nrow(df) != 0L) { # TODO use `cli_abort()` after https://github.com/r-lib/rlang/issues/1386 # is fixed abort("`df` does not have 0 rows", .internal = TRUE) } - typed_cols <- purrr::map_chr( - vctrs::vec_init(df), - ~ { - cast_expr <- call2(sql_cast_dispatch(.x), NA) - translate_sql(!!cast_expr, con = con) - } - ) + typed_cols <- sql_values_cast_clauses(con, df, types, na = TRUE) query <- select_query( from = ident(), - select = sql(typed_cols), + select = typed_cols, where = sql("0 = 1") ) sql_render(query, con = con, lvl = lvl) } -sql_values_select <- function(con, df) { - typed_cols <- purrr::map2_chr( - df, colnames(df), - ~ { - cast_expr <- call2(sql_cast_dispatch(.x), ident(.y)) - translate_sql(!!cast_expr, con = con) - } - ) +sql_values_cast_clauses <- function(con, df, types, na) { + if (is_null(types)) { + typed_cols <- purrr::map2_chr( + df, colnames(df), + ~ { + val <- if (na) NA else ident(.y) + cast_expr <- call2(sql_cast_dispatch(.x), val) + translate_sql(!!cast_expr, con = con) + } + ) + } else { + typed_cols <- purrr::imap_chr( + types, + ~ { + val <- if (na) NA else ident(.y) + sql_expr(cast(!!val %as% !!sql(.x)), con = con) + } + ) + } + sql_vector(typed_cols, parens = FALSE, collapse = NULL, con = con) } diff --git a/man/copy_inline.Rd b/man/copy_inline.Rd index 5e1784f6b..c1880b92c 100644 --- a/man/copy_inline.Rd +++ b/man/copy_inline.Rd @@ -4,13 +4,18 @@ \alias{copy_inline} \title{Use a local data frame in a dbplyr query} \usage{ -copy_inline(con, df) +copy_inline(con, df, types = NULL) } \arguments{ \item{con}{A database connection.} \item{df}{A local data frame. The data is written directly in the SQL query so it should be small.} + +\item{types}{A named character vector of SQL data types to use for the columns. +The data types are backend specific. For example for Postgres this could +be \code{c(id = "bigint", created_at = "timestamp", values = "integer[]")}. +If \code{NULL}, the default, the types are determined from \code{df}.} } \value{ A \code{tbl_lazy}. diff --git a/tests/testthat/_snaps/verb-copy-to.md b/tests/testthat/_snaps/verb-copy-to.md index a7f31b0ad..086422f55 100644 --- a/tests/testthat/_snaps/verb-copy-to.md +++ b/tests/testthat/_snaps/verb-copy-to.md @@ -63,15 +63,28 @@ # checks inputs Code - (expect_error(copy_inline(simulate_dbi(), tibble()))) + (expect_error(copy_inline(con, tibble()))) Output Error in `copy_inline()`: ! `df` needs at least one column. Code - (expect_error(copy_inline(simulate_dbi(), lazy_frame(a = 1)))) + (expect_error(copy_inline(con, lazy_frame(a = 1)))) Output Error in `copy_inline()`: ! `df` needs to be a data.frame. + Code + (expect_error(copy_inline(con, tibble(a = 1), types = c(b = "bigint")))) + Output + + Error in `copy_inline()`: + ! Names of `df` and `types` must be the same. + Code + (expect_error(copy_inline(con, tibble(a = 1), types = c(b = 1)))) + Output + + Error in `copy_inline()`: + ! `types` must be a vector with type . + Instead, it has type . diff --git a/tests/testthat/test-verb-copy-to.R b/tests/testthat/test-verb-copy-to.R index b6667a3e2..a91b43bd1 100644 --- a/tests/testthat/test-verb-copy-to.R +++ b/tests/testthat/test-verb-copy-to.R @@ -118,9 +118,27 @@ test_that("zero row table works", { ) }) +test_that("types argument works", { + con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") + on.exit(DBI::dbDisconnect(con), add = TRUE) + + df <- tibble(x = "1", y = 2L) + expect_equal(copy_inline(con, df) %>% collect(), df) + + expect_equal( + copy_inline(con, df, types = c(x = "INTEGER", y = "TEXT")) %>% collect(), + tibble(x = 1L, y = "2") + ) +}) + test_that("checks inputs", { + con <- simulate_dbi() + expect_snapshot({ - (expect_error(copy_inline(simulate_dbi(), tibble()))) - (expect_error(copy_inline(simulate_dbi(), lazy_frame(a = 1)))) + (expect_error(copy_inline(con, tibble()))) + (expect_error(copy_inline(con, lazy_frame(a = 1)))) + + (expect_error(copy_inline(con, tibble(a = 1), types = c(b = "bigint")))) + (expect_error(copy_inline(con, tibble(a = 1), types = c(b = 1)))) }) }) From faf8d63c86e7d2ee662d6fd57c11628392b3d7dc Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Fri, 19 Aug 2022 16:45:10 +0200 Subject: [PATCH 06/28] `rows_*()` casts `y` columns if it copies them (#965) * `rows_*()` casts `y` columns if it copies them * Fix `name = NULL` case * Check containment before copying * Only use type inference for Postgres --- NAMESPACE | 3 ++ NEWS.md | 3 ++ R/rows.R | 58 ++++++++++++++++++++------ tests/testthat/test-backend-postgres.R | 43 +++++++++++++++++++ 4 files changed, 95 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7dc71351e..820c89c5a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -108,6 +108,9 @@ S3method(format,ident) S3method(format,sql) S3method(format,src_sql) S3method(full_join,tbl_lazy) +S3method(get_col_types,DBIConnection) +S3method(get_col_types,PqConnection) +S3method(get_col_types,TestConnection) S3method(group_by,tbl_lazy) S3method(group_size,tbl_sql) S3method(group_vars,tbl_lazy) diff --git a/NEWS.md b/NEWS.md index e8e434237..0e9702845 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # dbplyr (development version) +* `rows_*()` now uses the column types of `x` when copying a local data frame `y` + to a Postgres database (@mgirlich, #909). + * `copy_inline()` gains a `types` argument to specify the SQL column types (@mgirlich, #963). diff --git a/R/rows.R b/R/rows.R index 3befb7c9d..e1f08d25f 100644 --- a/R/rows.R +++ b/R/rows.R @@ -99,9 +99,8 @@ rows_insert.tbl_lazy <- function(x, conflict <- rows_check_conflict(conflict) - y <- auto_copy(x, y, copy = copy) - rows_check_containment(x, y) + y <- rows_auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) @@ -156,9 +155,8 @@ rows_append.tbl_lazy <- function(x, rows_check_in_place(x, in_place) name <- target_table_name(x, in_place) - y <- auto_copy(x, y, copy = copy) - rows_check_containment(x, y) + y <- rows_auto_copy(x, y, copy = copy) returning_cols <- rows_check_returning(x, returning, enexpr(returning)) @@ -204,9 +202,8 @@ rows_update.tbl_lazy <- function(x, rows_check_in_place(x, in_place) name <- target_table_name(x, in_place) - y <- auto_copy(x, y, copy = copy) - rows_check_containment(x, y) + y <- rows_auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) @@ -285,9 +282,8 @@ rows_patch.tbl_lazy <- function(x, rows_check_in_place(x, in_place) name <- target_table_name(x, in_place) - y <- auto_copy(x, y, copy = copy) - rows_check_containment(x, y) + y <- rows_auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) @@ -307,6 +303,7 @@ rows_patch.tbl_lazy <- function(x, } con <- remote_con(x) + update_cols <- setdiff(colnames(y), by) update_values <- sql_coalesce( sql_table_prefix(con, update_cols, name), @@ -374,9 +371,8 @@ rows_upsert.tbl_lazy <- function(x, rows_check_in_place(x, in_place) name <- target_table_name(x, in_place) - y <- auto_copy(x, y, copy = copy) - rows_check_containment(x, y) + y <- rows_auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) @@ -450,9 +446,8 @@ rows_delete.tbl_lazy <- function(x, rows_check_in_place(x, in_place) name <- target_table_name(x, in_place) - y <- auto_copy(x, y, copy = copy) - rows_check_containment(x, y) + y <- rows_auto_copy(x, y, copy = copy) by <- rows_check_by(by, y) @@ -775,6 +770,45 @@ rows_insert_prep <- function(con, x_name, y, by, lvl = 0) { out } +rows_auto_copy <- function(x, y, copy, call = caller_env()) { + name <- remote_name(x) + x_types <- get_col_types(remote_con(x), name, call) + + if (!is_null(x_types)) { + rows_check_containment(x, y, error_call = call) + x_types <- x_types[colnames(y)] + } + + auto_copy(x, y, copy = copy, types = x_types) +} + +get_col_types <- function(con, name, call) { + if (is_null(name)) { + return(NULL) + } + + UseMethod("get_col_types") +} + +#' @export +get_col_types.TestConnection <- function(con, name, call) { + NULL +} + +#' @export +get_col_types.DBIConnection <- function(con, name, call) { + NULL +} + +#' @export +get_col_types.PqConnection <- function(con, name, call) { + res <- DBI::dbSendQuery(con, paste0("SELECT * FROM ", name)) + on.exit(DBI::dbClearResult(res)) + DBI::dbFetch(res, n = 0) + col_info_df <- DBI::dbColumnInfo(res) + set_names(col_info_df[[".typname"]], col_info_df[["name"]]) +} + rows_get_or_execute <- function(x, sql, returning_cols, call = caller_env()) { con <- remote_con(x) msg <- "Can't modify database table {.val {remote_name(x)}}." diff --git a/tests/testthat/test-backend-postgres.R b/tests/testthat/test-backend-postgres.R index 6b8ebff2c..bb2db178c 100644 --- a/tests/testthat/test-backend-postgres.R +++ b/tests/testthat/test-backend-postgres.R @@ -228,6 +228,49 @@ test_that("can insert with returning", { ) }) +test_that("casts `y` column for local df", { + con <- src_test("postgres") + + DBI::dbWriteTable( + con, + "df_x", + value = tibble(id = 1L, val = 10L, arr = "{1,2}"), + field.types = c(id = "bigint", val = "bigint", arr = "integer[]") + ) + withr::defer(DBI::dbRemoveTable(con, DBI::SQL("df_x"))) + + y <- tibble( + id = "2", + val = 20, + arr = "{1, 2, 3}" + ) + + out <- tibble( + id = bit64::as.integer64(1:2), + val = bit64::as.integer64(10L, 20L), + arr = structure(c("{1,2}", "{1,2,3}"), class = "pq__int4") + ) + expect_equal( + rows_append( + tbl(con, "df_x"), + y, + copy = TRUE, + in_place = FALSE + ) %>% + collect(), + out + ) + + rows_append( + tbl(con, "df_x"), + y, + copy = TRUE, + in_place = TRUE + ) + + expect_equal(tbl(con, "df_x") %>% collect(), out) +}) + test_that("can upsert with returning", { con <- src_test("postgres") From e980aaaf190726a6f87cf438dbeaebae34d8e592 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Fri, 19 Aug 2022 15:55:40 +0000 Subject: [PATCH 07/28] Multiple joins in one query --- NAMESPACE | 8 + R/db-sql.R | 152 +++++++++++++ R/lazy-join-query.R | 125 +++++++++++ R/query-join.R | 162 ++++++-------- R/sql-build.R | 21 ++ R/verb-joins.R | 247 ++++++++++++++------ R/verb-select.R | 10 +- man/db-sql.Rd | 12 + man/sql_build.Rd | 12 + tests/testthat/_snaps/query-join.md | 18 +- tests/testthat/_snaps/verb-joins.md | 35 ++- tests/testthat/test-query-join.R | 12 +- tests/testthat/test-verb-joins.R | 334 ++++++++++++++++++---------- tests/testthat/test-verb-select.R | 9 +- 14 files changed, 851 insertions(+), 306 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 820c89c5a..d2e44d2b7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,6 +100,7 @@ S3method(explain,tbl_sql) S3method(flatten_query,ident) S3method(flatten_query,join_query) S3method(flatten_query,lazy_values_query) +S3method(flatten_query,multi_join_query) S3method(flatten_query,select_query) S3method(flatten_query,semi_join_query) S3method(flatten_query,set_op_query) @@ -134,6 +135,7 @@ S3method(op_sort,lazy_query) S3method(op_sort,tbl_lazy) S3method(op_vars,lazy_base_query) S3method(op_vars,lazy_join_query) +S3method(op_vars,lazy_multi_join_query) S3method(op_vars,lazy_query) S3method(op_vars,lazy_semi_join_query) S3method(op_vars,lazy_set_op_query) @@ -149,6 +151,7 @@ S3method(print,lazy_join_query) S3method(print,lazy_select_query) S3method(print,lazy_semi_join_query) S3method(print,lazy_set_op_query) +S3method(print,multi_join_query) S3method(print,select_query) S3method(print,semi_join_query) S3method(print,set_op_query) @@ -189,6 +192,7 @@ S3method(sql_build,ident) S3method(sql_build,lazy_base_local_query) S3method(sql_build,lazy_base_remote_query) S3method(sql_build,lazy_join_query) +S3method(sql_build,lazy_multi_join_query) S3method(sql_build,lazy_select_query) S3method(sql_build,lazy_semi_join_query) S3method(sql_build,lazy_set_op_query) @@ -259,6 +263,7 @@ S3method(sql_query_join,MariaDBConnection) S3method(sql_query_join,MySQL) S3method(sql_query_join,MySQLConnection) S3method(sql_query_join,SQLiteConnection) +S3method(sql_query_multi_join,DBIConnection) S3method(sql_query_rows,DBIConnection) S3method(sql_query_save,"Microsoft SQL Server") S3method(sql_query_save,DBIConnection) @@ -300,6 +305,7 @@ S3method(sql_render,ident) S3method(sql_render,join_query) S3method(sql_render,lazy_query) S3method(sql_render,lazy_values_query) +S3method(sql_render,multi_join_query) S3method(sql_render,select_query) S3method(sql_render,semi_join_query) S3method(sql_render,set_op_query) @@ -423,6 +429,7 @@ export(lahman_srcs) export(lazy_base_query) export(lazy_frame) export(lazy_join_query) +export(lazy_multi_join_query) export(lazy_query) export(lazy_select_query) export(lazy_semi_join_query) @@ -489,6 +496,7 @@ export(sql_query_explain) export(sql_query_fields) export(sql_query_insert) export(sql_query_join) +export(sql_query_multi_join) export(sql_query_rows) export(sql_query_save) export(sql_query_select) diff --git a/R/db-sql.R b/R/db-sql.R index 37eef1f23..4af78576f 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -387,6 +387,158 @@ sql_join.DBIConnection <- function(con, x, y, vars, type = "inner", by = NULL, n ) } +#' @rdname db-sql +#' @export +sql_query_multi_join <- function(con, + x, + joins, + table_names, + vars, + all_vars_list, + ..., + lvl = 0) { + UseMethod("sql_query_multi_join") +} + +#' @export +sql_query_multi_join.DBIConnection <- function(con, + x, + joins, + table_names, + vars, + all_vars_list, + ..., + lvl = 0) { + if (vctrs::vec_duplicate_any(table_names)) { + cli_abort("{.arg table_names} must be unique.") + } + + x_name <- ident(table_names[[1]]) + + select_sql <- sql_multi_join_vars(con, vars, table_names, all_vars_list) + + ons <- purrr::pmap( + vctrs::vec_cbind( + rhs = table_names[-1], + joins[c("by_x", "by_y", "on", "na_matches")] + ), + function(rhs, by_x, by_y, on, na_matches) { + if (!is.na(on)) { + return(sql(on)) + } + by <- list(x = ident(by_x), y = ident(by_y), x_as = x_name, y_as = ident(rhs)) + sql_join_tbls(con, by, na_matches = na_matches) + } + ) + + from <- dbplyr_sql_subquery(con, x, name = table_names[[1]], lvl = lvl) + join_table_queries <- purrr::map2( + joins$table, + table_names[-1], + ~ dbplyr_sql_subquery(con, .x, name = .y, lvl = lvl) + ) + types <- toupper(paste0(joins$type, " JOIN")) + join_clauses <- vctrs::vec_interleave( + purrr::map2(join_table_queries, types, ~ sql_clause(.y, .x)), + purrr::map(ons, ~ sql_clause("ON", .x, sep = " AND", parens = TRUE, lvl = 1)) + ) + + list2( + sql_clause_select(con, select_sql), + sql_clause_from(from), + !!!join_clauses + ) %>% + sql_format_clauses(lvl = lvl, con = con) +} + +sql_multi_join_vars <- function(con, vars, table_names, all_vars_list) { + all_vars <- tolower(unlist(all_vars_list)) + duplicated_vars <- all_vars[vctrs::vec_duplicate_detect(all_vars)] + duplicated_vars <- unique(duplicated_vars) + + # FIXME vectorise `sql_table_prefix()` (need to update `ident()` and friends for this...) + ns <- vctrs::list_sizes(vars$table) + if (any(ns > 1)) { + # special treatment for `full_join()` + out <- purrr::map2( + vars$table, vars$var, + function(table_ids, vars) { + if (length(table_ids) > 1) { + sql_expr( + COALESCE( + !!sql_table_prefix(con, vars[[1]], table = ident(table_names[[1]])), + !!sql_table_prefix(con, vars[[2]], table = ident(table_names[[2]])) + ), + con = con + ) + } else { + table_id <- table_ids[[1]] + var <- vars[[1]] + sql_multi_join_var(con, var, table_id, table_names, duplicated_vars) + } + } + ) + + out <- set_names(out, vars$name) + return(sql(unlist(out))) + } + + out <- rep_named(vars$name, list()) + vars$var <- vctrs::vec_unchop(vars$var) + vars$table <- vctrs::vec_unchop(vars$table) + + for (i in seq_along(table_names)) { + all_vars_current <- all_vars_list[[i]] + vars_idx <- which(vars$table == i) + used_vars_current <- vars$var[vars_idx] + out_vars_current <- vars$name[vars_idx] + + if (join_can_use_star(all_vars_current, used_vars_current, out_vars_current, vars_idx)) { + id <- vars_idx[[1]] + tbl_alias <- escape(ident(table_names[i]), con = con) + out[[id]] <- sql(paste0(tbl_alias, ".*")) + names(out)[id] <- "" + } else { + out[vars_idx] <- purrr::map2( + used_vars_current, i, + ~ sql_multi_join_var(con, .x, .y, table_names, duplicated_vars) + ) + + } + } + + out <- purrr::compact(out) + sql(unlist(out)) +} + +join_can_use_star <- function(all_vars, used_vars, out_vars, idx) { + # using `tbl.*` for a single variable is silly + if (length(all_vars) <= 1) { + return(FALSE) + } + + # all variables need to be used + if (!identical(used_vars, all_vars)) { + return(FALSE) + } + + # they must not be renamed + if (!identical(used_vars, out_vars)) { + return(FALSE) + } + + # the variables must form a sequence + all(diff(idx) == 1) +} + +sql_multi_join_var <- function(con, var, table_id, table_names, duplicated_vars) { + if (tolower(var) %in% duplicated_vars) { + sql_table_prefix(con, var, ident(table_names[[table_id]])) + } else { + sql_escape_ident(con, var) + } +} + #' @rdname db-sql #' @export sql_query_semi_join <- function(con, x, y, anti, by, vars, ..., lvl = 0) { diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index 76df8ab85..4304219c5 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -35,6 +35,51 @@ lazy_join_query <- function(x, ) } +#' @export +#' @rdname sql_build +lazy_multi_join_query <- function(x, + joins, + table_names, + vars, + group_vars = op_grps(x), + order_vars = op_sort(x), + frame = op_frame(x), + call = caller_env()) { + stopifnot(inherits(x, "lazy_query")) + + if (!identical(colnames(joins), c("table", "type", "by_x", "by_y", "on", "na_matches"))) { + cli_abort("`joins` must have fields `table`, `type`, `by_x`, `by_y`, `on`, `na_matches`", .internal = TRUE) + } + vctrs::vec_assert(joins$type, character(), arg = "joins$type", call = caller_env()) + vctrs::vec_assert(joins$on, character(), arg = "joins$on", call = caller_env()) + vctrs::vec_assert(joins$na_matches, character(), arg = "joins$na_matches", call = caller_env()) + + if (!identical(colnames(table_names), c("as", "name"))) { + cli_abort("`table_names` must have fields `as`, `name`", .internal = TRUE) + } + vctrs::vec_assert(table_names$as, character(), arg = "table_names$as", call = caller_env()) + vctrs::vec_assert(table_names$name, character(), arg = "table_names$as", call = caller_env()) + + if (!identical(colnames(vars), c("name", "table", "var"))) { + cli_abort("`vars` must have fields `name`, `table`, `var`", .internal = TRUE) + } + vctrs::vec_assert(vars$name, character(), arg = "vars$name", call = caller_env()) + vctrs::vec_assert(vars$table, list(), arg = "vars$table", call = caller_env()) + vctrs::vec_assert(vars$var, list(), arg = "vars$var", call = caller_env()) + + lazy_query( + query_type = "multi_join", + x = x, + joins = joins, + table_names = table_names, + vars = vars, + last_op = "join", + group_vars = group_vars, + order_vars = order_vars, + frame = frame + ) +} + join_check_vars <- function(vars, call) { if (!vctrs::vec_is_list(vars)) { # TODO use `cli_abort()` after https://github.com/r-lib/rlang/issues/1386 @@ -139,6 +184,10 @@ op_vars.lazy_join_query <- function(op) { op$vars$alias } #' @export +op_vars.lazy_multi_join_query <- function(op) { + op$vars$name +} +#' @export op_vars.lazy_semi_join_query <- function(op) { op$vars$name } @@ -156,6 +205,82 @@ sql_build.lazy_join_query <- function(op, con, ...) { ) } +#' @export +sql_build.lazy_multi_join_query <- function(op, con, ...) { + auto_name <- is.na(op$table_names$as) + table_names_out <- dplyr::coalesce(op$table_names$as, op$table_names$name) + if (length(table_names_out) == 2) { + na_to_null <- function(x) { + if (is.na(x)) { + NULL + } else { + x + } + } + + x_name <- na_to_null(op$table_names$name[[1]]) + y_name <- na_to_null(op$table_names$name[[2]]) + x_alias <- na_to_null(op$table_names$as[[1]]) + y_alias <- na_to_null(op$table_names$as[[2]]) + table_names_out <- join_two_table_alias(x_name, y_name, x_alias, y_alias) + + if (op$joins$type %in% c("full", "right")) { + # construct a classical `join_query()` so that special handling of + # full and right join continue to work + type <- op$joins$type + x_idx <- purrr::map_lgl(op$vars$table, ~ 1L %in% .x) + vars_x <- purrr::map2_chr(op$vars$var, x_idx, ~ {if (.y) .x[[1]] else NA_character_}) + + y_idx <- purrr::map_lgl(op$vars$table, ~ 2L %in% .x) + vars_y <- purrr::map2_chr(op$vars$var, y_idx, ~ {if (.y) dplyr::last(.x) %||% .x[[1]] else NA_character_}) + + + vars_classic <- list( + alias = op$vars$name, + x = vars_x, + y = vars_y, + all_x = op_vars(op$x), + all_y = op_vars(op$joins$table[[1]]) + ) + + out <- join_query( + sql_optimise(sql_build(op$x, con), con), + sql_optimise(sql_build(op$joins$table[[1]], con), con), + vars = vars_classic, + type = type, + by = list( + on = sql(op$joins$on), + x = op$joins$by_x[[1]], + y = op$joins$by_y[[1]], + x_as = ident(table_names_out[[1]]), + y_as = ident(table_names_out[[2]]) + ), + suffix = NULL, # it seems like the suffix is not used for rendering + na_matches = op$joins$na_matches + ) + return(out) + } + } else { + table_names_repaired <- vctrs::vec_as_names(table_names_out, repair = "unique", quiet = TRUE) + table_names_out[auto_name] <- table_names_repaired[auto_name] + } + + all_vars_list <- purrr::map( + c(list(op$x), op$joins$table), + op_vars + ) + + op$joins$table <- purrr::map(op$joins$table, ~ sql_optimise(sql_build(.x, con), con)) + + multi_join_query( + x = sql_optimise(sql_build(op$x, con), con), + joins = op$joins, + table_names = table_names_out, + vars = op$vars, + all_vars_list = all_vars_list + ) +} + #' @export sql_build.lazy_semi_join_query <- function(op, con, ...) { vars_prev <- op_vars(op$x) diff --git a/R/query-join.R b/R/query-join.R index ecbedb796..4d4c74a70 100644 --- a/R/query-join.R +++ b/R/query-join.R @@ -14,6 +14,19 @@ join_query <- function(x, y, vars, type = "inner", by = NULL, suffix = c(".x", " ) } +multi_join_query <- function(x, joins, table_names, vars, all_vars_list) { + structure( + list( + x = x, + joins = joins, + table_names = table_names, + vars = vars, + all_vars_list = all_vars_list + ), + class = c("multi_join_query", "query") + ) +} + #' @export print.join_query <- function(x, ...) { cat_line("") @@ -28,6 +41,24 @@ print.join_query <- function(x, ...) { cat_line(indent_print(sql_build(x$y))) } +#' @export +print.multi_join_query <- function(x, ...) { + cat_line("") + + cat_line("X:") + cat_line(indent_print(sql_build(x$x))) + + for (i in vctrs::vec_seq_along(x$joins)) { + cat_line("Type: ", paste0(x$joins$type[[i]])) + + cat_line("By:") + cat_line(indent(paste0(x$joins$by_x[[i]], "-", x$joins$by_y[[i]]))) + + cat_line("Y:") + cat_line(indent_print(sql_build(x$joins$table[[i]]))) + } +} + #' @export sql_render.join_query <- function(query, con = NULL, ..., subquery = FALSE, lvl = 0) { from_x <- sql_render(query$x, con, ..., subquery = TRUE, lvl = lvl + 1) @@ -42,102 +73,53 @@ sql_render.join_query <- function(query, con = NULL, ..., subquery = FALSE, lvl ) } -# SQL generation ---------------------------------------------------------- - - -sql_join_vars <- function(con, vars, x_as = ident("LHS"), y_as = ident("RHS"), type) { - join_vars_list <- mapply( - FUN = sql_join_var, - alias = vars$alias, - x = vars$x, - y = vars$y, - MoreArgs = list(con = con, all_x = vars$all_x, all_y = vars$all_y, x_as = x_as, y_as = y_as), - SIMPLIFY = FALSE, - USE.NAMES = TRUE +#' @export +sql_render.multi_join_query <- function(query, con = NULL, ..., subquery = FALSE, lvl = 0) { + x <- sql_render(query$x, con, ..., subquery = TRUE, lvl = lvl + 1) + query$joins$table <- purrr::map( + query$joins$table, + ~ sql_render(.x, con, ..., subquery = TRUE, lvl = lvl + 1) ) - x_start <- min(c(Inf, which(!is.na(vars$x)))) - y_start <- min(c(Inf, which(!is.na(vars$y)))) - - if (type == "left" || type == "inner") { - join_vars_list <- join_use_star(con, vars$alias, vars$x, vars$all_x, join_vars_list, x_as) - } else if (type == "right") { - join_vars_list <- join_use_star(con, vars$alias, vars$y, vars$all_y, join_vars_list, y_as) - } else if (type == "cross") { - # shrink `join_vars_list` from the back to not mess up the indices - if (x_start < y_start) { - join_vars_list <- join_use_star(con, vars$alias, vars$y, vars$all_y, join_vars_list, y_as) - join_vars_list <- join_use_star(con, vars$alias, vars$x, vars$all_x, join_vars_list, x_as) - } else { - join_vars_list <- join_use_star(con, vars$alias, vars$x, vars$all_x, join_vars_list, x_as) - join_vars_list <- join_use_star(con, vars$alias, vars$y, vars$all_y, join_vars_list, y_as) - } - } - - sql(unlist(join_vars_list)) -} - -sql_join_var <- function(con, alias, x, y, all_x, all_y, x_as, y_as) { - if (!is.na(x) && !is.na(y)) { - sql_expr( - COALESCE( - !!sql_table_prefix(con, x, table = x_as), - !!sql_table_prefix(con, y, table = y_as) - ), - con = con - ) - } else if (!is.na(x)) { - sql_table_prefix(con, x, table = if (tolower(x) %in% tolower(all_y)) x_as) - } else if (!is.na(y)) { - sql_table_prefix(con, y, table = if (tolower(y) %in% tolower(all_x)) y_as) - } else { - cli_abort("No source for join column {alias}") # nocov - } + sql_query_multi_join( + con = con, + x = x, + joins = query$joins, + table_names = query$table_names, + vars = query$vars, + all_vars_list = query$all_vars_list, + lvl = lvl + ) } -join_use_star <- function(con, - out_vars, - used_vars, - in_vars, - join_vars_list, - tbl_alias) { - if (length(in_vars) <= 1) { - return(join_vars_list) - } - - if (!all(in_vars %in% used_vars)) { - return(join_vars_list) - } - - loc_start <- vctrs::vec_match(in_vars[[1]], used_vars) - loc_end <- loc_start + length(in_vars) - 1 - - # shrink `out_vars` and `used_vars` to the actual part corresponding to the - # table - idx <- seq2(loc_start, loc_end) - alias <- vctrs::vec_slice(out_vars, idx) - used_vars <- vctrs::vec_slice(used_vars, idx) - - renamed <- !identical(used_vars, alias) - if (renamed) { - return(join_vars_list) - } - - moved <- !identical(used_vars, in_vars) - if (moved) { - return(join_vars_list) - } +# SQL generation ---------------------------------------------------------- - # the part of `join_vars_list` corresponding to the vars of the input tbl can - # now be replaced by `tbl_alias.*` - idx_start <- seq2(1, loc_start - 1) - idx_end <- seq2(loc_end + 1, length(join_vars_list)) +sql_join_vars <- function(con, vars, x_as = "LHS", y_as = "RHS", type) { + multi_join_vars <- purrr::map2_dfr( + vars$x, vars$y, + ~ { + table <- integer() + var <- character() + if (!is.na(.x)) { + table <- 1L + var <- .x + } + + if (!is.na(.y)) { + table <- c(table, 2L) + var <- c(var, .y) + } + + vctrs::new_data_frame(list(table = list(table), var = list(var)), n = 1L) + } + ) + multi_join_vars <- vctrs::vec_cbind(name = vars$alias, multi_join_vars) - tbl_alias <- escape(tbl_alias, con = con) - vctrs::vec_c( - vctrs::vec_slice(join_vars_list, idx_start), - list(sql(paste0(tbl_alias, ".*"))), - vctrs::vec_slice(join_vars_list, idx_end) + sql_multi_join_vars( + con, + multi_join_vars, + table_names = c(unclass(x_as), unclass(y_as)), + all_vars_list = vars[c("all_x", "all_y")] ) } diff --git a/R/sql-build.R b/R/sql-build.R index 0c93e0e4d..cc523f2e4 100644 --- a/R/sql-build.R +++ b/R/sql-build.R @@ -147,6 +147,27 @@ flatten_query.join_query <- function(qry, query_list) { querylist_reuse_query(qry, query_list_y) } +#' @export +flatten_query.multi_join_query <- function(qry, query_list) { + x <- qry$x + query_list_new <- flatten_query(x, query_list) + qry$x <- get_subquery_name(x, query_list_new) + + for (i in vctrs::vec_seq_along(qry$joins)) { + y <- qry$joins$table[[i]] + query_list_new <- flatten_query(y, query_list_new) + qry$joins$table[[i]] <- get_subquery_name(y, query_list_new) + } + + # TODO reuse query + name <- unique_subquery_name() + wrapped_query <- set_names(list(qry), name) + + query_list$queries <- c(query_list_new$queries, wrapped_query) + query_list$name <- name + query_list +} + #' @export flatten_query.semi_join_query <- flatten_query.join_query #' @export diff --git a/R/verb-joins.R b/R/verb-joins.R index 21862d6b7..004a28ca8 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -219,7 +219,7 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, y_as = NULL, call = caller_env()) { if (!is.null(sql_on)) { - by <- list(x = character(0), y = character(0), on = sql(sql_on)) + by <- list(x = character(0), y = character(0), on = unclass(sql_on)) } else if (identical(type, "full") && identical(by, character())) { type <- "cross" by <- list(x = character(0), y = character(0)) @@ -234,82 +234,206 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, ) suffix <- suffix %||% sql_join_suffix(x$src$con, suffix) - vars <- join_vars(op_vars(x), op_vars(y), type = type, by = by, suffix = suffix, call = call) - - inlined_select_list <- inline_select_in_join(x, y, vars, by) - vars <- inlined_select_list$vars - by <- inlined_select_list$by + suffix <- check_suffix(suffix, call) + na_matches <- arg_match(na_matches, c("na", "never"), error_call = call) # the table alias can only be determined after `select()` was inlined. # This works even though `by` is used in `inline_select_in_join()` and updated # because this does not touch `by$x_as` and `by$y_as`. + # TODO can this be really be done before inlining? join_alias <- check_join_alias(x_as, y_as, sql_on, call) - x_name <- unclass(query_name(inlined_select_list$x)) - y_name <- unclass(query_name(inlined_select_list$y)) - by[c("x_as", "y_as")] <- join_two_table_alias(x_name, y_name, join_alias$x, join_alias$y) - by$x_as <- ident(by$x_as) - by$y_as <- ident(by$y_as) + inline_result <- join_inline_select(x$lazy_query, by$x, by$on) + x_lq <- inline_result$lq + x_vars <- inline_result$vars + by_x_org <- by$x + by$x <- inline_result$by + + new_query <- join_needs_new_query(x$lazy_query, join_alias, type) + if (new_query) { + x_join_vars <- tibble( + name = op_vars(x), + table = list(1L), + var = as.list(x_vars) + ) + table_id <- 2L + } else { + x_join_vars <- x_lq$vars + table_id <- vctrs::vec_size(x_lq$table_names) + 1L + } - lazy_join_query( - x = inlined_select_list$x, - y = inlined_select_list$y, - vars = vars, + inline_result <- join_inline_select(y$lazy_query, by$y, by$on) + y_lq <- inline_result$lq + y_vars <- inline_result$vars + by$y <- inline_result$by + + y_join_vars <- tibble( + name = op_vars(y), + table = list(table_id), + var = as.list(y_vars) + ) + + vars <- multi_join_vars( + x_join_vars = x_join_vars, + y_join_vars = y_join_vars, + by_x_org = by_x_org, + by_y = by$y, type = type, - by = by, - suffix = suffix, - na_matches = na_matches, - group_vars = op_grps(x), - order_vars = op_sort(x), - frame = op_frame(x), - call = call + table_id = table_id, + suffix = suffix ) -} -inline_select_in_join <- function(x, y, vars, by) { - x_lq <- x$lazy_query - y_lq <- y$lazy_query - # Cannot inline select if `on` is used because the user might have - # used a renamed column. - if (!is_empty(by$on)) { - out <- list( + joins_field <- tibble( + table = list(y_lq), + type = type, + by_x = list(by$x), + by_y = list(by$y), + on = by$on %||% NA_character_, + na_matches + ) + + table_names_y <- tibble( + as = join_alias$y %||% NA_character_, + name = as.character(query_name(y_lq) %||% NA) + ) + + if (new_query) { + table_names_x <- tibble( + as = join_alias$x %||% NA_character_, + name = as.character(query_name(x_lq) %||% NA) + ) + + out <- lazy_multi_join_query( x = x_lq, - y = y_lq, - vars = vars, - by = by + joins = joins_field, + table_names = vctrs::vec_rbind(table_names_x, table_names_y), + vars = vars ) return(out) } - # In some cases it would also be possible to inline mutate but this would - # require careful analysis to not introduce bugs which does not really seem - # worth it currently. - if (is_lazy_select_query_simple(x_lq, select = "projection")) { - vars$x <- update_join_vars(vars$x, x_lq$select) - by$x <- update_join_vars(by$x, x_lq$select) - vars$all_x <- op_vars(x_lq$x) - x_lq <- x_lq$x + # `x_lq` must be a `lazy_multi_join_query` so it can be modified directly + if (!is_null(join_alias$x)) { + x_lq$table_names$as[[1]] <- join_alias$x } - if (is_lazy_select_query_simple(y_lq, select = "projection")) { - vars$y <- update_join_vars(vars$y, y_lq$select) - by$y <- update_join_vars(by$y, y_lq$select) - vars$all_y <- op_vars(y_lq$x) - y_lq <- y_lq$x + x_lq$joins <- vctrs::vec_rbind(x_lq$joins, joins_field) + x_lq$table_names <- vctrs::vec_rbind(x_lq$table_names, table_names_y) + x_lq$vars <- vars + + x_lq +} + +join_inline_select <- function(lq, by, on) { + if (is_empty(on) && is_lazy_select_query_simple(lq, select = "projection")) { + vars <- purrr::map_chr(lq$select$expr, as_string) + + idx <- vctrs::vec_match(lq$select$name, by) + by <- vctrs::vec_assign(by, idx, vars) + + lq_org <- lq + lq <- lq$x + lq$group_vars <- op_grps(lq_org) + lq$order_vars <- op_sort(lq_org) + lq$frame <- op_frame(lq_org) + } else { + vars <- op_vars(lq) } list( - x = x_lq, - y = y_lq, + lq = lq, vars = vars, by = by ) } -update_join_vars <- function(vars, select) { - idx <- vctrs::vec_match(select$name, vars) - prev_vars <- purrr::map_chr(select$expr, as_string) - vctrs::vec_assign(vars, idx, prev_vars) +join_needs_new_query <- function(x_lq, join_alias, type) { + if (!inherits(x_lq, "lazy_multi_join_query")) { + return(TRUE) + } + + if (!type %in% c("left", "inner")) { + return(TRUE) + } + + x_as <- join_alias$x + y_as <- join_alias$y + + as_current <- x_lq$table_names$as + if (!is_null(x_as)) { + x_as_current <- as_current[[1]] + if (!is_null(x_as_current) && !identical(x_as, x_as_current)) { + return(TRUE) + } + + if (join_as_already_used(x_as, as_current[-1])) { + return(TRUE) + } + } + + if (!is_null(y_as)) { + if (join_as_already_used(y_as, as_current)) { + return(TRUE) + } + } + + FALSE +} + +join_as_already_used <- function(as, as_used) { + if (all(is.na(as_used))) { + return(FALSE) + } + + any(as == as_used) +} + +multi_join_vars <- function(x_join_vars, + y_join_vars, + by_x_org, + by_y, + type, + table_id, + suffix, + call) { + # Remove join keys from y + y_join_idx <- vctrs::vec_match(by_y, unlist(y_join_vars$var)) + if (!is_empty(y_join_idx)) { + y_join_vars <- vctrs::vec_slice(y_join_vars, -y_join_idx) + } + x_names <- x_join_vars$name + y_names <- y_join_vars$name + + # Add suffix where needed + x_join_vars$name <- add_suffixes(x_names, y_names, suffix$x) + y_join_vars$name <- add_suffixes(y_names, x_names, suffix$y) + + if (type %in% c("left", "inner")) { + # use all variables from `x` as is + # use non-join variables from `y` + } else if (type == "right") { + # Careful: this relies on the assumption that right_join()` starts a new query + # `x`: non-join variables; `y`: all variables + # -> must update table id of `x` join vars + x_join_vars$table[x_join_vars$name %in% by_x_org] <- list(table_id) + idx <- vctrs::vec_match(by_x_org, x_join_vars$name) + x_join_vars$var[idx] <- as.list(by_y) + } else if (type == "full") { + # Careful: this relies on the assumption that `full_join()` starts a new query + idx <- vctrs::vec_match(by_x_org, x_join_vars$name) + x_join_vars$table[idx] <- purrr::map( + x_join_vars$table[idx], + ~ c(.x, table_id) + ) + x_join_vars$var[idx] <- purrr::map2( + x_join_vars$var[idx], by_y, + ~ c(.x, .y) + ) + } else if (type == "cross") { + # -> simply append `y_rows` + } + + vctrs::vec_rbind(x_join_vars, y_join_vars) } add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = FALSE, @@ -327,25 +451,14 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F indexes = if (auto_index) list(by$y) ) + inline_result <- join_inline_select(x$lazy_query, by$x, sql_on) + x_lq <- inline_result$lq + by$x <- inline_result$by vars <- tibble( name = op_vars(x), - var = op_vars(x) + var = inline_result$vars ) - x_lq <- x$lazy_query - if (is_null(sql_on) && is_lazy_select_query_simple(x_lq, select = "projection")) { - if (!is_select_identity(x_lq$select, op_vars(x_lq))) { - by$x <- update_join_vars(by$x, x_lq$select) - vars$var <- purrr::map_chr(x_lq$select$expr, as_name) - vars$name <- x_lq$select$name - } - - x_lq <- x_lq$x - x_lq$group_vars <- op_grps(x) - x_lq$order_vars <- op_sort(x) - x_lq$frame <- op_frame(x) - } - # the table alias can only be determined after `select()` was inlined join_alias <- check_join_alias(x_as, y_as, sql_on, call) diff --git a/R/verb-select.R b/R/verb-select.R index 56286833a..5157cc4e6 100644 --- a/R/verb-select.R +++ b/R/verb-select.R @@ -165,15 +165,7 @@ add_select <- function(.data, vars, op = c("select", "mutate")) { out <- lazy_query out$group_vars <- grps - if (inherits(lazy_query, "lazy_join_query")) { - out$vars$alias <- names(sel_vars) - out$vars$x <- lazy_query$vars$x[idx] - out$vars$y <- lazy_query$vars$y[idx] - - return(out) - } - - if (inherits(lazy_query, "lazy_semi_join_query")) { + if (inherits(lazy_query, "lazy_multi_join_query") || inherits(lazy_query, "lazy_semi_join_query")) { out$vars <- vctrs::vec_slice(out$vars, idx) out$vars$name <- names(sel_vars) diff --git a/man/db-sql.Rd b/man/db-sql.Rd index e91673e78..62fa5336c 100644 --- a/man/db-sql.Rd +++ b/man/db-sql.Rd @@ -17,6 +17,7 @@ \alias{supports_star_without_alias} \alias{sql_query_select} \alias{sql_query_join} +\alias{sql_query_multi_join} \alias{sql_query_semi_join} \alias{sql_query_set_op} \alias{sql_returning_cols} @@ -76,6 +77,17 @@ sql_query_join( lvl = 0 ) +sql_query_multi_join( + con, + x, + joins, + table_names, + vars, + all_vars_list, + ..., + lvl = 0 +) + sql_query_semi_join(con, x, y, anti, by, vars, ..., lvl = 0) sql_query_set_op(con, x, y, method, ..., all = FALSE, lvl = 0) diff --git a/man/sql_build.Rd b/man/sql_build.Rd index ad569936f..ca5eb54f4 100644 --- a/man/sql_build.Rd +++ b/man/sql_build.Rd @@ -4,6 +4,7 @@ % R/query-select.R, R/query-semi-join.R, R/query-set-op.R, R/sql-build.R \name{lazy_join_query} \alias{lazy_join_query} +\alias{lazy_multi_join_query} \alias{lazy_semi_join_query} \alias{lazy_query} \alias{lazy_select_query} @@ -31,6 +32,17 @@ lazy_join_query( call = caller_env() ) +lazy_multi_join_query( + x, + joins, + table_names, + vars, + group_vars = op_grps(x), + order_vars = op_sort(x), + frame = op_frame(x), + call = caller_env() +) + lazy_semi_join_query( x, y, diff --git a/tests/testthat/_snaps/query-join.md b/tests/testthat/_snaps/query-join.md index dd6b3a8bf..f591aef54 100644 --- a/tests/testthat/_snaps/query-join.md +++ b/tests/testthat/_snaps/query-join.md @@ -1,17 +1,21 @@ # print method doesn't change unexpectedly Code - sql_build(left_join(lf1, lf2)) - Message - Joining, by = "x" + left_join(lf1, lf2, by = "x") %>% left_join(lf3, by = "x") %>% sql_build() Output - + + X: + lf1 + Type: left + By: + x-x + Y: + lf2 + Type: left By: x-x - X: - df Y: - df + lf3 # generated sql doesn't change unexpectedly diff --git a/tests/testthat/_snaps/verb-joins.md b/tests/testthat/_snaps/verb-joins.md index ccbff076c..5d7738440 100644 --- a/tests/testthat/_snaps/verb-joins.md +++ b/tests/testthat/_snaps/verb-joins.md @@ -63,16 +63,31 @@ WHERE (`lf1`.`x1` = `RHS`.`x`) ) +# multiple joins create a single query + + Code + remote_query(out) + Output + SELECT `df3`.`x` AS `x`, `a`, `LHS`.`b` AS `b.x`, `df3`.`b` AS `b.y` + FROM ( + SELECT `df1`.*, `b` + FROM `df1` + LEFT JOIN `df2` + ON (`df1`.`x` = `df2`.`x`) + ) `LHS` + RIGHT JOIN `df3` + ON (`LHS`.`x` = `df3`.`x`) + # can optionally match NA values Code - left_join(lf, lf, by = "x", na_matches = "na") + left_join(lf1, lf2, by = "x", na_matches = "na") Output - SELECT `df_LHS`.`x` AS `x` - FROM `df` AS `df_LHS` - LEFT JOIN `df` AS `df_RHS` - ON (CASE WHEN (`df_LHS`.`x` = `df_RHS`.`x`) OR (`df_LHS`.`x` IS NULL AND `df_RHS`.`x` IS NULL) THEN 0 ELSE 1 END = 0) + SELECT `lf1`.`x` AS `x` + FROM `lf1` + LEFT JOIN `lf2` + ON (CASE WHEN (`lf1`.`x` = `lf2`.`x`) OR (`lf1`.`x` IS NULL AND `lf2`.`x` IS NULL) THEN 0 ELSE 1 END = 0) # suffix arg is checked @@ -95,8 +110,10 @@ INNER JOIN `lf1` AS `lf1_RHS` ON (`lf1_LHS`.`x` = `lf1_RHS`.`x`) ) - SELECT `LHS`.`x` AS `x` - FROM `q01` AS `LHS` - LEFT JOIN `q01` AS `RHS` - ON (`LHS`.`x` = `RHS`.`x`) + SELECT `lf1...1`.`x` AS `x` + FROM `lf1` AS `lf1...1` + INNER JOIN `lf1` AS `lf1...2` + ON (`lf1...1`.`x` = `lf1...2`.`x`) + LEFT JOIN `q01` AS `...3` + ON (`lf1...1`.`x` = `...3`.`x`) diff --git a/tests/testthat/test-query-join.R b/tests/testthat/test-query-join.R index 97e34dd11..a5c9c064b 100644 --- a/tests/testthat/test-query-join.R +++ b/tests/testthat/test-query-join.R @@ -1,7 +1,13 @@ test_that("print method doesn't change unexpectedly", { - lf1 <- lazy_frame(x = 1, y = 2) - lf2 <- lazy_frame(x = 1, z = 2) - expect_snapshot(sql_build(left_join(lf1, lf2))) + lf1 <- lazy_frame(x = 1, y = 2, .name = "lf1") + lf2 <- lazy_frame(x = 1, z = 2, .name = "lf2") + lf3 <- lazy_frame(x = 1, z = 2, .name = "lf3") + + expect_snapshot( + left_join(lf1, lf2, by = "x") %>% + left_join(lf3, by = "x") %>% + sql_build() + ) }) test_that("generated sql doesn't change unexpectedly", { diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 5311b05a0..ab854d998 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -166,71 +166,70 @@ test_that("join uses correct table alias", { y <- lazy_frame(a = 1, y = 1, .name = "y") # self joins - by <- left_join(x, x, by = "a")$lazy_query$by - expect_equal(by$x_as, ident("x_LHS")) - expect_equal(by$y_as, ident("x_RHS")) + table_names <- sql_build(left_join(x, x, by = "a"))$table_names + expect_equal(table_names, c("x_LHS", "x_RHS")) - by <- left_join(x, x, by = "a", x_as = "my_x")$lazy_query$by - expect_equal(by$x_as, ident("my_x")) - expect_equal(by$y_as, ident("x")) + table_names <- sql_build(left_join(x, x, by = "a", x_as = "my_x"))$table_names + expect_equal(table_names, c("my_x", "x")) - by <- left_join(x, x, by = "a", y_as = "my_y")$lazy_query$by - expect_equal(by$x_as, ident("x")) - expect_equal(by$y_as, ident("my_y")) + table_names <- sql_build(left_join(x, x, by = "a", y_as = "my_y"))$table_names + expect_equal(table_names, c("x", "my_y")) - by <- left_join(x, x, by = "a", x_as = "my_x", y_as = "my_y")$lazy_query$by - expect_equal(by$x_as, ident("my_x")) - expect_equal(by$y_as, ident("my_y")) + table_names <- sql_build(left_join(x, x, by = "a", x_as = "my_x", y_as = "my_y"))$table_names + expect_equal(table_names, c("my_x", "my_y")) # x-y joins - by <- left_join(x, y, by = "a")$lazy_query$by - expect_equal(by$x_as, ident("x")) - expect_equal(by$y_as, ident("y")) + table_names <- sql_build(left_join(x, y, by = "a"))$table_names + expect_equal(table_names, c("x", "y")) - by <- left_join(x, y, by = "a", x_as = "my_x")$lazy_query$by - expect_equal(by$x_as, ident("my_x")) - expect_equal(by$y_as, ident("y")) + table_names <- sql_build(left_join(x, y, by = "a", x_as = "my_x"))$table_names + expect_equal(table_names, c("my_x", "y")) - by <- left_join(x, y, by = "a", y_as = "my_y")$lazy_query$by - expect_equal(by$x_as, ident("x")) - expect_equal(by$y_as, ident("my_y")) + table_names <- sql_build(left_join(x, y, by = "a", y_as = "my_y"))$table_names + expect_equal(table_names, c("x", "my_y")) - by <- left_join(x, y, by = "a", x_as = "my_x", y_as = "my_y")$lazy_query$by - expect_equal(by$x_as, ident("my_x")) - expect_equal(by$y_as, ident("my_y")) + table_names <- sql_build(left_join(x, y, by = "a", x_as = "my_x", y_as = "my_y"))$table_names + expect_equal(table_names, c("my_x", "my_y")) # x_as same name as `y` - by <- left_join(x, y, by = "a", x_as = "y")$lazy_query$by - expect_equal(by$x_as, ident("y")) - expect_equal(by$y_as, ident("y...2")) + table_names <- sql_build(left_join(x, y, by = "a", x_as = "y"))$table_names + expect_equal(table_names, c("y", "y...2")) - by <- left_join(x %>% filter(x == 1), x, by = "x", y_as = "LHS")$lazy_query$by - expect_equal(by$x_as, ident("LHS...1")) - expect_equal(by$y_as, ident("LHS")) + table_names <- sql_build(left_join(x %>% filter(x == 1), x, by = "x", y_as = "LHS"))$table_names + expect_equal(table_names, c("LHS...1", "LHS")) # sql_on -> use alias or LHS/RHS - by <- left_join(x, y, sql_on = sql("LHS.a = RHS.a"))$lazy_query$by - expect_equal(by$x_as, ident("LHS")) - expect_equal(by$y_as, ident("RHS")) + table_names <- sql_build(left_join(x, y, sql_on = sql("LHS.a = RHS.a")))$table_names + expect_equal(table_names, c("LHS", "RHS")) - by <- left_join(x, y, x_as = "my_x", sql_on = sql("my_x.a = RHS.a"))$lazy_query$by - expect_equal(by$x_as, ident("my_x")) - expect_equal(by$y_as, ident("RHS")) + table_names <- sql_build(left_join(x, y, x_as = "my_x", sql_on = sql("my_x.a = RHS.a")))$table_names + expect_equal(table_names, c("my_x", "RHS")) + + # triple join + z <- lazy_frame(a = 1, z = 1, .name = "z") + out <- left_join(x, y, by = "a") %>% + left_join(z, by = "a") %>% + sql_build() + expect_equal(out$table_names, c("x", "y", "z")) + + # triple join where names need to be repaired + out <- left_join(x, x, by = "a") %>% + left_join(z, by = "a") %>% + sql_build() + expect_equal(out$table_names, c("x...1", "x...2", "z")) }) test_that("select() before join is inlined", { lf <- lazy_frame(x1 = 10, a = 1, y = 3, .name = "lf1") lf2 <- lazy_frame(x2 = 10, b = 2, z = 4, .name = "lf2") - test_vars <- function(lq, x, y) { - expect_equal(lq$vars$alias, c("a2", "x", "b")) - expect_equal(lq$vars$x, x) - expect_equal(lq$vars$y, y) - expect_equal(lq$vars$all_x, c("x1", "a", "y")) - expect_equal(lq$vars$all_y, c("x2", "b", "z")) + test_vars <- function(lq, var, table) { + expect_equal(lq$vars$name, c("a2", "x", "b")) + expect_equal(lq$vars$var, var) + expect_equal(lq$vars$table, table) - expect_equal(lq$by$x, "x1") - expect_equal(lq$by$y, "x2") + expect_equal(lq$joins$by_x, list("x1")) + expect_equal(lq$joins$by_y, list("x2")) } out_left <- left_join( @@ -238,7 +237,7 @@ test_that("select() before join is inlined", { lf2 %>% select(x = x2, b), by = "x" ) - test_vars(out_left$lazy_query, c("a", "x1", NA), c(NA, NA, "b")) + test_vars(out_left$lazy_query, list("a", "x1", "b"), list(1, 1, 2)) expect_equal(op_vars(out_left), c("a2", "x", "b")) expect_snapshot(out_left) @@ -247,21 +246,24 @@ test_that("select() before join is inlined", { lf2 %>% select(x = x2, b), by = "x" ) - test_vars(out_inner$lazy_query, c("a", "x1", NA), c(NA, NA, "b")) + test_vars(out_inner$lazy_query, list("a", "x1", "b"), list(1, 1, 2)) out_right <- right_join( lf %>% select(a2 = a, x = x1), lf2 %>% select(x = x2, b), by = "x" ) - test_vars(out_right$lazy_query, c("a", NA, NA), c(NA, "x2", "b")) + test_vars(out_right$lazy_query, list("a", "x2", "b"), list(1, 2, 2)) out_full <- full_join( lf %>% select(a2 = a, x = x1), lf2 %>% select(x = x2, b), by = "x" ) - test_vars(out_full$lazy_query, c("a", "x1", NA), c(NA, "x2", "b")) + vars <- out_full$lazy_query$vars + expect_equal(vars$name, c("a2", "x", "b")) + expect_equal(vars$var, list("a", c("x1", "x2"), "b")) + expect_equal(vars$table, list(1, c(1, 2), 2)) out_cross <- full_join( lf %>% select(a2 = a, x = x1), @@ -269,11 +271,9 @@ test_that("select() before join is inlined", { by = character() ) vars <- out_cross$lazy_query$vars - expect_equal(vars$alias, c("a2", "x.x", "x.y", "b")) - expect_equal(vars$x, c("a", "x1", NA, NA)) - expect_equal(vars$y, c(NA, NA, "x2", "b")) - expect_equal(vars$all_x, c("x1", "a", "y")) - expect_equal(vars$all_y, c("x2", "b", "z")) + expect_equal(vars$name, c("a2", "x.x", "x.y", "b")) + expect_equal(vars$var, list("a", "x1", "x2", "b")) + expect_equal(vars$table, list(1, 1, 2, 2)) }) test_that("select() before join works for tables with same column name", { @@ -288,8 +288,8 @@ test_that("select() before join works for tables with same column name", { lq <- out$lazy_query expect_equal(op_vars(lq), c("id1", "x", "id2")) - expect_equal(lq$vars$x, c("id", "x", NA)) - expect_equal(lq$vars$y, c(NA, NA, "id")) + expect_equal(lq$vars$var, list("id", "x", "id")) + expect_equal(lq$vars$table, list(1, 1, 2)) }) test_that("named by works in combination with inlined select", { @@ -304,10 +304,10 @@ test_that("named by works in combination with inlined select", { lq <- out$lazy_query expect_equal(op_vars(lq), c("id_x", "x.x")) - expect_equal(lq$vars$x, c("id_x", "x")) - expect_equal(lq$vars$y, c(NA_character_, NA_character_)) - expect_equal(lq$by$x, c("id_x", "x")) - expect_equal(lq$by$y, c("id_y", "x")) + expect_equal(lq$vars$var, list("id_x", "x")) + expect_equal(lq$vars$table, list(1, 1)) + expect_equal(lq$joins$by_x, list(c("id_x", "x"))) + expect_equal(lq$joins$by_y, list(c("id_y", "x"))) }) test_that("suffix works in combination with inlined select", { @@ -322,8 +322,8 @@ test_that("suffix works in combination with inlined select", { lq <- out$lazy_query expect_equal(op_vars(lq), c("id", "x2.x", "x2.y")) - expect_equal(lq$vars$x, c("id", "x", NA)) - expect_equal(lq$vars$y, c(NA, NA, "x")) + expect_equal(lq$vars$var, list("id", "x", "x")) + expect_equal(lq$vars$table, list(1L, 1L, 2L)) }) test_that("select() before join is not inlined when using `sql_on`", { @@ -338,9 +338,9 @@ test_that("select() before join is not inlined when using `sql_on`", { lq <- out$lazy_query expect_s3_class(lq$x, "lazy_select_query") - expect_s3_class(lq$y, "lazy_select_query") - expect_equal(lq$vars$x, c("a2", "x", NA, NA)) - expect_equal(lq$vars$y, c(NA, NA, "x", "b")) + expect_s3_class(lq$joins$table[[1]], "lazy_select_query") + expect_equal(lq$vars$var, list("a2", "x", "x", "b")) + expect_equal(lq$vars$table, list(1L, 1L, 2L, 2L)) }) test_that("select() before semi_join is inlined", { @@ -399,6 +399,94 @@ test_that("select() before join is not inlined when using `sql_on`", { expect_equal(lq$vars, tibble(name = c("a2", "x"), var = c("a2", "x"))) }) +test_that("multiple joins create a single query", { + lf <- lazy_frame(x = 1, a = 1, .name = "df1") + lf2 <- lazy_frame(x = 1, b = 2, .name = "df2") + lf3 <- lazy_frame(x = 1, b = 2, .name = "df3") + + out <- left_join(lf, lf2, by = "x") %>% + inner_join(lf3, by = "x") + lq <- out$lazy_query + expect_s3_class(lq, "lazy_multi_join_query") + expect_equal(lq$table_names, tibble(as = NA_character_, name = c("df1", "df2", "df3"))) + expect_equal(lq$vars$name, c("x", "a", "b.x", "b.y")) + expect_equal(lq$vars$table, list(1L, 1L, 2L, 3L)) + expect_equal(lq$vars$var, list("x", "a", "b", "b")) + + expect_equal( + remote_query(out), + sql( +"SELECT `df1`.*, `df2`.`b` AS `b.x`, `df3`.`b` AS `b.y` +FROM `df1` +LEFT JOIN `df2` + ON (`df1`.`x` = `df2`.`x`) +INNER JOIN `df3` + ON (`df1`.`x` = `df3`.`x`)" + ) + ) + + # `right_join()` and `full_join()` are not inlined + out <- left_join(lf, lf2, by = "x") %>% + right_join(lf3, by = "x") + + out_build <- out %>% sql_build() + expect_s3_class(out_build, "join_query") + expect_s3_class(out_build$x, "multi_join_query") + + expect_snapshot(remote_query(out)) + + out_build <- left_join(lf, lf2, by = "x") %>% + full_join(lf3, by = "x") %>% + sql_build() + + expect_s3_class(out_build, "join_query") + expect_s3_class(out_build$x, "multi_join_query") +}) + +test_that("multi joins work with x_as", { + lf <- lazy_frame(x = 1, a = 1, .name = "df1") + lf2 <- lazy_frame(x = 1, b = 2, .name = "df2") + lf3 <- lazy_frame(x = 1, b = 2, .name = "df3") + + out <- left_join(lf, lf2, by = "x", x_as = "lf1", y_as = "lf2") %>% + inner_join(lf3, by = "x", y_as = "lf3") + lq <- out$lazy_query + expect_s3_class(lq, "lazy_multi_join_query") + expect_equal(lq$table_names, tibble(as = c("lf1", "lf2", "lf3"), name = c("df1", "df2", "df3"))) + + # `x_as` provided twice with the same name -> one query + out2 <- left_join(lf, lf2, by = "x", x_as = "lf1", y_as = "lf2") %>% + inner_join(lf3, by = "x", x_as = "lf1", y_as = "lf3") + expect_equal(out, out2) + + # `x_as` provided twice with different names -> two queries + lq <- left_join(lf, lf2, by = "x", x_as = "lf1") %>% + inner_join(lf3, by = "x", x_as = "lf2") %>% + .$lazy_query + expect_s3_class(lq, "lazy_multi_join_query") + expect_s3_class(lq$x, "lazy_multi_join_query") + + # `x_as` name already used + lq <- left_join(lf, lf2, by = "x", y_as = "lf2") %>% + inner_join(lf3, by = "x", x_as = "lf2") %>% + .$lazy_query + expect_s3_class(lq, "lazy_multi_join_query") + expect_s3_class(lq$x, "lazy_multi_join_query") + + # `y_as` name already used + lq <- left_join(lf, lf2, by = "x", y_as = "lf2") %>% + inner_join(lf3, by = "x", y_as = "lf2") %>% + .$lazy_query + expect_s3_class(lq, "lazy_multi_join_query") + expect_s3_class(lq$x, "lazy_multi_join_query") + + lq <- left_join(lf, lf2, by = "x", x_as = "lf2") %>% + inner_join(lf3, by = "x", y_as = "lf2") %>% + .$lazy_query + expect_s3_class(lq, "lazy_multi_join_query") + expect_s3_class(lq$x, "lazy_multi_join_query") +}) + # sql_build --------------------------------------------------------------- test_that("join verbs generate expected ops", { @@ -406,20 +494,20 @@ test_that("join verbs generate expected ops", { lf2 <- lazy_frame(x = 1, z = 2) ji <- inner_join(lf1, lf2, by = "x") - expect_s3_class(ji$lazy_query, "lazy_join_query") - expect_equal(ji$lazy_query$type, "inner") + expect_s3_class(ji$lazy_query, "lazy_multi_join_query") + expect_equal(ji$lazy_query$joins$type, "inner") jl <- left_join(lf1, lf2, by = "x") - expect_s3_class(jl$lazy_query, "lazy_join_query") - expect_equal(jl$lazy_query$type, "left") + expect_s3_class(jl$lazy_query, "lazy_multi_join_query") + expect_equal(jl$lazy_query$joins$type, "left") jr <- right_join(lf1, lf2, by = "x") - expect_s3_class(jr$lazy_query, "lazy_join_query") - expect_equal(jr$lazy_query$type, "right") + expect_s3_class(jr$lazy_query, "lazy_multi_join_query") + expect_equal(jr$lazy_query$joins$type, "right") jf <- full_join(lf1, lf2, by = "x") - expect_s3_class(jf$lazy_query, "lazy_join_query") - expect_equal(jf$lazy_query$type, "full") + expect_s3_class(jf$lazy_query, "lazy_multi_join_query") + expect_equal(jf$lazy_query$joins$type, "full") js <- semi_join(lf1, lf2, by = "x") expect_s3_class(js$lazy_query, "lazy_semi_join_query") @@ -431,8 +519,9 @@ test_that("join verbs generate expected ops", { }) test_that("can optionally match NA values", { - lf <- lazy_frame(x = 1) - expect_snapshot(left_join(lf, lf, by = "x", na_matches = "na")) + lf1 <- lazy_frame(x = 1, .name = "lf1") + lf2 <- lazy_frame(x = 1, .name = "lf2") + expect_snapshot(left_join(lf1, lf2, by = "x", na_matches = "na")) }) test_that("join captures both tables", { @@ -441,8 +530,8 @@ test_that("join captures both tables", { out <- inner_join(lf1, lf2, by = "x") %>% sql_build() - expect_s3_class(out, "join_query") - expect_equal(out$type, "inner") + expect_s3_class(out, "multi_join_query") + expect_equal(out$joins$type, "inner") }) test_that("semi join captures both tables", { @@ -498,42 +587,46 @@ test_that("left_join/inner_join uses *", { con <- simulate_dbi() out <- lf1 %>% - left_join(lf2, by = c("a", "b")) + left_join(lf2, by = c("a", "b")) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "left"), - sql("`LHS`.*", z = "`z`") + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + sql("`df_LHS`.*", z = "`z`") ) # also works after relocate out <- lf1 %>% left_join(lf2, by = c("a", "b")) %>% - relocate(z) + relocate(z) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "left"), - sql(z = "`z`", "`LHS`.*") + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + sql(z = "`z`", "`df_LHS`.*") ) # does not use * if variable are missing out <- lf1 %>% left_join(lf2, by = c("a", "b")) %>% - select(a, c) + select(a, c) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "left"), - sql(a = "`LHS`.`a`", c = "`c`") + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + sql(a = "`df_LHS`.`a`", c = "`c`") ) # does not use * if variable names changed lf1 <- lazy_frame(a = 1, b = 2) lf2 <- lazy_frame(a = 1, b = 2) out <- lf1 %>% - left_join(lf2, by = c("a")) + left_join(lf2, by = c("a")) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "left"), - sql(a = "`LHS`.`a`", `b.x` = "`LHS`.`b`", `b.y` = "`RHS`.`b`") + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + sql(a = "`df_LHS`.`a`", `b.x` = "`df_LHS`.`b`", `b.y` = "`df_RHS`.`b`") ) }) @@ -543,43 +636,47 @@ test_that("right_join uses *", { con <- simulate_dbi() out <- lf1 %>% - right_join(lf2, by = c("a", "b")) + right_join(lf2, by = c("a", "b")) %>% + sql_build() # cannot use * without relocate or select expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "right"), - sql(a = "`RHS`.`a`", b = "`RHS`.`b`", c = "`c`", z = "`z`") + sql_join_vars(con, out$vars, type = "right", x_as = out$by$x_as, y_as = out$by$y_as), + sql(a = "`df_RHS`.`a`", b = "`df_RHS`.`b`", c = "`c`", z = "`z`") ) # also works after relocate out <- lf1 %>% right_join(lf2, by = c("a", "b")) %>% - select(a, b, z, c) + select(a, b, z, c) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "right"), - sql("`RHS`.*", c = "`c`") + sql_join_vars(con, out$vars, type = "right", x_as = out$by$x_as, y_as = out$by$y_as), + sql("`df_RHS`.*", c = "`c`") ) # does not use * if variable are missing out <- lf1 %>% right_join(lf2, by = c("a", "b")) %>% - select(a, z) + select(a, z) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "right"), - sql(a = "`RHS`.`a`", z = "`z`") + sql_join_vars(con, out$vars, type = "right", x_as = out$by$x_as, y_as = out$by$y_as), + sql(a = "`df_RHS`.`a`", z = "`z`") ) # does not use * if variable names changed lf1 <- lazy_frame(a = 1, b = 2) lf2 <- lazy_frame(a = 1, b = 2) out <- lf1 %>% - right_join(lf2, by = c("a")) + right_join(lf2, by = c("a")) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "right"), - sql(a = "`RHS`.`a`", `b.x` = "`LHS`.`b`", `b.y` = "`RHS`.`b`") + sql_join_vars(con, out$vars, type = "right", x_as = out$by$x_as, y_as = out$by$y_as), + sql(a = "`df_RHS`.`a`", `b.x` = "`df_LHS`.`b`", `b.y` = "`df_RHS`.`b`") ) }) @@ -589,39 +686,43 @@ test_that("cross_join uses *", { con <- simulate_dbi() out <- lf1 %>% - full_join(lf2, by = character()) + full_join(lf2, by = character()) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "cross"), - set_names(sql("`LHS`.*", "`RHS`.*"), c("", "")) + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + set_names(sql("`df_LHS`.*", "`df_RHS`.*"), c("", "")) ) # also works after relocate out <- lf1 %>% full_join(lf2, by = character()) %>% - select(x, y, a, b) + select(x, y, a, b) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "cross"), - set_names(sql("`RHS`.*", "`LHS`.*"), c("", "")) + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + set_names(sql("`df_RHS`.*", "`df_LHS`.*"), c("", "")) ) out <- lf1 %>% full_join(lf2, by = character()) %>% - select(x, a, b, y) + select(x, a, b, y) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "cross"), - sql(x = "`x`", "`LHS`.*", y = "`y`") + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + sql(x = "`x`", "`df_LHS`.*", y = "`y`") ) out <- lf1 %>% full_join(lf2, by = character()) %>% - select(a, x, y, b) + select(a, x, y, b) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "cross"), - sql(a = "`a`", "`RHS`.*", b = "`b`") + sql_multi_join_vars(con, out$vars, out$table_names, out$all_vars_list), + sql(a = "`a`", "`df_RHS`.*", b = "`b`") ) }) @@ -631,13 +732,14 @@ test_that("full_join() does not use *", { lf2 <- lazy_frame(a = 1, b = 2) out <- lf1 %>% - full_join(lf2, by = c("a", "b")) + full_join(lf2, by = c("a", "b")) %>% + sql_build() expect_equal( - sql_join_vars(con, out$lazy_query$vars, type = "full"), + sql_join_vars(con, out$vars, type = "full", x_as = out$by$x_as, y_as = out$by$y_as), sql( - a = "COALESCE(`LHS`.`a`, `RHS`.`a`)", - b = "COALESCE(`LHS`.`b`, `RHS`.`b`)" + a = "COALESCE(`df_LHS`.`a`, `df_RHS`.`a`)", + b = "COALESCE(`df_LHS`.`b`, `df_RHS`.`b`)" ) ) }) diff --git a/tests/testthat/test-verb-select.R b/tests/testthat/test-verb-select.R index 385fcc94c..ee8e319c4 100644 --- a/tests/testthat/test-verb-select.R +++ b/tests/testthat/test-verb-select.R @@ -120,8 +120,8 @@ test_that("select() after left_join() is inlined", { relocate(b)) ) expect_equal(op_vars(out), c("b", "x", "a")) - expect_equal(out$lazy_query$vars$x, c(NA, "x", "a")) - expect_equal(out$lazy_query$vars$y, c("b", NA, NA)) + expect_equal(out$lazy_query$vars$var, list("b", "x", "a")) + expect_equal(out$lazy_query$vars$table, list(2L, 1L, 1L)) out <- left_join(lf1, lf2, by = "x") %>% transmute(b, x = x + 1) @@ -181,9 +181,8 @@ test_that("select() after join handles previous select", { expect_equal(op_vars(lf2), c("x2", "y3", "z")) vars2 <- lf2$lazy_query$vars - expect_equal(vars2$alias, c("x2", "y3", "z")) - expect_equal(vars2$x, c("x", "y", "z")) - expect_equal(vars2$y, c(NA_character_, NA, NA)) + expect_equal(vars2$var, list("x", "y", "z")) + expect_equal(vars2$table, list(1L, 1L, 1L)) expect_equal(op_grps(lf2), c("x2", "y3", "z")) expect_snapshot(print(lf2)) From c4dd32bf86d82887fef9ac6ed50dbb3df1990c1c Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 25 Aug 2022 13:29:13 +0000 Subject: [PATCH 08/28] Fix incorrect check in `join_needs_new_query()` --- R/verb-joins.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index 004a28ca8..134a85477 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -362,7 +362,7 @@ join_needs_new_query <- function(x_lq, join_alias, type) { as_current <- x_lq$table_names$as if (!is_null(x_as)) { x_as_current <- as_current[[1]] - if (!is_null(x_as_current) && !identical(x_as, x_as_current)) { + if (!is.na(x_as_current) && !identical(x_as, x_as_current)) { return(TRUE) } From fe93ae6026a93327ae03183789a8a3b347d8743a Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 14 Sep 2022 13:54:24 +0000 Subject: [PATCH 09/28] Can use by column from other table than first --- R/db-sql.R | 15 ++++++++++---- R/lazy-join-query.R | 4 ++-- R/query-join.R | 2 +- R/verb-joins.R | 35 ++++++++++++++++++++++++++------ tests/testthat/test-verb-joins.R | 29 ++++++++++++++++++++++++++ 5 files changed, 72 insertions(+), 13 deletions(-) diff --git a/R/db-sql.R b/R/db-sql.R index 4af78576f..faa198e3e 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -420,14 +420,21 @@ sql_query_multi_join.DBIConnection <- function(con, ons <- purrr::pmap( vctrs::vec_cbind( rhs = table_names[-1], - joins[c("by_x", "by_y", "on", "na_matches")] + joins[c("by_x", "by_x_table_id", "by_y", "on", "na_matches")] ), - function(rhs, by_x, by_y, on, na_matches) { + function(rhs, by_x, by_x_table_id, by_y, on, na_matches) { if (!is.na(on)) { return(sql(on)) } - by <- list(x = ident(by_x), y = ident(by_y), x_as = x_name, y_as = ident(rhs)) - sql_join_tbls(con, by, na_matches = na_matches) + + by <- list( + x = ident(by_x), + y = ident(by_y), + x_as = ident(table_names[by_x_table_id]), + y_as = ident(rhs) + ) + + sql_join_tbls(con, by = by, na_matches = na_matches) } ) diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index 4304219c5..227cbb380 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -47,8 +47,8 @@ lazy_multi_join_query <- function(x, call = caller_env()) { stopifnot(inherits(x, "lazy_query")) - if (!identical(colnames(joins), c("table", "type", "by_x", "by_y", "on", "na_matches"))) { - cli_abort("`joins` must have fields `table`, `type`, `by_x`, `by_y`, `on`, `na_matches`", .internal = TRUE) + if (!identical(colnames(joins), c("table", "type", "by_x", "by_y", "by_x_table_id", "on", "na_matches"))) { + cli_abort("`joins` must have fields `table`, `type`, `by_x`, `by_y`, `by_x_table_id`, `on`, `na_matches`", .internal = TRUE) } vctrs::vec_assert(joins$type, character(), arg = "joins$type", call = caller_env()) vctrs::vec_assert(joins$on, character(), arg = "joins$on", call = caller_env()) diff --git a/R/query-join.R b/R/query-join.R index 4d4c74a70..dfcc672d2 100644 --- a/R/query-join.R +++ b/R/query-join.R @@ -148,7 +148,7 @@ sql_table_prefix <- function(con, var, table = NULL) { var <- sql_escape_ident(con, var) if (!is.null(table)) { - table <- escape(table, con = con) + table <- escape(table, collapse = NULL, con = con) sql(paste0(table, ".", var)) } else { var diff --git a/R/verb-joins.R b/R/verb-joins.R index 134a85477..bd40826f2 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -283,13 +283,13 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, suffix = suffix ) - joins_field <- tibble( - table = list(y_lq), + joins_field <- join_get_metadata( + x_lq, + y_lq, + new_query = new_query, type = type, - by_x = list(by$x), - by_y = list(by$y), - on = by$on %||% NA_character_, - na_matches + by = by, + na_matches = na_matches ) table_names_y <- tibble( @@ -436,6 +436,29 @@ multi_join_vars <- function(x_join_vars, vctrs::vec_rbind(x_join_vars, y_join_vars) } +join_get_metadata <- function(x_lq, y_lq, new_query, type, by, na_matches) { + if (new_query) { + by_x_table_id <- rep_along(by$x, 1L) + } else { + idx <- vctrs::vec_match(by$x, x_lq$vars$name) + stopifnot(all(vctrs::list_sizes(x_lq$vars$var[idx]) == 1)) + + # need to fix `by$x` in case it was renamed in an inlined select + by$x <- unlist(x_lq$vars$var)[idx] + by_x_table_id <- unlist(x_lq$vars$table)[idx] + } + + tibble( + table = list(y_lq), + type = type, + by_x = list(by$x), + by_y = list(by$y), + by_x_table_id = list(by_x_table_id), + on = by$on %||% NA_character_, + na_matches + ) +} + add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = FALSE, auto_index = FALSE, na_matches = "never", x_as = NULL, y_as = NULL, diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index ab854d998..a712bac7f 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -443,6 +443,35 @@ INNER JOIN `df3` expect_s3_class(out_build$x, "multi_join_query") }) +test_that("multiple joins can use by column from any table", { + lf <- lazy_frame(x = 1, a = 1, .name = "df1") + lf2 <- lazy_frame(x = 1, y = 2, .name = "df2") + lf3 <- lazy_frame(x = 1, y = 2, b = 2, .name = "df3") + + out <- left_join(lf, lf2, by = "x") %>% + left_join(lf3, by = c("x", "y")) + joins_metadata <- out$lazy_query$joins + expect_equal(joins_metadata$by_x, list("x", c("x", "y"))) + expect_equal(joins_metadata$by_y, list("x", c("x", "y"))) + expect_equal(joins_metadata$by_x_table_id, list(1, c(1, 2))) + + lf <- lazy_frame(x = 1, a = 1, .name = "df1") + lf2 <- lazy_frame(x = 1, y2 = 2, .name = "df2") + lf3 <- lazy_frame(x = 1, y = 2, b = 2, .name = "df3") + + out <- left_join( + lf, + lf2 %>% rename(y = y2), + by = "x" + ) %>% + left_join(lf3, by = c("x", "y")) + + joins_metadata <- out$lazy_query$joins + expect_equal(joins_metadata$by_x, list("x", c("x", "y2"))) + expect_equal(joins_metadata$by_y, list("x", c("x", "y"))) + expect_equal(joins_metadata$by_x_table_id, list(1, c(1, 2))) +}) + test_that("multi joins work with x_as", { lf <- lazy_frame(x = 1, a = 1, .name = "df1") lf2 <- lazy_frame(x = 1, b = 2, .name = "df2") From f1f596108e43f33b7453d5ecbdbdd62f9f6d8982 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 14 Sep 2022 15:30:44 +0000 Subject: [PATCH 10/28] Remove unnecessary `join_as_already_used()` --- R/verb-joins.R | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index bd40826f2..a41990f4f 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -366,13 +366,13 @@ join_needs_new_query <- function(x_lq, join_alias, type) { return(TRUE) } - if (join_as_already_used(x_as, as_current[-1])) { + if (x_as %in% as_current[-1]) { return(TRUE) } } if (!is_null(y_as)) { - if (join_as_already_used(y_as, as_current)) { + if (y_as %in% as_current) { return(TRUE) } } @@ -380,14 +380,6 @@ join_needs_new_query <- function(x_lq, join_alias, type) { FALSE } -join_as_already_used <- function(as, as_used) { - if (all(is.na(as_used))) { - return(FALSE) - } - - any(as == as_used) -} - multi_join_vars <- function(x_join_vars, y_join_vars, by_x_org, From e029821697e4dc8860b9485f78b15b786f63df11 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 14 Sep 2022 15:34:13 +0000 Subject: [PATCH 11/28] Fix NEWS --- NEWS.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 4c4276d9d..ade04d5b6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,13 +1,5 @@ # dbplyr (development version) -* `rows_*()` now uses the column types of `x` when copying a local data frame `y` - to a Postgres database (@mgirlich, #909). - -* `copy_inline()` gains a `types` argument to specify the SQL column types - (@mgirlich, #963). - -* `copy_inline()` now works for Hana (#950), Oracle (#972), and Redshift - (#949, thanks to @ejneer for an initial implementation). * Variables that are neither found in the data nor in the environment now produce an error (@mgirlich, #907). @@ -76,6 +68,8 @@ * `summarise()` + `filter()` now translates to `HAVING` where possible (@mgirlich, #877). + + * `left/inner_join()` + `left/inner_join()` (@mgirlich, #865). * The generated SQL is now shorter and more readable: From 082d357b8d9f8323fc8342b5e15fe553328c1cf4 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 18 Oct 2022 12:40:55 +0000 Subject: [PATCH 12/28] Replace `vec_unchop()` by `list_unchop()` --- R/db-sql.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/db-sql.R b/R/db-sql.R index faa198e3e..66b8c7552 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -491,8 +491,8 @@ sql_multi_join_vars <- function(con, vars, table_names, all_vars_list) { } out <- rep_named(vars$name, list()) - vars$var <- vctrs::vec_unchop(vars$var) - vars$table <- vctrs::vec_unchop(vars$table) + vars$var <- vctrs::list_unchop(vars$var) + vars$table <- vctrs::list_unchop(vars$table) for (i in seq_along(table_names)) { all_vars_current <- all_vars_list[[i]] From 023cb460696719ffc9585ea5c8ae0870bbbeb7a0 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Nov 2022 09:42:20 +0000 Subject: [PATCH 13/28] Document `joins` data structure --- R/db-sql.R | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/R/db-sql.R b/R/db-sql.R index 51539e56f..a04e92bd8 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -400,6 +400,35 @@ sql_query_multi_join <- function(con, } #' @export +#' @param vars tibble with three columns: +#' * `table` ``: the tables to join with. +#' * `type` ``: the join type (left, right, inner, full). +#' * `by_x`, `by_y` `>`: The columns to join by +#' * `by_x_table_id` `>`: The table index where the join column +#' comes from. This needs to be a list because a the join columns might come +#' from different tables +#' * `on` `` +#' * `na_matches` ``: Either `"na"` or `"never"`. +#' @param vars See [sql_multi_join_vars()]. +#' @param table_vars `named >`: All variables in each table. +#' @noRd +#' @examples +#' # Left join with * +#' df1 <- lazy_frame(x = 1, y = 1) +#' df2 <- lazy_frame(x = 1, z = 1) +#' df3 <- lazy_frame(x = 1, z2 = 1) +#' +#' tmp <- left_join(df1, df2, by = "x") %>% +#' left_join(df3, by = c("x", z = "z2")) +#' tibble( +#' table = list(df1, df2), +#' type = c("left", "left"), +#' by_x = list("x", c("x", "z")), +#' by_y = list("x", c("x", "z2")), +#' by_x_table_id = list(1L, c(1L, 2L)), +#' on = c(NA, NA), +#' na_matches = c("never", "never") +#' ) sql_query_multi_join.DBIConnection <- function(con, x, joins, From 64bc627aaf9cd7e5d084dfe369ba763f425e63bd Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Nov 2022 09:42:32 +0000 Subject: [PATCH 14/28] Minor refactoring --- R/db-sql.R | 52 +++++++++++++++++++++++++------------------------- R/verb-joins.R | 8 ++++---- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/R/db-sql.R b/R/db-sql.R index a04e92bd8..b78046ac5 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -441,48 +441,48 @@ sql_query_multi_join.DBIConnection <- function(con, cli_abort("{.arg table_names} must be unique.") } - x_name <- ident(table_names[[1]]) - - # FIXME pass `table_vars` to sql_query_multi_join select_sql <- sql_multi_join_vars(con, vars, table_vars) + from <- dbplyr_sql_subquery(con, x, name = table_names[[1]], lvl = lvl) + + join_table_queries <- purrr::map2( + joins$table, + table_names[-1], + function(table, name) dbplyr_sql_subquery(con, table, name = name, lvl = lvl) + ) + types <- toupper(paste0(joins$type, " JOIN")) + join_clauses <- purrr::map2( + types, + join_table_queries, + function(join_kw, from) sql_clause(join_kw, from) + ) - ons <- purrr::pmap( + on_clauses <- purrr::pmap( vctrs::vec_cbind( rhs = table_names[-1], joins[c("by_x", "by_x_table_id", "by_y", "on", "na_matches")] ), function(rhs, by_x, by_x_table_id, by_y, on, na_matches) { if (!is.na(on)) { - return(sql(on)) + on <- sql(on) + } else { + by <- list( + x = ident(by_x), + y = ident(by_y), + x_as = ident(table_names[by_x_table_id]), + y_as = ident(rhs) + ) + on <- sql_join_tbls(con, by = by, na_matches = na_matches) } - by <- list( - x = ident(by_x), - y = ident(by_y), - x_as = ident(table_names[by_x_table_id]), - y_as = ident(rhs) - ) - - sql_join_tbls(con, by = by, na_matches = na_matches) + sql_clause("ON", on, sep = " AND", parens = TRUE, lvl = 1) } ) - - from <- dbplyr_sql_subquery(con, x, name = table_names[[1]], lvl = lvl) - join_table_queries <- purrr::map2( - joins$table, - table_names[-1], - ~ dbplyr_sql_subquery(con, .x, name = .y, lvl = lvl) - ) - types <- toupper(paste0(joins$type, " JOIN")) - join_clauses <- vctrs::vec_interleave( - purrr::map2(join_table_queries, types, ~ sql_clause(.y, .x)), - purrr::map(ons, ~ sql_clause("ON", .x, sep = " AND", parens = TRUE, lvl = 1)) - ) + join_on_clauses <- vctrs::vec_interleave(join_clauses, on_clauses) list2( sql_clause_select(con, select_sql), sql_clause_from(from), - !!!join_clauses + !!!join_on_clauses ) %>% sql_format_clauses(lvl = lvl, con = con) } diff --git a/R/verb-joins.R b/R/verb-joins.R index 1dae10211..afd33681c 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -283,7 +283,7 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, suffix = suffix ) - joins_field <- join_get_metadata( + joins_data <- new_joins_data( x_lq, y_lq, new_query = new_query, @@ -305,7 +305,7 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, out <- lazy_multi_join_query( x = x_lq, - joins = joins_field, + joins = joins_data, table_names = vctrs::vec_rbind(table_names_x, table_names_y), vars = vars ) @@ -317,7 +317,7 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, x_lq$table_names$as[[1]] <- join_alias$x } - x_lq$joins <- vctrs::vec_rbind(x_lq$joins, joins_field) + x_lq$joins <- vctrs::vec_rbind(x_lq$joins, joins_data) x_lq$table_names <- vctrs::vec_rbind(x_lq$table_names, table_names_y) x_lq$vars <- vars @@ -428,7 +428,7 @@ multi_join_vars <- function(x_join_vars, vctrs::vec_rbind(x_join_vars, y_join_vars) } -join_get_metadata <- function(x_lq, y_lq, new_query, type, by, na_matches) { +new_joins_data <- function(x_lq, y_lq, new_query, type, by, na_matches) { if (new_query) { by_x_table_id <- rep_along(by$x, 1L) } else { From 6504c1877a8a5c51d0a186b1661f63faab565c00 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Nov 2022 09:43:09 +0000 Subject: [PATCH 15/28] Update documentation --- man/db-sql.Rd | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/man/db-sql.Rd b/man/db-sql.Rd index 62fa5336c..b8c0694f7 100644 --- a/man/db-sql.Rd +++ b/man/db-sql.Rd @@ -77,16 +77,7 @@ sql_query_join( lvl = 0 ) -sql_query_multi_join( - con, - x, - joins, - table_names, - vars, - all_vars_list, - ..., - lvl = 0 -) +sql_query_multi_join(con, x, joins, table_vars, vars, ..., lvl = 0) sql_query_semi_join(con, x, y, anti, by, vars, ..., lvl = 0) From 78b63d74a1f69dba65e7447bd50382ca09eb02e5 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Wed, 23 Nov 2022 10:01:34 +0000 Subject: [PATCH 16/28] Remove duplicated code --- R/query-join.R | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/R/query-join.R b/R/query-join.R index 41ab550bf..0b95ba1d1 100644 --- a/R/query-join.R +++ b/R/query-join.R @@ -242,33 +242,6 @@ sql_multi_join_var <- function(con, var, table_id, table_names, duplicated_vars) } } -sql_multi_join_var <- function(con, var, table_id, table_names, duplicated_vars) { - if (length(table_id) > 1) { - if (length(table_id) != 2 || length(var) != 2) { - cli_abort("{.arg table_id} or {.arg var} does not have length 2", .internal = TRUE) - } - - table_1 <- table_names[[table_id[[1]]]] - table_2 <- table_names[[table_id[[2]]]] - - out <- sql_expr( - COALESCE( - !!sql_table_prefix(con, var[[1]], table = ident(table_1)), - !!sql_table_prefix(con, var[[2]], table = ident(table_2)) - ), - con = con - ) - - return(out) - } - - if (tolower(var) %in% duplicated_vars) { - sql_table_prefix(con, var, ident(table_names[[table_id]])) - } else { - sql_escape_ident(con, var) - } -} - sql_join_vars <- function(con, vars, x_as = "LHS", y_as = "RHS", type) { multi_join_vars <- purrr::map2_dfr( vars$x, vars$y, From ef21e2a9398c7ff7d2f55d2af7ad52e63c09497d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 08:04:13 +0100 Subject: [PATCH 17/28] Update R/db-sql.R Co-authored-by: Hadley Wickham --- R/db-sql.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/db-sql.R b/R/db-sql.R index b78046ac5..c9768bc07 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -400,7 +400,7 @@ sql_query_multi_join <- function(con, } #' @export -#' @param vars tibble with three columns: +#' @param vars tibble with six columns: #' * `table` ``: the tables to join with. #' * `type` ``: the join type (left, right, inner, full). #' * `by_x`, `by_y` `>`: The columns to join by From ef7ff2ff841a10b5541835cb57a846a2a41f17ce Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 08:44:29 +0000 Subject: [PATCH 18/28] Avoid pipe --- R/db-sql.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/db-sql.R b/R/db-sql.R index c9768bc07..c38125df2 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -479,12 +479,13 @@ sql_query_multi_join.DBIConnection <- function(con, ) join_on_clauses <- vctrs::vec_interleave(join_clauses, on_clauses) - list2( + clauses <- list2( sql_clause_select(con, select_sql), sql_clause_from(from), !!!join_on_clauses - ) %>% - sql_format_clauses(lvl = lvl, con = con) + ) + + sql_format_clauses(clauses, lvl = lvl, con = con) } #' @rdname db-sql From eaf60f40325eb5f4350e23372276cf18d59aa4cd Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 09:13:31 +0000 Subject: [PATCH 19/28] Refactor `joins` data structure --- R/db-sql.R | 22 +++------------ R/lazy-join-query.R | 48 +++++++++++++++++++++----------- R/query-join.R | 3 +- R/verb-joins.R | 10 ++++--- tests/testthat/test-verb-joins.R | 21 ++++++++------ 5 files changed, 56 insertions(+), 48 deletions(-) diff --git a/R/db-sql.R b/R/db-sql.R index c38125df2..3a8cdb876 100644 --- a/R/db-sql.R +++ b/R/db-sql.R @@ -456,24 +456,10 @@ sql_query_multi_join.DBIConnection <- function(con, function(join_kw, from) sql_clause(join_kw, from) ) - on_clauses <- purrr::pmap( - vctrs::vec_cbind( - rhs = table_names[-1], - joins[c("by_x", "by_x_table_id", "by_y", "on", "na_matches")] - ), - function(rhs, by_x, by_x_table_id, by_y, on, na_matches) { - if (!is.na(on)) { - on <- sql(on) - } else { - by <- list( - x = ident(by_x), - y = ident(by_y), - x_as = ident(table_names[by_x_table_id]), - y_as = ident(rhs) - ) - on <- sql_join_tbls(con, by = by, na_matches = na_matches) - } - + on_clauses <- purrr::map( + joins$by, + function(by) { + on <- sql_join_tbls(con, by = by, na_matches = by$na_matches) sql_clause("ON", on, sep = " AND", parens = TRUE, lvl = 1) } ) diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index 5bf2a8cb7..9069ba61e 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -47,12 +47,12 @@ lazy_multi_join_query <- function(x, call = caller_env()) { stopifnot(inherits(x, "lazy_query")) - if (!identical(colnames(joins), c("table", "type", "by_x", "by_y", "by_x_table_id", "on", "na_matches"))) { - cli_abort("`joins` must have fields `table`, `type`, `by_x`, `by_y`, `by_x_table_id`, `on`, `na_matches`", .internal = TRUE) - } + # if (!identical(colnames(joins), c("table", "type", "by_x", "by_y", "by_x_table_id", "on", "na_matches"))) { + # cli_abort("`joins` must have fields `table`, `type`, `by_x`, `by_y`, `by_x_table_id`, `on`, `na_matches`", .internal = TRUE) + # } vctrs::vec_assert(joins$type, character(), arg = "joins$type", call = caller_env()) - vctrs::vec_assert(joins$on, character(), arg = "joins$on", call = caller_env()) - vctrs::vec_assert(joins$na_matches, character(), arg = "joins$na_matches", call = caller_env()) + # vctrs::vec_assert(joins$on, character(), arg = "joins$on", call = caller_env()) + # vctrs::vec_assert(joins$na_matches, character(), arg = "joins$na_matches", call = caller_env()) if (!identical(colnames(table_names), c("as", "name"))) { cli_abort("`table_names` must have fields `as`, `name`", .internal = TRUE) @@ -243,20 +243,23 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { all_y = op_vars(op$joins$table[[1]]) ) + by <- op$joins$by[[1]] + if (is.na(by$on)) { + by$on <- NULL + } else { + by$on <- sql(by$on) + } + by$x_as <- ident(table_names_out[[1]]) + by$y_as <- ident(table_names_out[[2]]) + out <- join_query( sql_optimise(sql_build(op$x, con), con), sql_optimise(sql_build(op$joins$table[[1]], con), con), vars = vars_classic, type = type, - by = list( - on = sql(op$joins$on), - x = op$joins$by_x[[1]], - y = op$joins$by_y[[1]], - x_as = ident(table_names_out[[1]]), - y_as = ident(table_names_out[[2]]) - ), + by = by, suffix = NULL, # it seems like the suffix is not used for rendering - na_matches = op$joins$na_matches + na_matches = by$na_matches ) return(out) } @@ -265,17 +268,30 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { table_names_out[auto_name] <- table_names_repaired[auto_name] } - all_vars_list <- purrr::map( - c(list(op$x), op$joins$table), + table_vars <- purrr::map( + set_names(c(list(op$x), op$joins$table), table_names_out), op_vars ) op$joins$table <- purrr::map(op$joins$table, ~ sql_optimise(sql_build(.x, con), con)) + for (i in seq_along(op$joins$by)) { + by_i <- op$joins$by[[i]] + by_i$x_as <- ident(table_names_out[op$joins$by_x_table_id[[i]]]) + by_i$y_as <- ident(table_names_out[i + 1L]) + if (is.na(by_i$on)) { + by_i$on <- NULL + } else { + by_i$on <- sql(by_i$on) + } + op$joins$by[[i]] <- by_i + } + multi_join_query( x = sql_optimise(sql_build(op$x, con), con), joins = op$joins, - table_vars = set_names(all_vars_list, table_names_out), + by_list = op$joins$by, + table_vars = table_vars, vars = op$vars ) } diff --git a/R/query-join.R b/R/query-join.R index 0b95ba1d1..c7d8d692c 100644 --- a/R/query-join.R +++ b/R/query-join.R @@ -51,7 +51,7 @@ print.multi_join_query <- function(x, ...) { cat_line("Type: ", paste0(x$joins$type[[i]])) cat_line("By:") - cat_line(indent(paste0(x$joins$by_x[[i]], "-", x$joins$by_y[[i]]))) + cat_line(indent(paste0(x$joins$by[[i]]$x, "-", x$joins$by[[i]]$y))) cat_line("Y:") cat_line(indent_print(sql_build(x$joins$table[[i]]))) @@ -85,6 +85,7 @@ sql_render.multi_join_query <- function(query, con = NULL, ..., subquery = FALSE x = x, joins = query$joins, table_vars = query$table_vars, + by_list = query$by_list, vars = query$vars, lvl = lvl ) diff --git a/R/verb-joins.R b/R/verb-joins.R index afd33681c..64d8de464 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -443,11 +443,13 @@ new_joins_data <- function(x_lq, y_lq, new_query, type, by, na_matches) { tibble( table = list(y_lq), type = type, - by_x = list(by$x), - by_y = list(by$y), by_x_table_id = list(by_x_table_id), - on = by$on %||% NA_character_, - na_matches + by = list(list( + x = ident(by$x), + y = ident(by$y), + on = sql(by$on), + na_matches = na_matches + )) ) } diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index ac304f741..8f01ed083 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -228,8 +228,8 @@ test_that("select() before join is inlined", { expect_equal(lq$vars$var, var) expect_equal(lq$vars$table, table) - expect_equal(lq$joins$by_x, list("x1")) - expect_equal(lq$joins$by_y, list("x2")) + expect_equal(lq$joins$by[[1]]$x, ident("x1")) + expect_equal(lq$joins$by[[1]]$y, ident("x2")) } out_left <- left_join( @@ -322,8 +322,8 @@ test_that("named by works in combination with inlined select", { expect_equal(op_vars(lq), c("id_x", "x.x")) expect_equal(lq$vars$var, list("id_x", "x")) expect_equal(lq$vars$table, list(1, 1)) - expect_equal(lq$joins$by_x, list(c("id_x", "x"))) - expect_equal(lq$joins$by_y, list(c("id_y", "x"))) + expect_equal(lq$joins$by[[1]]$x, ident(c("id_x", "x"))) + expect_equal(lq$joins$by[[1]]$y, ident(c("id_y", "x"))) }) test_that("suffix works in combination with inlined select", { @@ -467,8 +467,10 @@ test_that("multiple joins can use by column from any table", { out <- left_join(lf, lf2, by = "x") %>% left_join(lf3, by = c("x", "y")) joins_metadata <- out$lazy_query$joins - expect_equal(joins_metadata$by_x, list("x", c("x", "y"))) - expect_equal(joins_metadata$by_y, list("x", c("x", "y"))) + expect_equal(joins_metadata$by[[1]]$x, ident("x")) + expect_equal(joins_metadata$by[[2]]$x, ident(c("x", "y"))) + expect_equal(joins_metadata$by[[1]]$y, ident("x")) + expect_equal(joins_metadata$by[[2]]$y, ident(c("x", "y"))) expect_equal(joins_metadata$by_x_table_id, list(1, c(1, 2))) lf <- lazy_frame(x = 1, a = 1, .name = "df1") @@ -483,8 +485,10 @@ test_that("multiple joins can use by column from any table", { left_join(lf3, by = c("x", "y")) joins_metadata <- out$lazy_query$joins - expect_equal(joins_metadata$by_x, list("x", c("x", "y2"))) - expect_equal(joins_metadata$by_y, list("x", c("x", "y"))) + expect_equal(joins_metadata$by[[1]]$x, ident("x")) + expect_equal(joins_metadata$by[[2]]$x, ident(c("x", "y2"))) + expect_equal(joins_metadata$by[[1]]$y, ident("x")) + expect_equal(joins_metadata$by[[2]]$y, ident(c("x", "y"))) expect_equal(joins_metadata$by_x_table_id, list(1, c(1, 2))) }) @@ -833,4 +837,3 @@ test_that("add_suffixes works if no suffix requested", { expect_equal(add_suffixes(c("x", "x"), "y", ""), c("x", "x")) expect_equal(add_suffixes(c("x", "y"), "y", ""), c("x", "y")) }) - From a442e280adaeae9dbf0d7a2f8de3ed45a6e5c309 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 10:19:18 +0000 Subject: [PATCH 20/28] Pull out generation of table names --- R/lazy-join-query.R | 59 +++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index 9069ba61e..20004320e 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -207,23 +207,9 @@ sql_build.lazy_join_query <- function(op, con, ...) { #' @export sql_build.lazy_multi_join_query <- function(op, con, ...) { - auto_name <- is.na(op$table_names$as) - table_names_out <- dplyr::coalesce(op$table_names$as, op$table_names$name) - if (length(table_names_out) == 2) { - na_to_null <- function(x) { - if (is.na(x)) { - NULL - } else { - x - } - } - - x_name <- na_to_null(op$table_names$name[[1]]) - y_name <- na_to_null(op$table_names$name[[2]]) - x_alias <- na_to_null(op$table_names$as[[1]]) - y_alias <- na_to_null(op$table_names$as[[2]]) - table_names_out <- join_two_table_alias(x_name, y_name, x_alias, y_alias) + table_names_out <- generate_join_table_names(op$table_names) + if (length(table_names_out) == 2) { if (op$joins$type %in% c("full", "right")) { # construct a classical `join_query()` so that special handling of # full and right join continue to work @@ -234,7 +220,6 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { y_idx <- purrr::map_lgl(op$vars$table, ~ 2L %in% .x) vars_y <- purrr::map2_chr(op$vars$var, y_idx, ~ {if (.y) dplyr::last(.x) %||% .x[[1]] else NA_character_}) - vars_classic <- list( alias = op$vars$name, x = vars_x, @@ -244,11 +229,6 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { ) by <- op$joins$by[[1]] - if (is.na(by$on)) { - by$on <- NULL - } else { - by$on <- sql(by$on) - } by$x_as <- ident(table_names_out[[1]]) by$y_as <- ident(table_names_out[[2]]) @@ -263,9 +243,6 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { ) return(out) } - } else { - table_names_repaired <- vctrs::vec_as_names(table_names_out, repair = "unique", quiet = TRUE) - table_names_out[auto_name] <- table_names_repaired[auto_name] } table_vars <- purrr::map( @@ -279,23 +256,43 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { by_i <- op$joins$by[[i]] by_i$x_as <- ident(table_names_out[op$joins$by_x_table_id[[i]]]) by_i$y_as <- ident(table_names_out[i + 1L]) - if (is.na(by_i$on)) { - by_i$on <- NULL - } else { - by_i$on <- sql(by_i$on) - } op$joins$by[[i]] <- by_i } multi_join_query( x = sql_optimise(sql_build(op$x, con), con), joins = op$joins, - by_list = op$joins$by, table_vars = table_vars, vars = op$vars ) } +generate_join_table_names <- function(table_names) { + table_names_out <- dplyr::coalesce(table_names$as, table_names$name) + + if (length(table_names_out) != 2) { + table_names_repaired <- vctrs::vec_as_names(table_names_out, repair = "unique", quiet = TRUE) + auto_name <- is.na(table_names$as) + table_names_out[auto_name] <- table_names_repaired[auto_name] + + return(table_names_out) + } + + na_to_null <- function(x) { + if (is.na(x)) { + NULL + } else { + x + } + } + + x_name <- na_to_null(table_names$name[[1]]) + y_name <- na_to_null(table_names$name[[2]]) + x_alias <- na_to_null(table_names$as[[1]]) + y_alias <- na_to_null(table_names$as[[2]]) + join_two_table_alias(x_name, y_name, x_alias, y_alias) +} + #' @export sql_build.lazy_semi_join_query <- function(op, con, ...) { vars_prev <- op_vars(op$x) From 40c84b2a2d65fdcc9bd7c0f8a8fda7099979dc1a Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:05:53 +0000 Subject: [PATCH 21/28] Refactor `sql_build.lazy_multi_join_query()` --- R/lazy-join-query.R | 100 ++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index 20004320e..9f0a04aa5 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -209,61 +209,61 @@ sql_build.lazy_join_query <- function(op, con, ...) { sql_build.lazy_multi_join_query <- function(op, con, ...) { table_names_out <- generate_join_table_names(op$table_names) - if (length(table_names_out) == 2) { - if (op$joins$type %in% c("full", "right")) { - # construct a classical `join_query()` so that special handling of - # full and right join continue to work - type <- op$joins$type - x_idx <- purrr::map_lgl(op$vars$table, ~ 1L %in% .x) - vars_x <- purrr::map2_chr(op$vars$var, x_idx, ~ {if (.y) .x[[1]] else NA_character_}) - - y_idx <- purrr::map_lgl(op$vars$table, ~ 2L %in% .x) - vars_y <- purrr::map2_chr(op$vars$var, y_idx, ~ {if (.y) dplyr::last(.x) %||% .x[[1]] else NA_character_}) - - vars_classic <- list( - alias = op$vars$name, - x = vars_x, - y = vars_y, - all_x = op_vars(op$x), - all_y = op_vars(op$joins$table[[1]]) - ) - - by <- op$joins$by[[1]] - by$x_as <- ident(table_names_out[[1]]) - by$y_as <- ident(table_names_out[[2]]) - - out <- join_query( - sql_optimise(sql_build(op$x, con), con), - sql_optimise(sql_build(op$joins$table[[1]], con), con), - vars = vars_classic, - type = type, - by = by, - suffix = NULL, # it seems like the suffix is not used for rendering - na_matches = by$na_matches - ) - return(out) - } + if (length(table_names_out) > 2 || !op$joins$type %in% c("full", "right")) { + table_vars <- purrr::map( + set_names(c(list(op$x), op$joins$table), table_names_out), + op_vars + ) + + op$joins$table <- purrr::map(op$joins$table, ~ sql_optimise(sql_build(.x, con), con)) + op$joins$by <- purrr::map2( + op$joins$by, seq_along(op$joins$by), + function(by, i) { + by$x_as <- ident(table_names_out[op$joins$by_x_table_id[[i]]]) + by$y_as <- ident(table_names_out[i + 1L]) + by + } + ) + + out <- multi_join_query( + x = sql_optimise(sql_build(op$x, con), con), + joins = op$joins, + table_vars = table_vars, + vars = op$vars + ) + + return(out) } - table_vars <- purrr::map( - set_names(c(list(op$x), op$joins$table), table_names_out), - op_vars + # construct a classical `join_query()` so that special handling of + # full and right join continue to work + type <- op$joins$type + x_idx <- purrr::map_lgl(op$vars$table, ~ 1L %in% .x) + vars_x <- purrr::map2_chr(op$vars$var, x_idx, ~ {if (.y) .x[[1]] else NA_character_}) + + y_idx <- purrr::map_lgl(op$vars$table, ~ 2L %in% .x) + vars_y <- purrr::map2_chr(op$vars$var, y_idx, ~ {if (.y) dplyr::last(.x) %||% .x[[1]] else NA_character_}) + + vars_classic <- list( + alias = op$vars$name, + x = vars_x, + y = vars_y, + all_x = op_vars(op$x), + all_y = op_vars(op$joins$table[[1]]) ) - op$joins$table <- purrr::map(op$joins$table, ~ sql_optimise(sql_build(.x, con), con)) - - for (i in seq_along(op$joins$by)) { - by_i <- op$joins$by[[i]] - by_i$x_as <- ident(table_names_out[op$joins$by_x_table_id[[i]]]) - by_i$y_as <- ident(table_names_out[i + 1L]) - op$joins$by[[i]] <- by_i - } + by <- op$joins$by[[1]] + by$x_as <- ident(table_names_out[[1]]) + by$y_as <- ident(table_names_out[[2]]) - multi_join_query( - x = sql_optimise(sql_build(op$x, con), con), - joins = op$joins, - table_vars = table_vars, - vars = op$vars + join_query( + sql_optimise(sql_build(op$x, con), con), + sql_optimise(sql_build(op$joins$table[[1]], con), con), + vars = vars_classic, + type = type, + by = by, + suffix = NULL, # it seems like the suffix is not used for rendering + na_matches = by$na_matches ) } From 4f7e430d7afd97f5de008cbd4fed7e12477cca75 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:06:29 +0000 Subject: [PATCH 22/28] Use `length()` instead of `vec_size()` --- R/verb-joins.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index 64d8de464..8a1a258c8 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -259,7 +259,7 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, table_id <- 2L } else { x_join_vars <- x_lq$vars - table_id <- vctrs::vec_size(x_lq$table_names) + 1L + table_id <- length(x_lq$table_names) + 1L } inline_result <- join_inline_select(y$lazy_query, by$y, by$on) From 57090a643d30c2f8707162a2d570daac6fdfd5d1 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:10:40 +0000 Subject: [PATCH 23/28] Rename to `make_join_aliases()` --- R/verb-joins.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index 8a1a258c8..e3a2e6be5 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -241,7 +241,7 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, # This works even though `by` is used in `join_inline_select()` and updated # because this does not touch `by$x_as` and `by$y_as`. # TODO can this be really be done before inlining? - join_alias <- check_join_alias(x_as, y_as, sql_on, call) + join_alias <- make_join_aliases(x_as, y_as, sql_on, call) inline_result <- join_inline_select(x$lazy_query, by$x, by$on) x_lq <- inline_result$lq @@ -477,7 +477,7 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F ) # the table alias can only be determined after `select()` was inlined - join_alias <- check_join_alias(x_as, y_as, sql_on, call) + join_alias <- make_join_aliases(x_as, y_as, sql_on, call) x_name <- unclass(query_name(x_lq)) y_name <- unclass(query_name(y)) @@ -496,7 +496,7 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F ) } -check_join_alias <- function(x_as, y_as, sql_on, call) { +make_join_aliases <- function(x_as, y_as, sql_on, call) { x_as <- check_join_as1(x_as, arg = "x_as", sql_on, default = "LHS", call) y_as <- check_join_as1(y_as, arg = "y_as", sql_on, default = "RHS", call) From 0b0502c79fd861175c4cd4eb1897fdece760c21b Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:13:55 +0000 Subject: [PATCH 24/28] Avoid unnecessary negation --- R/verb-joins.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/verb-joins.R b/R/verb-joins.R index e3a2e6be5..4e6656585 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -352,7 +352,7 @@ join_needs_new_query <- function(x_lq, join_alias, type) { return(TRUE) } - if (!type %in% c("left", "inner")) { + if (type %in% c("full", "right")) { return(TRUE) } From 155f5caebea18c6a74375dad10a16d879e0059df Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:14:08 +0000 Subject: [PATCH 25/28] Use postgres for simpler snapshot --- tests/testthat/_snaps/verb-joins.md | 2 +- tests/testthat/test-verb-joins.R | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/testthat/_snaps/verb-joins.md b/tests/testthat/_snaps/verb-joins.md index 5d7738440..dd0a952e8 100644 --- a/tests/testthat/_snaps/verb-joins.md +++ b/tests/testthat/_snaps/verb-joins.md @@ -87,7 +87,7 @@ SELECT `lf1`.`x` AS `x` FROM `lf1` LEFT JOIN `lf2` - ON (CASE WHEN (`lf1`.`x` = `lf2`.`x`) OR (`lf1`.`x` IS NULL AND `lf2`.`x` IS NULL) THEN 0 ELSE 1 END = 0) + ON (`lf1`.`x` IS NOT DISTINCT FROM `lf2`.`x`) # suffix arg is checked diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 8f01ed083..06d27e609 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -568,8 +568,9 @@ test_that("join verbs generate expected ops", { }) test_that("can optionally match NA values", { - lf1 <- lazy_frame(x = 1, .name = "lf1") - lf2 <- lazy_frame(x = 1, .name = "lf2") + con <- simulate_postgres() + lf1 <- lazy_frame(x = 1, .name = "lf1", con = con) + lf2 <- lazy_frame(x = 1, .name = "lf2", con = con) expect_snapshot(left_join(lf1, lf2, by = "x", na_matches = "na")) }) From 5d4fd19a1ae2e0937d09aed3a2282dea4164439d Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:16:25 +0000 Subject: [PATCH 26/28] Split test --- tests/testthat/_snaps/verb-joins.md | 2 +- tests/testthat/test-verb-joins.R | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/verb-joins.md b/tests/testthat/_snaps/verb-joins.md index dd0a952e8..0a1200bb4 100644 --- a/tests/testthat/_snaps/verb-joins.md +++ b/tests/testthat/_snaps/verb-joins.md @@ -63,7 +63,7 @@ WHERE (`lf1`.`x1` = `RHS`.`x`) ) -# multiple joins create a single query +# multiple joins produce separate queries if using right/full join Code remote_query(out) diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 06d27e609..977fe523c 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -440,8 +440,13 @@ INNER JOIN `df3` ON (`df1`.`x` = `df3`.`x`)" ) ) +}) + +test_that("multiple joins produce separate queries if using right/full join", { + lf <- lazy_frame(x = 1, a = 1, .name = "df1") + lf2 <- lazy_frame(x = 1, b = 2, .name = "df2") + lf3 <- lazy_frame(x = 1, b = 2, .name = "df3") - # `right_join()` and `full_join()` are not inlined out <- left_join(lf, lf2, by = "x") %>% right_join(lf3, by = "x") From 8a267d1877387ce2dd8045902c39c7419c604c18 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Thu, 24 Nov 2022 12:20:30 +0000 Subject: [PATCH 27/28] Replace explicity test with snapshot --- tests/testthat/_snaps/verb-joins.md | 13 +++++++++++++ tests/testthat/test-verb-joins.R | 12 +----------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/testthat/_snaps/verb-joins.md b/tests/testthat/_snaps/verb-joins.md index 0a1200bb4..c2b46e77f 100644 --- a/tests/testthat/_snaps/verb-joins.md +++ b/tests/testthat/_snaps/verb-joins.md @@ -63,6 +63,19 @@ WHERE (`lf1`.`x1` = `RHS`.`x`) ) +# multiple joins create a single query + + Code + out + Output + + SELECT `df1`.*, `df2`.`b` AS `b.x`, `df3`.`b` AS `b.y` + FROM `df1` + LEFT JOIN `df2` + ON (`df1`.`x` = `df2`.`x`) + INNER JOIN `df3` + ON (`df1`.`x` = `df3`.`x`) + # multiple joins produce separate queries if using right/full join Code diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 977fe523c..245246989 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -429,17 +429,7 @@ test_that("multiple joins create a single query", { expect_equal(lq$vars$table, list(1L, 1L, 2L, 3L)) expect_equal(lq$vars$var, list("x", "a", "b", "b")) - expect_equal( - remote_query(out), - sql( -"SELECT `df1`.*, `df2`.`b` AS `b.x`, `df3`.`b` AS `b.y` -FROM `df1` -LEFT JOIN `df2` - ON (`df1`.`x` = `df2`.`x`) -INNER JOIN `df3` - ON (`df1`.`x` = `df3`.`x`)" - ) - ) + expect_snapshot(out) }) test_that("multiple joins produce separate queries if using right/full join", { From 678ffa6161e2d183041960a060c2157c092caa29 Mon Sep 17 00:00:00 2001 From: Maximilian Girlich Date: Tue, 29 Nov 2022 09:20:24 +0000 Subject: [PATCH 28/28] Refactor join table name logic --- R/lazy-join-query.R | 40 ++++++------------ R/verb-joins.R | 72 +++++++++++++++++--------------- tests/testthat/test-verb-joins.R | 4 +- 3 files changed, 53 insertions(+), 63 deletions(-) diff --git a/R/lazy-join-query.R b/R/lazy-join-query.R index 9f0a04aa5..4d6059fe0 100644 --- a/R/lazy-join-query.R +++ b/R/lazy-join-query.R @@ -47,18 +47,16 @@ lazy_multi_join_query <- function(x, call = caller_env()) { stopifnot(inherits(x, "lazy_query")) - # if (!identical(colnames(joins), c("table", "type", "by_x", "by_y", "by_x_table_id", "on", "na_matches"))) { - # cli_abort("`joins` must have fields `table`, `type`, `by_x`, `by_y`, `by_x_table_id`, `on`, `na_matches`", .internal = TRUE) - # } + if (!identical(colnames(joins), c("table", "type", "by_x_table_id", "by"))) { + cli_abort("`joins` must have fields `table`, `type`, `by_x_table_id`, `by`", .internal = TRUE) + } vctrs::vec_assert(joins$type, character(), arg = "joins$type", call = caller_env()) - # vctrs::vec_assert(joins$on, character(), arg = "joins$on", call = caller_env()) - # vctrs::vec_assert(joins$na_matches, character(), arg = "joins$na_matches", call = caller_env()) - if (!identical(colnames(table_names), c("as", "name"))) { - cli_abort("`table_names` must have fields `as`, `name`", .internal = TRUE) + if (!identical(colnames(table_names), c("name", "from"))) { + cli_abort("`table_names` must have fields `name`, `from`", .internal = TRUE) } - vctrs::vec_assert(table_names$as, character(), arg = "table_names$as", call = caller_env()) vctrs::vec_assert(table_names$name, character(), arg = "table_names$as", call = caller_env()) + vctrs::vec_assert(table_names$from, character(), arg = "table_names$from", call = caller_env()) if (!identical(colnames(vars), c("name", "table", "var"))) { cli_abort("`vars` must have fields `name`, `table`, `var`", .internal = TRUE) @@ -268,29 +266,15 @@ sql_build.lazy_multi_join_query <- function(op, con, ...) { } generate_join_table_names <- function(table_names) { - table_names_out <- dplyr::coalesce(table_names$as, table_names$name) - - if (length(table_names_out) != 2) { - table_names_repaired <- vctrs::vec_as_names(table_names_out, repair = "unique", quiet = TRUE) - auto_name <- is.na(table_names$as) - table_names_out[auto_name] <- table_names_repaired[auto_name] - - return(table_names_out) - } + if (length(table_names$name) != 2) { + table_names_repaired <- vctrs::vec_as_names(table_names$name, repair = "unique", quiet = TRUE) + auto_name <- table_names$from != "as" + table_names$name[auto_name] <- table_names_repaired[auto_name] - na_to_null <- function(x) { - if (is.na(x)) { - NULL - } else { - x - } + return(table_names$name) } - x_name <- na_to_null(table_names$name[[1]]) - y_name <- na_to_null(table_names$name[[2]]) - x_alias <- na_to_null(table_names$as[[1]]) - y_alias <- na_to_null(table_names$as[[2]]) - join_two_table_alias(x_name, y_name, x_alias, y_alias) + join_two_table_alias(table_names$name, table_names$from) } #' @export diff --git a/R/verb-joins.R b/R/verb-joins.R index 4e6656585..8afba2fe0 100644 --- a/R/verb-joins.R +++ b/R/verb-joins.R @@ -292,17 +292,10 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, na_matches = na_matches ) - table_names_y <- tibble( - as = join_alias$y %||% NA_character_, - name = as.character(query_name(y_lq) %||% NA) - ) + table_names_y <- make_table_names(join_alias$y, y_lq) if (new_query) { - table_names_x <- tibble( - as = join_alias$x %||% NA_character_, - name = as.character(query_name(x_lq) %||% NA) - ) - + table_names_x <- make_table_names(join_alias$x, x_lq) out <- lazy_multi_join_query( x = x_lq, joins = joins_data, @@ -314,7 +307,8 @@ add_join <- function(x, y, type, by = NULL, sql_on = NULL, copy = FALSE, # `x_lq` must be a `lazy_multi_join_query` so it can be modified directly if (!is_null(join_alias$x)) { - x_lq$table_names$as[[1]] <- join_alias$x + x_lq$table_names$name[[1]] <- join_alias$x + x_lq$table_names$from[[1]] <- "as" } x_lq$joins <- vctrs::vec_rbind(x_lq$joins, joins_data) @@ -359,20 +353,20 @@ join_needs_new_query <- function(x_lq, join_alias, type) { x_as <- join_alias$x y_as <- join_alias$y - as_current <- x_lq$table_names$as + names <- x_lq$table_names$name + from <- x_lq$table_names$from if (!is_null(x_as)) { - x_as_current <- as_current[[1]] - if (!is.na(x_as_current) && !identical(x_as, x_as_current)) { + if (from[[1]] == "as" && !identical(x_as, names[[1]])) { return(TRUE) } - if (x_as %in% as_current[-1]) { + if (x_as %in% names[-1][from[-1] == "as"]) { return(TRUE) } } if (!is_null(y_as)) { - if (y_as %in% as_current) { + if (y_as %in% names[from == "as"]) { return(TRUE) } } @@ -479,9 +473,12 @@ add_semi_join <- function(x, y, anti = FALSE, by = NULL, sql_on = NULL, copy = F # the table alias can only be determined after `select()` was inlined join_alias <- make_join_aliases(x_as, y_as, sql_on, call) - x_name <- unclass(query_name(x_lq)) - y_name <- unclass(query_name(y)) - by[c("x_as", "y_as")] <- join_two_table_alias(x_name, y_name, join_alias$x, join_alias$y) + x_alias <- make_table_names(join_alias$x, x_lq) + y_alias <- make_table_names(join_alias$y, y) + by[c("x_as", "y_as")] <- join_two_table_alias( + c(x_alias$name, y_alias$name), + c(x_alias$from, y_alias$from) + ) by$x_as <- ident(by$x_as) by$y_as <- ident(by$y_as) @@ -507,6 +504,18 @@ make_join_aliases <- function(x_as, y_as, sql_on, call) { list(x = x_as, y = y_as) } +make_table_names <- function(as, lq) { + name <- unclass(query_name(lq)) + + if (!is.null(as)) { + tibble(name = as, from = "as") + } else if (!is.null(name)) { + tibble(name = name, from = "name") + } else { + tibble(name = NA_character_, from = "") + } +} + check_join_as1 <- function(as, arg, sql_on, default, call) { if (!is_null(as)) { vctrs::vec_assert(as, character(), size = 1, arg = arg, call = call) @@ -590,34 +599,31 @@ join_vars <- function(x_names, y_names, type, by, suffix = c(".x", ".y"), call = ) } -join_two_table_alias <- function(x_name, y_name, x_alias, y_alias) { - stopifnot(is.null(x_name) || is.character(x_name)) - stopifnot(is.null(y_name) || is.character(y_name)) - stopifnot(is.null(x_alias) || is.character(x_alias)) - stopifnot(is.null(y_alias) || is.character(y_alias)) +join_two_table_alias <- function(names, from) { + stopifnot(is.character(names)) + stopifnot(is.character(from)) + stopifnot(length(names) == 2L) - x_out <- x_alias %||% x_name %||% "LHS" - y_out <- y_alias %||% y_name %||% "RHS" - out <- c(x_out, y_out) + out <- dplyr::coalesce(names, c("LHS", "RHS")) - if (!identical(x_out, y_out)) { + if (!identical(out[1], out[2])) { return(out) } # -> must rename - if (is_null(x_alias) && is_null(y_alias)) { + if (from[1] != "as" && from[2] != "as") { # self join of named table - if (!is_null(x_name) && !is_null(y_name) && identical(x_name, y_name)) { + if (!is.na(names[1]) && !is.na(names[2]) && identical(names[1], names[2])) { out <- c( - paste0(x_name, "_LHS"), - paste0(y_name, "_RHS") + paste0(names[1], "_LHS"), + paste0(names[2], "_RHS") ) return(out) } } - out_repaired <- vctrs::vec_as_names(c(x_out, y_out), repair = "unique", quiet = TRUE) - may_repair <- c(is_null(x_alias), is_null(y_alias)) + out_repaired <- vctrs::vec_as_names(out, repair = "unique", quiet = TRUE) + may_repair <- c(from[1] != "as", from[2] != "as") out[may_repair] <- out_repaired[may_repair] out diff --git a/tests/testthat/test-verb-joins.R b/tests/testthat/test-verb-joins.R index 245246989..cce7c0f23 100644 --- a/tests/testthat/test-verb-joins.R +++ b/tests/testthat/test-verb-joins.R @@ -424,7 +424,7 @@ test_that("multiple joins create a single query", { inner_join(lf3, by = "x") lq <- out$lazy_query expect_s3_class(lq, "lazy_multi_join_query") - expect_equal(lq$table_names, tibble(as = NA_character_, name = c("df1", "df2", "df3"))) + expect_equal(lq$table_names, tibble(name = c("df1", "df2", "df3"), from = "name")) expect_equal(lq$vars$name, c("x", "a", "b.x", "b.y")) expect_equal(lq$vars$table, list(1L, 1L, 2L, 3L)) expect_equal(lq$vars$var, list("x", "a", "b", "b")) @@ -496,7 +496,7 @@ test_that("multi joins work with x_as", { inner_join(lf3, by = "x", y_as = "lf3") lq <- out$lazy_query expect_s3_class(lq, "lazy_multi_join_query") - expect_equal(lq$table_names, tibble(as = c("lf1", "lf2", "lf3"), name = c("df1", "df2", "df3"))) + expect_equal(lq$table_names, tibble(name = c("lf1", "lf2", "lf3"), from = "as")) # `x_as` provided twice with the same name -> one query out2 <- left_join(lf, lf2, by = "x", x_as = "lf1", y_as = "lf2") %>%