From 8adac2cba32fb28ea1eff9590ffebe6f96f6ba10 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 4 Nov 2024 22:39:33 +0100
Subject: [PATCH 01/71] init

---
 R/lazyframe-frame.R               | 1416 +++++++++++++++++++++++++++++
 src/rust/Cargo.toml               |    5 +
 src/rust/src/lazyframe/general.rs |  621 ++++++++++++-
 3 files changed, 2029 insertions(+), 13 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 0f65d86d..c00fb7cb 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -358,3 +358,1419 @@ lazyframe__tail <- function(n = 5) {
   self$`_ldf`$tail(n) |>
     wrap()
 }
+
+
+#' Get the first row of a LazyFrame
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$first()$collect()
+lazyframe__first <- function() {
+  wrap({
+    self$`_rexpr`$first()
+  })
+}
+
+#' Get the last row of a LazyFrame
+#' @description Aggregate the columns in the LazyFrame to their maximum value.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$last()$collect()
+lazyframe__last <- function() {
+  wrap({
+    self$`_rexpr`$last()
+  })
+}
+
+#' Max
+#' @description Aggregate the columns in the LazyFrame to their maximum value.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$max()$collect()
+lazyframe__max <- function() {
+  wrap({
+    self$`_rexpr`$max()
+  })
+}
+
+#' Mean
+#' @description Aggregate the columns in the LazyFrame to their mean value.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$mean()$collect()
+lazyframe__mean <- function() {
+  wrap({
+    self$`_rexpr`$mean()
+  })
+}
+
+#' Median
+#' @description Aggregate the columns in the LazyFrame to their median value.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$median()$collect()
+lazyframe__median <- function() {
+  wrap({
+    self$`_rexpr`$median()
+  })
+}
+
+#' Min
+#' @description Aggregate the columns in the LazyFrame to their minimum value.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$min()$collect()
+lazyframe__min <- function() {
+  wrap({
+    self$`_rexpr`$min()
+  })
+}
+
+#' Sum
+#' @description Aggregate the columns of this LazyFrame to their sum values.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$sum()$collect()
+lazyframe__sum <- function() {
+  wrap({
+    self$`_rexpr`$sum()
+  })
+}
+
+#' Var
+#' @description Aggregate the columns of this LazyFrame to their variance values.
+#'
+#' @inheritParams DataFrame_var
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$var()$collect()
+lazyframe__var <- function(ddof = 1) {
+  wrap({
+    self$`_rexpr`$var(ddof)
+  })
+}
+
+#' Std
+#' @description Aggregate the columns of this LazyFrame to their standard
+#' deviation values.
+#'
+#' @inheritParams DataFrame_std
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$std()$collect()
+lazyframe__std <- function(ddof = 1) {
+  wrap({
+    self$`_rexpr`$std(ddof)
+  })
+}
+
+#' Quantile
+#' @description Aggregate the columns in the DataFrame to a unique quantile
+#' value. Use `$describe()` to specify several quantiles.
+#' @inheritParams DataFrame_quantile
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$quantile(.4)$collect()
+lazyframe__quantile <- function(quantile, interpolation = "nearest") {
+  wrap({
+    self$`_rexpr`$quantile(wrap_e_result(quantile), interpolation)
+  })
+}
+
+#' @inherit Expr_fill_nan title params
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' df <- pl$LazyFrame(
+#'   a = c(1.5, 2, NaN, 4),
+#'   b = c(1.5, NaN, NaN, 4)
+#' )
+#' df$fill_nan(99)$collect()
+lazyframe__fill_nan <- function(value) {
+  wrap({
+    self$`_rexpr`$fill_nan(value)
+  })
+}
+
+#' @inherit DataFrame_fill_null title description params
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' df <- pl$LazyFrame(
+#'   a = c(1.5, 2, NA, 4),
+#'   b = c(1.5, NA, NA, 4)
+#' )
+#' df$fill_null(99)$collect()
+lazyframe__fill_null <- function(fill_value) {
+  wrap({
+    self$`_rexpr`$fill_null(wrap_e_result(fill_value))
+  })
+}
+
+#' Shift a LazyFrame
+#'
+#' @inherit DataFrame_shift description params
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:4, b = 5:8)
+#'
+#' lf$shift(2)$collect()
+#'
+#' lf$shift(-2)$collect()
+#'
+#' lf$shift(-2, fill_value = 100)$collect()
+lazyframe__shift <- function(n = 1, fill_value = NULL) {
+  self$`_rexpr`$shift(n, fill_value)
+}
+
+#' Drop columns of a LazyFrame
+#'
+#' @inheritParams DataFrame_drop
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$drop(c("mpg", "hp"))$collect()
+#'
+#' # equivalent
+#' as_polars_lf(mtcars)$drop("mpg", "hp")$collect()
+lazyframe__drop <- function(..., strict = TRUE) {
+  cols <- unpack_list(..., .context = "in $drop():") |>
+    unlist()
+  if (length(cols) == 0) {
+    return(self)
+  }
+  self$`_rexpr`$drop(cols, strict)
+}
+
+#' Reverse
+#' @description Reverse the LazyFrame (the last row becomes the first one, etc.).
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' as_polars_lf(mtcars)$reverse()$collect()
+lazyframe__reverse <- function() {
+  wrap({
+    self$`_rexpr`$reverse()
+  })
+}
+
+#' Slice
+#' @description Get a slice of the LazyFrame.
+#' @inheritParams DataFrame_slice
+#' @return A [LazyFrame][lazyframe__class]
+#' @examples
+#' as_polars_lf(mtcars)$slice(2, 4)$collect()
+#' as_polars_lf(mtcars)$slice(30)$collect()
+#' mtcars[2:6, ]
+lazyframe__slice <- function(offset, length = NULL) {
+  wrap({
+    self$`_rexpr`$slice(offset, length)
+  })
+}
+
+#' Get the last `n` rows.
+#'
+#' @inherit lazyframe__head return params
+#' @inheritParams lazyframe__head
+#' @seealso [`<LazyFrame>$head()`][lazyframe__head]
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+#'
+#' lf$tail()$collect()
+#'
+#' lf$tail(2)$collect()
+lazyframe__tail <- function(n = 5L) {
+  wrap({
+    self$`_rexpr`$tail(n)
+  })
+}
+
+#' @inherit DataFrame_drop_nulls title description params
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' tmp <- mtcars
+#' tmp[1:3, "mpg"] <- NA
+#' tmp[4, "hp"] <- NA
+#' tmp <- pl$LazyFrame(tmp)
+#'
+#' # number of rows in `tmp` before dropping nulls
+#' tmp$collect()$height
+#'
+#' tmp$drop_nulls()$collect()$height
+#' tmp$drop_nulls("mpg")$collect()$height
+#' tmp$drop_nulls(c("mpg", "hp"))$collect()$height
+lazyframe__drop_nulls <- function(subset = NULL) {
+  if (!is.null(subset)) subset <- as.list(subset)
+  self$`_rexpr`$drop_nulls(subset)
+}
+
+#' @inherit DataFrame_unique title description params
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' df <- pl$LazyFrame(
+#'   x = sample(10, 100, rep = TRUE),
+#'   y = sample(10, 100, rep = TRUE)
+#' )
+#' df$collect()$height
+#'
+#' df$unique()$collect()$height
+#' df$unique(subset = "x")$collect()$height
+#'
+#' df$unique(keep = "last")
+#'
+#' # only keep unique rows
+#' df$unique(keep = "none")
+lazyframe__unique <- function(
+    subset = NULL,
+    ...,
+    keep = "any",
+    maintain_order = FALSE) {
+  wrap({
+    self$`_rexpr`$unique(subset, keep, maintain_order)
+  })
+}
+
+#' Group a LazyFrame
+#' @description This doesn't modify the data but only stores information about
+#' the group structure. This structure can then be used by several functions
+#' (`$agg()`, `$filter()`, etc.).
+#'
+#' @param ... Column(s) to group by.
+#' Accepts [expression][Expr_class] input. Characters are parsed as column names.
+#' @param maintain_order Ensure that the order of the groups is consistent with the input data.
+#' This is slower than a default group by.
+#' Setting this to `TRUE` blocks the possibility to run on the streaming engine.
+#' The default value can be changed with `options(polars.maintain_order = TRUE)`.
+#' @return [LazyGroupBy][LazyGroupBy_class] (a LazyFrame with special groupby methods like `$agg()`)
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = c("a", "b", "a", "b", "c"),
+#'   b = c(1, 2, 1, 3, 3),
+#'   c = c(5, 4, 3, 2, 1)
+#' )
+#'
+#' lf$group_by("a")$agg(pl$col("b")$sum())$collect()
+#'
+#' # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input.
+#' lf$group_by("a", maintain_order = TRUE)$agg(pl$col("c"))$collect()
+#'
+#' # Group by multiple columns by passing a list of column names.
+#' lf$group_by(c("a", "b"))$agg(pl$max("c"))$collect()
+#'
+#' # Or pass some arguments to group by multiple columns in the same way.
+#' # Expressions are also accepted.
+#' lf$group_by("a", pl$col("b") %/% 2)$agg(
+#'   pl$col("c")$mean()
+#' )$collect()
+#'
+#' # The columns will be renamed to the argument names.
+#' lf$group_by(d = "a", e = pl$col("b") %/% 2)$agg(
+#'   pl$col("c")$mean()
+#' )$collect()
+lazyframe__group_by <- function(..., maintain_order = polars_options()$maintain_order) {
+  self$`_rexpr`$group_by(unpack_list(..., .context = "in $group_by():"), maintain_order)
+}
+
+#' Join LazyFrames
+#'
+#' This function can do both mutating joins (adding columns based on matching
+#' observations, for example with `how = "left"`) and filtering joins (keeping
+#' observations based on matching observations, for example with `how =
+#' "inner"`).
+#'
+#' @param other LazyFrame to join with.
+#' @param on Either a vector of column names or a list of expressions and/or
+#'   strings. Use `left_on` and `right_on` if the column names to match on are
+#'   different between the two DataFrames.
+#' @param how One of the following methods: "inner", "left", "right", "full",
+#'   "semi", "anti", "cross".
+#' @param ... Ignored.
+#' @param left_on,right_on Same as `on` but only for the left or the right
+#'   DataFrame. They must have the same length.
+#' @param suffix Suffix to add to duplicated column names.
+#' @param validate Checks if join is of specified type:
+#' * `"m:m"` (default): many-to-many, doesn't perform any checks;
+#' * `"1:1"`: one-to-one, check if join keys are unique in both left and right
+#'   datasets;
+#' * `"1:m"`: one-to-many, check if join keys are unique in left dataset
+#' * `"m:1"`: many-to-one, check if join keys are unique in right dataset
+#'
+#' Note that this is currently not supported by the streaming engine, and is
+#' only supported when joining by single columns.
+#'
+#' @param join_nulls Join on null values. By default null values will never
+#'   produce matches.
+#' @param allow_parallel Allow the physical plan to optionally evaluate the
+#'   computation of both DataFrames up to the join in parallel.
+#' @param force_parallel Force the physical plan to evaluate the computation of
+#'   both DataFrames up to the join in parallel.
+#' @param coalesce Coalescing behavior (merging of join columns).
+#' - `NULL`: join specific.
+#' - `TRUE`: Always coalesce join columns.
+#' - `FALSE`: Never coalesce join columns.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' # inner join by default
+#' df1 <- pl$LazyFrame(list(key = 1:3, payload = c("f", "i", NA)))
+#' df2 <- pl$LazyFrame(list(key = c(3L, 4L, 5L, NA_integer_)))
+#' df1$join(other = df2, on = "key")
+#'
+#' # cross join
+#' df1 <- pl$LazyFrame(x = letters[1:3])
+#' df2 <- pl$LazyFrame(y = 1:4)
+#' df1$join(other = df2, how = "cross")
+#'
+#' # use "validate" to ensure join keys are not duplicated
+#' df1 <- pl$LazyFrame(x = letters[1:5], y = 1:5)
+#' df2 <- pl$LazyFrame(x = c("a", letters[1:4]), y2 = 6:10)
+#'
+#' # this throws an error because there are two keys in df2 that match the key
+#' # in df1
+#' tryCatch(
+#'   df1$join(df2, on = "x", validate = "1:1")$collect(),
+#'   error = function(e) print(e)
+#' )
+lazyframe__join <- function(
+    other,
+    on = NULL,
+    how = "inner",
+    ...,
+    left_on = NULL,
+    right_on = NULL,
+    suffix = "_right",
+    validate = "m:m",
+    join_nulls = FALSE,
+    allow_parallel = TRUE,
+    force_parallel = FALSE,
+    coalesce = NULL) {
+  uw <- \(res) wrap({
+    res
+  })
+
+
+  if (!is_polars_lf(other)) {
+    Err_plain("`other` must be a LazyFrame.") |> uw()
+  }
+
+  if (how == "cross") {
+    if (!is.null(on) || !is.null(left_on) || !is.null(right_on)) {
+      Err_plain("cross join should not pass join keys.") |> uw()
+    }
+    rexprs_left <- as.list(NULL)
+    rexprs_right <- as.list(NULL)
+  } else {
+    if (!is.null(on)) {
+      rexprs_right <- rexprs_left <- as.list(on)
+    } else if ((!is.null(left_on) && !is.null(right_on))) {
+      rexprs_left <- as.list(left_on)
+      rexprs_right <- as.list(right_on)
+    } else {
+      Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw()
+    }
+  }
+
+  self$`_rexpr`$join(
+    lf, other, rexprs_left, rexprs_right, how, validate, join_nulls, suffix,
+    allow_parallel, force_parallel, coalesce
+  ) |>
+    uw()
+}
+
+#' Perform a join based on one or multiple (in)equality predicates
+#'
+#' @description
+#' This performs an inner join, so only rows where all predicates are true are
+#' included in the result, and a row from either LazyFrame may be included
+#' multiple times in the result.
+#'
+#' Note that the row order of the input LazyFrames is not preserved.
+#'
+#' @param other LazyFrame to join with.
+#' @param ... (In)Equality condition to join the two tables on. When a column
+#' name occurs in both tables, the proper suffix must be applied in the
+#' predicate. For example, if both tables have a column `"x"` that you want to
+#' use in the conditions, you must refer to the column of the right table as
+#' `"x<suffix>"`.
+#' @param suffix Suffix to append to columns with a duplicate name.
+#'
+#' @return A LazyFrame
+#'
+#' @examples
+#' east <- pl$LazyFrame(
+#'   id = c(100, 101, 102),
+#'   dur = c(120, 140, 160),
+#'   rev = c(12, 14, 16),
+#'   cores = c(2, 8, 4)
+#' )
+#'
+#' west <- pl$LazyFrame(
+#'   t_id = c(404, 498, 676, 742),
+#'   time = c(90, 130, 150, 170),
+#'   cost = c(9, 13, 15, 16),
+#'   cores = c(4, 2, 1, 4)
+#' )
+#'
+#' east$join_where(
+#'   west,
+#'   pl$col("dur") < pl$col("time"),
+#'   pl$col("rev") < pl$col("cost")
+#' )$collect()
+lazyframe__join_where <- function(
+    other,
+    ...,
+    suffix = "_right") {
+  uw <- \(res) wrap({
+    res
+  })
+
+
+  if (!is_polars_lf(other)) {
+    Err_plain("`other` must be a LazyFrame.") |> uw()
+  }
+
+  self$`_rexpr`$join_where(lf, other, unpack_list(..., .context = "in $join_where():"), suffix) |>
+    uw()
+}
+
+
+
+#' Sort the LazyFrame by the given columns
+#'
+#' @inheritParams Series_sort
+#' @param by Column(s) to sort by. Can be character vector of column names,
+#' a list of Expr(s) or a list with a mix of Expr(s) and column names.
+#' @param ... More columns to sort by as above but provided one Expr per argument.
+#' @param descending Logical. Sort in descending order (default is `FALSE`). This must be
+#' either of length 1 or a logical vector of the same length as the number of
+#' Expr(s) specified in `by` and `...`.
+#' @param nulls_last A logical or logical vector of the same length as the number of columns.
+#' If `TRUE`, place `null` values last insead of first.
+#' @param maintain_order Whether the order should be maintained if elements are
+#' equal. If `TRUE`, streaming is not possible and performance might be worse
+#' since this requires a stable search.
+#' @inherit as_polars_lf return
+#' @keywords  LazyFrame
+#' @examples
+#' df <- mtcars
+#' df$mpg[1] <- NA
+#' df <- pl$LazyFrame(df)
+#' df$sort("mpg")$collect()
+#' df$sort("mpg", nulls_last = TRUE)$collect()
+#' df$sort("cyl", "mpg")$collect()
+#' df$sort(c("cyl", "mpg"))$collect()
+#' df$sort(c("cyl", "mpg"), descending = TRUE)$collect()
+#' df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE))$collect()
+#' df$sort(pl$col("cyl"), pl$col("mpg"))$collect()
+lazyframe__sort <- function(
+    by,
+    ...,
+    descending = FALSE,
+    nulls_last = FALSE,
+    maintain_order = FALSE,
+    multithreaded = TRUE) {
+  self$`_rexpr`$sort_by_exprs(
+    lf, wrap_elist_result(by, str_to_lit = FALSE), err_on_named_args(...),
+    descending, nulls_last, maintain_order, multithreaded
+  )
+}
+
+
+#' Perform joins on nearest keys
+#'
+#' This is similar to a left-join except that we match on nearest key rather
+#' than equal keys.
+#'
+#' Both tables (DataFrames or LazyFrames) must be sorted by the asof_join key.
+#' @param other LazyFrame
+#' @param ...  Not used, blocks use of further positional arguments
+#' @inheritParams DataFrame_join
+#' @param by Join on these columns before performing asof join. Either a vector
+#' of column names or a list of expressions and/or strings. Use `left_by` and
+#' `right_by` if the column names to match on are different between the two
+#' tables.
+#' @param by_left,by_right Same as `by` but only for the left or the right
+#' table. They must have the same length.
+#' @param strategy Strategy for where to find match:
+#' * "backward" (default): search for the last row in the right table whose `on`
+#'   key is less than or equal to the left key.
+#' * "forward": search for the first row in the right table whose `on` key is
+#'   greater than or equal to the left key.
+#' * "nearest": search for the last row in the right table whose value is nearest
+#'   to the left key. String keys are not currently supported for a nearest
+#'   search.
+#' @param tolerance
+#' Numeric tolerance. By setting this the join will only be done if the near
+#' keys are within this distance. If an asof join is done on columns of dtype
+#' "Date", "Datetime", "Duration" or "Time", use the Polars duration string language.
+#' About the language, see the `Polars duration string language` section for details.
+#'
+#' There may be a circumstance where R types are not sufficient to express a
+#' numeric tolerance. In that case, you can use the expression syntax like
+#' `tolerance = pl$lit(42)$cast(pl$Uint64)`
+#' @param coalesce Coalescing behavior (merging of `on` / `left_on` / `right_on`
+#' columns):
+#' * `TRUE`: Always coalesce join columns;
+#' * `FALSE`: Never coalesce join columns.
+#' Note that joining on any other expressions than `col` will turn off coalescing.
+#'
+#' @inheritSection polars_duration_string  Polars duration string language
+#' @examples #
+#' # create two LazyFrame to join asof
+#' gdp <- pl$LazyFrame(
+#'   date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")),
+#'   gdp = c(4321, 4164, 4411, 4566, 4696),
+#'   group = c("b", "a", "a", "b", "b")
+#' )
+#'
+#' pop <- pl$LazyFrame(
+#'   date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")),
+#'   population = c(82.19, 82.66, 83.12, 83.52),
+#'   group = c("b", "b", "a", "a")
+#' )
+#'
+#' # optional make sure tables are already sorted with "on" join-key
+#' gdp <- gdp$sort("date")
+#' pop <- pop$sort("date")
+#'
+#'
+#' # Left-join_asof LazyFrame pop with gdp on "date"
+#' # Look backward in gdp to find closest matching date
+#' pop$join_asof(gdp, on = "date", strategy = "backward")$collect()
+#'
+#' # .... and forward
+#' pop$join_asof(gdp, on = "date", strategy = "forward")$collect()
+#'
+#' # join by a group: "only look within groups"
+#' pop$join_asof(gdp, on = "date", by = "group", strategy = "backward")$collect()
+#'
+#' # only look 2 weeks and 2 days back
+#' pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d")$collect()
+#'
+#' # only look 11 days back (numeric tolerance depends on polars type, <date> is in days)
+#' pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)$collect()
+lazyframe__join_asof <- function(
+    other,
+    ...,
+    left_on = NULL,
+    right_on = NULL,
+    on = NULL,
+    by_left = NULL,
+    by_right = NULL,
+    by = NULL,
+    strategy = c("backward", "forward", "nearest"),
+    suffix = "_right",
+    tolerance = NULL,
+    allow_parallel = TRUE,
+    force_parallel = FALSE,
+    coalesce = TRUE) {
+  if (!is.null(by)) by_left <- by_right <- by
+  if (!is.null(on)) left_on <- right_on <- on
+  tolerance_str <- if (is.character(tolerance)) tolerance else NULL
+  tolerance_num <- if (!is.character(tolerance)) tolerance else NULL
+
+  self$`_rexpr`$join_asof(
+    lf = self,
+    other = other,
+    left_on = left_on,
+    right_on = right_on,
+    left_by = by_left,
+    right_by = by_right,
+    allow_parallel = allow_parallel,
+    force_parallel = force_parallel,
+    suffix = suffix,
+    strategy = strategy,
+    tolerance = tolerance_num,
+    tolerance_str = tolerance_str,
+    coalesce = coalesce
+  )
+}
+
+
+#' Unpivot a Frame from wide to long format
+#'
+#' @param on Values to use as identifier variables. If `value_vars` is
+#' empty all columns that are not in `id_vars` will be used.
+#' @param ... Not used.
+#' @param index Columns to use as identifier variables.
+#' @param variable_name Name to give to the new column containing the names of
+#' the melted columns. Defaults to "variable".
+#' @param value_name Name to give to the new column containing the values of
+#' the melted columns. Defaults to `"value"`.
+#'
+#' @details
+#' Optionally leaves identifiers set.
+#'
+#' This function is useful to massage a Frame into a format where one or more
+#' columns are identifier variables (id_vars), while all other columns, considered
+#' measured variables (value_vars), are "unpivoted" to the row axis, leaving just
+#' two non-identifier columns, 'variable' and 'value'.
+#'
+#'
+#'
+#' @return A LazyFrame
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = c("x", "y", "z"),
+#'   b = c(1, 3, 5),
+#'   c = c(2, 4, 6)
+#' )
+#' lf$unpivot(index = "a", on = c("b", "c"))$collect()
+lazyframe__unpivot <- function(
+    on = NULL,
+    ...,
+    index = NULL,
+    variable_name = NULL,
+    value_name = NULL) {
+  self$`_rexpr`$unpivot(
+    lf, on %||% character(), index %||% character(),
+    value_name, variable_name
+  ) |> unwrap("in $unpivot( ): ")
+}
+
+#' Rename column names of a LazyFrame
+#'
+#' @details
+#' If existing names are swapped (e.g. `A` points to `B` and `B` points to `A`),
+#' polars will block projection and predicate pushdowns at this node.
+#' @inherit pl_LazyFrame return
+#' @param ... One of the following:
+#' - Key value pairs that map from old name to new name, like `old_name = "new_name"`.
+#' - As above but with params wrapped in a list
+#' - An R function that takes the old names character vector as input and
+#'   returns the new names character vector.
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = 6:8,
+#'   ham = letters[1:3]
+#' )
+#'
+#' lf$rename(foo = "apple")$collect()
+#'
+#' lf$rename(
+#'   \(column_name) paste0("c", substr(column_name, 2, 100))
+#' )$collect()
+lazyframe__rename <- function(...) {
+  uw <- \(res) wrap({
+    res
+  })
+
+
+  if (!nargs()) {
+    Err_plain("No arguments provided for `$rename()`.") |>
+      uw()
+  }
+
+  mapping <- list2(...)
+  if (is.function(mapping[[1L]])) {
+    result({
+      existing <- names(self)
+      new <- mapping[[1L]](existing)
+    }) |>
+      uw()
+  } else {
+    if (is.list(mapping[[1L]])) {
+      mapping <- mapping[[1L]]
+    }
+    new <- unname(unlist(mapping))
+    existing <- names(mapping)
+  }
+  self$`_rexpr`$rename(existing, new) |>
+    uw()
+}
+
+#' Fetch `n` rows of a LazyFrame
+#'
+#' This is similar to `$collect()` but limit the number of rows to collect. It
+#' is mostly useful to check that a query works as expected.
+#'
+#'
+#' @details
+#' `$fetch()` does not guarantee the final number of rows in the DataFrame output.
+#' It only guarantees that `n` rows are used at the beginning of the query.
+#' Filters, join operations and a lower number of rows available in the scanned
+#' file influence the final number of rows.
+#'
+#' @param n_rows Integer. Maximum number of rows to fetch.
+#' @inheritParams lazyframe__collect
+#' @return A DataFrame of maximum n_rows
+#' @seealso
+#'  - [`$collect()`][lazyframe__collect] - regular collect.
+#'  - [`$profile()`][lazyframe__profile] - same as `$collect()` but also returns
+#'    a table with each operation profiled.
+#'  - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
+#'    collect returns a future handle. Can also just be used via
+#'    `$collect(collect_in_background = TRUE)`.
+#'  - [`$sink_parquet()`][lazyframe__sink_parquet()] streams query to a parquet file.
+#'  - [`$sink_ipc()`][lazyframe__sink_ipc()] streams query to a arrow file.
+#'
+#' @examples
+#' # fetch 3 rows
+#' pl$LazyFrame(iris)$fetch(3)
+#'
+#' # this fetch-query returns 4 rows, because we started with 3 and appended one
+#' # row in the query (see section 'Details')
+#' pl$LazyFrame(iris)$
+#'   select(pl$col("Species")$append("flora gigantica, alien"))$
+#'   fetch(3)
+lazyframe__fetch <- function(
+    n_rows = 500,
+    ...,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE,
+    no_optimization = FALSE) {
+  if (isTRUE(no_optimization)) {
+    predicate_pushdown <- FALSE
+    projection_pushdown <- FALSE
+    slice_pushdown <- FALSE
+    comm_subplan_elim <- FALSE
+    comm_subexpr_elim <- FALSE
+    cluster_with_columns <- FALSE
+  }
+
+  if (isTRUE(streaming)) {
+    comm_subplan_elim <- FALSE
+  }
+
+  lf <- self |>
+    self$`_rexpr`$optimization_toggle(
+      pe_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = comm_subplan_elim,
+      comm_subexpr_elim = comm_subexpr_elim,
+      cluster_with_columns = cluster_with_columns,
+      streaming = streaming,
+      eager = FALSE
+    )
+
+  self$`_rexpr`$fetch(n_rows)
+}
+
+#' Collect and profile a lazy query.
+#' @description This will run the query and return a list containing the
+#' materialized DataFrame and a DataFrame that contains profiling information
+#' of each node that is executed.
+#'
+#' @inheritParams lazyframe__collect
+#' @param show_plot Show a Gantt chart of the profiling result
+#' @param truncate_nodes Truncate the label lengths in the Gantt chart to this
+#' number of characters. If `0` (default), do not truncate.
+#'
+#' @details The units of the timings are microseconds.
+#'
+#'
+#' @return List of two `DataFrame`s: one with the collected result, the other
+#' with the timings of each step. If `show_graph = TRUE`, then the plot is
+#' also stored in the list.
+#' @seealso
+#'  - [`$collect()`][lazyframe__collect] - regular collect.
+#'  - [`$fetch()`][lazyframe__fetch] - fast limited query check
+#'  - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
+#'    collect returns a future handle. Can also just be used via
+#'    `$collect(collect_in_background = TRUE)`.
+#'  - [`$sink_parquet()`][lazyframe__sink_parquet()] streams query to a parquet file.
+#'  - [`$sink_ipc()`][lazyframe__sink_ipc()] streams query to a arrow file.
+#'
+#' @examples
+#' ## Simplest use case
+#' pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
+#'
+#' ## Use $profile() to compare two queries
+#'
+#' # -1-  map each Species-group with native polars, takes ~120us only
+#' pl$LazyFrame(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$first() + 5)$
+#'   profile()
+#'
+#' # -2-  map each Species-group of each numeric column with an R function, takes ~7000us (slow!)
+#'
+#' # some R function, prints `.` for each time called by polars
+#' r_func <- \(s) {
+#'   cat(".")
+#'   s$to_r()[1] + 5
+#' }
+#'
+#' pl$LazyFrame(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$map_elements(r_func))$
+#'   profile()
+lazyframe__profile <- function(
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE,
+    no_optimization = FALSE,
+    collect_in_background = FALSE,
+    show_plot = FALSE,
+    truncate_nodes = 0) {
+  if (isTRUE(no_optimization)) {
+    predicate_pushdown <- FALSE
+    projection_pushdown <- FALSE
+    slice_pushdown <- FALSE
+    comm_subplan_elim <- FALSE
+    comm_subexpr_elim <- FALSE
+    cluster_with_columns <- FALSE
+  }
+
+  if (isTRUE(streaming)) {
+    comm_subplan_elim <- FALSE
+  }
+
+  lf <- self |>
+    self$`_rexpr`$optimization_toggle(
+      pe_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = comm_subplan_elim,
+      comm_subexpr_elim = comm_subexpr_elim,
+      cluster_with_columns = cluster_with_columns,
+      streaming = streaming,
+      eager = FALSE
+    )
+
+  out <- lf |>
+    self$`_rexpr`$profile() >
+    unwrap("in $profile()")
+
+  if (isTRUE(show_plot)) {
+    out[["plot"]] <- make_profile_plot(out, truncate_nodes) |>
+      result() |>
+      unwrap("in $profile()")
+  }
+
+  out
+}
+
+#' Explode columns containing a list of values
+#' @description This will take every element of a list column and add it on an
+#' additional row.
+#'
+#'
+#'
+#' @param ... Column(s) to be exploded as individual `Into<Expr>` or list/vector
+#' of `Into<Expr>`. In a handful of places in rust-polars, only the plain variant
+#' `Expr::Column` is accepted. This is currenly one of such places. Therefore
+#' `pl$col("name")` and `pl$all()` is allowed, not `pl$col("name")$alias("newname")`.
+#' `"name"` is implicitly converted to `pl$col("name")`.
+#'
+#' @details
+#' Only columns of DataType `List` or `Array` can be exploded.
+#'
+#' Named expressions like `$explode(a = pl$col("b"))` will not implicitly trigger
+#' `$alias("a")` here, due to only variant `Expr::Column` is supported in
+#' rust-polars.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' df <- pl$LazyFrame(
+#'   letters = c("aa", "aa", "bb", "cc"),
+#'   numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)),
+#'   numbers_2 = list(0, c(1, 2), c(3, 4), c(5, 6, 7)) # same structure as numbers
+#' )
+#' df
+#'
+#' # explode a single column, append others
+#' df$explode("numbers")$collect()
+#'
+#' # explode two columns of same nesting structure, by names or the common dtype
+#' # "List(Float64)"
+#' df$explode("numbers", "numbers_2")$collect()
+#' df$explode(pl$col(pl$List(pl$Float64)))$collect()
+lazyframe__explode <- function(...) {
+  dotdotdot_args <- unpack_list(..., .context = "in explode():")
+  self$`_rexpr`$explode(dotdotdot_args)
+}
+
+#' Clone a LazyFrame
+#'
+#' This makes a very cheap deep copy/clone of an existing
+#' [`LazyFrame`][lazyframe__class]. Rarely useful as `LazyFrame`s are nearly 100%
+#' immutable. Any modification of a `LazyFrame` should lead to a clone anyways,
+#' but this can be useful when dealing with attributes (see examples).
+#'
+#'
+#' @return A LazyFrame
+#' @examples
+#' df1 <- pl$LazyFrame(iris)
+#'
+#' # Make a function to take a LazyFrame, add an attribute, and return a LazyFrame
+#' give_attr <- function(data) {
+#'   attr(data, "created_on") <- "2024-01-29"
+#'   data
+#' }
+#' df2 <- give_attr(df1)
+#'
+#' # Problem: the original LazyFrame also gets the attribute while it shouldn't!
+#' attributes(df1)
+#'
+#' # Use $clone() inside the function to avoid that
+#' give_attr <- function(data) {
+#'   data <- data$clone()
+#'   attr(data, "created_on") <- "2024-01-29"
+#'   data
+#' }
+#' df1 <- pl$LazyFrame(iris)
+#' df2 <- give_attr(df1)
+#'
+#' # now, the original LazyFrame doesn't get this attribute
+#' attributes(df1)
+lazyframe__clone <- function() {
+  self$`_rexpr`$clone_in_rust()
+}
+
+
+#' Unnest the Struct columns of a LazyFrame
+#'
+#' @inheritParams DataFrame_unnest
+#'
+#' @return A LazyFrame where some or all columns of datatype Struct are unnested.
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = 1:5,
+#'   b = c("one", "two", "three", "four", "five"),
+#'   c = 6:10
+#' )$
+#'   select(
+#'   pl$struct("b"),
+#'   pl$struct(c("a", "c"))$alias("a_and_c")
+#' )
+#' lf$collect()
+#'
+#' # by default, all struct columns are unnested
+#' lf$unnest()$collect()
+#'
+#' # we can specify specific columns to unnest
+#' lf$unnest("a_and_c")$collect()
+lazyframe__unnest <- function(...) {
+  columns <- unpack_list(..., .context = "in $unnest():")
+  if (length(columns) == 0) {
+    columns <- names(which(dtypes_are_struct(self$`_rexpr`$schema(ok))))
+  } else {
+    columns <- unlist(columns)
+  }
+  wrap({
+    self$`_rexpr`$unnest(columns)
+  })
+}
+
+#' Add an external context to the computation graph
+#'
+#' This allows expressions to also access columns from DataFrames or LazyFrames
+#' that are not part of this one.
+#'
+#' @param other Data/LazyFrame to have access to. This can be a list of DataFrames
+#' and LazyFrames.
+#' @return A LazyFrame
+#'
+#' @examples
+#' lf <- pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA))
+#' lf_other <- pl$LazyFrame(c = c("foo", "ham"))
+#'
+#' lf$with_context(lf_other)$select(
+#'   pl$col("b") + pl$col("c")$first()
+#' )$collect()
+#'
+#' # Fill nulls with the median from another lazyframe:
+#' train_lf <- pl$LazyFrame(
+#'   feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1)
+#' )
+#' test_lf <- pl$LazyFrame(
+#'   feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1)
+#' )
+#'
+#' test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select(
+#'   pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median())
+#' )$collect()
+lazyframe__with_context <- function(other) {
+  self$`_rexpr`$with_context(other)
+}
+
+
+#' Create rolling groups based on a date/time or integer column
+#'
+#' @inherit Expr_rolling description details params
+#' @param index_column Column used to group based on the time window. Often of
+#' type Date/Datetime. This column must be sorted in ascending order (or, if `by`
+#' is specified, then it must be sorted in ascending order within each group). In
+#' case of a rolling group by on indices, dtype needs to be either Int32 or Int64.
+#' Note that Int32 gets temporarily cast to Int64, so if performance matters use
+#' an Int64 column.
+#' @param group_by Also group by this column/these columns.
+#'
+#' @inheritSection polars_duration_string  Polars duration string language
+#' @return A [LazyGroupBy][LazyGroupBy_class] object
+#' @seealso
+#' - [`<LazyFrame>$group_by_dynamic()`][lazyframe__group_by_dynamic]
+#' @examples
+#' dates <- c(
+#'   "2020-01-01 13:45:48",
+#'   "2020-01-01 16:42:13",
+#'   "2020-01-01 16:45:09",
+#'   "2020-01-02 18:12:48",
+#'   "2020-01-03 19:45:32",
+#'   "2020-01-08 23:16:43"
+#' )
+#'
+#' df <- pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns(
+#'   pl$col("dt")$str$strptime(pl$Datetime())$set_sorted()
+#' )
+#'
+#' df$rolling(index_column = "dt", period = "2d")$agg(
+#'   sum_a = pl$sum("a"),
+#'   min_a = pl$min("a"),
+#'   max_a = pl$max("a")
+#' )$collect()
+lazyframe__rolling <- function(
+    index_column,
+    ...,
+    period,
+    offset = NULL,
+    closed = "right",
+    group_by = NULL) {
+  period <- parse_as_polars_duration_string(period)
+  offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(period)
+  self$`_rexpr`$rolling(
+    lf, index_column, period, offset, closed,
+    wrap_elist_result(group_by, str_to_lit = FALSE)
+  )
+}
+
+
+#' Group based on a date/time or integer column
+#'
+#' @inherit lazyframe__rolling description details params
+#'
+#' @param every Interval of the window.
+#' @param include_boundaries Add two columns `"_lower_boundary"` and
+#' `"_upper_boundary"` columns that show the boundaries of the window. This will
+#' impact performance because it’s harder to parallelize.
+#' @param label Define which label to use for the window:
+#' * `"left"`: lower boundary of the window
+#' * `"right"`: upper boundary of the window
+#' * `"datapoint"`: the first value of the index column in the given window. If
+#' you don’t need the label to be at one of the boundaries, choose this option
+#' for maximum performance.
+#' @param start_by The strategy to determine the start of the first window by:
+#' * `"window"`: start by taking the earliest timestamp, truncating it with `every`,
+#'   and then adding `offset`. Note that weekly windows start on Monday.
+#' * `"datapoint"`: start from the first encountered data point.
+#' * a day of the week (only takes effect if `every` contains `"w"`): `"monday"`
+#'   starts the window on the Monday before the first data point, etc.
+#'
+#' @return A [LazyGroupBy][LazyGroupBy_class] object
+#' @seealso
+#' - [`<LazyFrame>$rolling()`][lazyframe__rolling]
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   time = pl$datetime_range(
+#'     start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
+#'     end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
+#'     interval = "30m"
+#'   ),
+#'   n = 0:6
+#' )
+#' lf$collect()
+#'
+#' # get the sum in the following hour relative to the "time" column
+#' lf$group_by_dynamic("time", every = "1h")$agg(
+#'   vals = pl$col("n"),
+#'   sum = pl$col("n")$sum()
+#' )$collect()
+#'
+#' # using "include_boundaries = TRUE" is helpful to see the period considered
+#' lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg(
+#'   vals = pl$col("n")
+#' )$collect()
+#'
+#' # in the example above, the values didn't include the one *exactly* 1h after
+#' # the start because "closed = 'left'" by default.
+#' # Changing it to "right" includes values that are exactly 1h after. Note that
+#' # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00],
+#' # even if this interval wasn't there originally
+#' lf$group_by_dynamic("time", every = "1h", closed = "right")$agg(
+#'   vals = pl$col("n")
+#' )$collect()
+#' # To keep both boundaries, we use "closed = 'both'". Some values now belong to
+#' # several groups:
+#' lf$group_by_dynamic("time", every = "1h", closed = "both")$agg(
+#'   vals = pl$col("n")
+#' )$collect()
+#'
+#' # Dynamic group bys can also be combined with grouping on normal keys
+#' lf <- lf$with_columns(
+#'   groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a"))
+#' )
+#' lf$collect()
+#'
+#' lf$group_by_dynamic(
+#'   "time",
+#'   every = "1h",
+#'   closed = "both",
+#'   group_by = "groups",
+#'   include_boundaries = TRUE
+#' )$agg(pl$col("n"))$collect()
+#'
+#' # We can also create a dynamic group by based on an index column
+#' lf <- pl$LazyFrame(
+#'   idx = 0:5,
+#'   A = c("A", "A", "B", "B", "B", "C")
+#' )$with_columns(pl$col("idx")$set_sorted())
+#' lf$collect()
+#'
+#' lf$group_by_dynamic(
+#'   "idx",
+#'   every = "2i",
+#'   period = "3i",
+#'   include_boundaries = TRUE,
+#'   closed = "right"
+#' )$agg(A_agg_list = pl$col("A"))$collect()
+lazyframe__group_by_dynamic <- function(
+    index_column,
+    ...,
+    every,
+    period = NULL,
+    offset = NULL,
+    include_boundaries = FALSE,
+    closed = "left",
+    label = "left",
+    group_by = NULL,
+    start_by = "window") {
+  every <- parse_as_polars_duration_string(every)
+  offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(every)
+  period <- parse_as_polars_duration_string(period) %||% every
+
+  self$`_rexpr`$group_by_dynamic(
+    lf, index_column, every, period, offset, label, include_boundaries, closed,
+    wrap_elist_result(group_by, str_to_lit = FALSE), start_by
+  )
+}
+
+#' Plot the query plan
+#'
+#' This only returns the "dot" output that can be passed to other packages, such
+#' as `DiagrammeR::grViz()`.
+#'
+#' @param ... Not used..
+#' @param optimized Optimize the query plan.
+#' @inheritParams lazyframe__explain
+#'
+#' @return A character vector
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = c("a", "b", "a", "b", "b", "c"),
+#'   b = 1:6,
+#'   c = 6:1
+#' )
+#'
+#' query <- lf$group_by("a", maintain_order = TRUE)$agg(
+#'   pl$all()$sum()
+#' )$sort(
+#'   "a"
+#' )
+#'
+#' query$to_dot() |> cat()
+#'
+#' # You could print the graph by using DiagrammeR for example, with
+#' # query$to_dot() |> DiagrammeR::grViz().
+lazyframe__to_dot <- function(
+    ...,
+    optimized = TRUE,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE) {
+  lf <- self |>
+    self$`_rexpr`$optimization_toggle(
+      pe_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = comm_subplan_elim,
+      comm_subexpr_elim = comm_subexpr_elim,
+      cluster_with_columns = cluster_with_columns,
+      streaming = streaming,
+      eager = FALSE
+    )
+
+  self$`_rexpr`$to_dot(optimized)
+}
+
+#' Create an empty or n-row null-filled copy of the LazyFrame
+#'
+#' Returns a n-row null-filled LazyFrame with an identical schema. `n` can be
+#' greater than the current number of rows in the LazyFrame.
+#'
+#' @inheritParams DataFrame_clear
+#'
+#' @return A n-row null-filled LazyFrame with an identical schema
+#'
+#' @examples
+#' df <- pl$LazyFrame(
+#'   a = c(NA, 2, 3, 4),
+#'   b = c(0.5, NA, 2.5, 13),
+#'   c = c(TRUE, TRUE, FALSE, NA)
+#' )
+#'
+#' df$clear()
+#'
+#' df$clear(n = 5)
+lazyframe__clear <- function(n = 0) {
+  pl$DataFrame(schema = self$schema)$clear(n)$lazy()
+}
+
+
+# TODO: we can't use % in the SQL query
+# <https://github.com/r-lib/roxygen2/issues/1616>
+#' Execute a SQL query against the LazyFrame
+#'
+#' The calling frame is automatically registered as a table in the SQL context
+#' under the name `"self"`. All [DataFrames][DataFrame_class] and
+#' [LazyFrames][lazyframe__class] found in the `envir` are also registered,
+#' using their variable name.
+#' More control over registration and execution behaviour is available by
+#' the [SQLContext][SQLContext_class] object.
+#'
+#' This functionality is considered **unstable**, although it is close to
+#' being considered stable. It may be changed at any point without it being
+#' considered a breaking change.
+#' @inherit pl_LazyFrame return
+#' @inheritParams SQLContext_execute
+#' @inheritParams SQLContext_register_globals
+#' @param table_name `NULL` (default) or a character of an explicit name for the table
+#' that represents the calling frame (the alias `"self"` will always be registered/available).
+#' @seealso
+#' - [SQLContext][SQLContext_class]
+#' @examplesIf polars_info()$features$sql
+#' lf1 <- pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x"))
+#' lf2 <- pl$LazyFrame(a = 3:1, d = c(125, -654, 888))
+#'
+#' # Query the LazyFrame using SQL:
+#' lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect()
+#'
+#' # Join two LazyFrames:
+#' lf1$sql(
+#'   "
+#' SELECT self.*, d
+#' FROM self
+#' INNER JOIN lf2 USING (a)
+#' WHERE a > 1 AND b < 8
+#' "
+#' )$collect()
+#'
+#' # Apply SQL transforms (aliasing "self" to "frame") and subsequently
+#' # filter natively (you can freely mix SQL and native operations):
+#' lf1$sql(
+#'   query = r"(
+#' SELECT
+#'  a,
+#' MOD(a, 2) == 0 AS a_is_even,
+#' (b::float / 2) AS 'b/2',
+#' CONCAT_WS(':', c, c, c) AS c_c_c
+#' FROM frame
+#' ORDER BY a
+#' )",
+#'   table_name = "frame"
+#' )$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect()
+lazyframe__sql <- function(query, ..., table_name = NULL, envir = parent.frame()) {
+  result({
+    ctx <- pl$SQLContext()$register_globals(envir = envir)$register("self", self)
+
+    if (!is.null(table_name)) {
+      ctx$register(table_name, self)
+    }
+
+    ctx$execute(query)
+  })
+}
+
+
+#' Take every nth row in the LazyFrame
+#'
+#' @param n Gather every `n`-th row.
+#' @param offset Starting index.
+#'
+#' @return A LazyFrame
+#'
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:4, b = 5:8)
+#' lf$gather_every(2)$collect()
+#'
+#' lf$gather_every(2, offset = 1)$collect()
+lazyframe__gather_every <- function(n, offset = 0) {
+  self$select(pl$col("*")$gather_every(n, offset))
+}
+
+
+#' Cast LazyFrame column(s) to the specified dtype
+#'
+#' This allows to convert all columns to a datatype or to convert only specific
+#' columns. Contrarily to the Python implementation, it is not possible to
+#' convert all columns of a specific datatype to another datatype.
+#'
+#' @param dtypes Either a datatype or a list where the names are column names and
+#' the values are the datatypes to convert to.
+#' @param ... Ignored.
+#' @param strict If `TRUE` (default), throw an error if a cast could not be done
+#' (for instance, due to an overflow). Otherwise, return `null`.
+#'
+#' @return A LazyFrame
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = c(6, 7, 8),
+#'   ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06"))
+#' )
+#'
+#' # Cast only some columns
+#' lf$cast(list(foo = pl$Float32, bar = pl$UInt8))$collect()
+#'
+#' # Cast all columns to the same type
+#' lf$cast(pl$String)$collect()
+lazyframe__cast <- function(dtypes, ..., strict = TRUE) {
+  if (!is.list(dtypes)) {
+    self$`_rexpr`$cast_all(dtype = dtypes, strict = strict) |>
+      unwrap("in $cast():")
+  } else {
+    self$`_rexpr`$cast(dtypes = dtypes, strict = strict) |>
+      unwrap("in $cast():")
+  }
+}
diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml
index cbae5b3c..e922dbed 100644
--- a/src/rust/Cargo.toml
+++ b/src/rust/Cargo.toml
@@ -29,16 +29,20 @@ features = [
     "array_any_all",
     "array_count",
     "array_to_struct",
+    "asof_join",
     "binary_encoding",
     "business",
     "concat_str",
     "cse",
     "diff",
+    "dot_diagram",
     "dtype-full",
+    "dynamic_group_by",
     "extract_groups",
     "extract_jsonpath",
     "find_many",
     "fused",
+    "ipc",
     "is_in",
     "json",
     "lazy",
@@ -50,6 +54,7 @@ features = [
     "list_sample",
     "list_sets",
     "list_to_struct",
+    "merge_sorted",
     "meta",
     "month_start",
     "month_end",
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 7877a84e..86ba11af 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -1,6 +1,5 @@
-use crate::{
-    prelude::*, PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame, PlRLazyGroupBy, RPolarsErr,
-};
+use super::*;
+use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame, PlRLazyGroupBy, RPolarsErr};
 use savvy::{savvy, ListSexp, LogicalSexp, NumericScalar, OwnedStringSexp, Result, Sexp};
 
 #[savvy]
@@ -47,7 +46,7 @@ impl PlRLazyFrame {
         cluster_with_columns: bool,
         streaming: bool,
         _eager: bool,
-    ) -> Result<Self> {
+    ) -> Result<PlRLazyFrame> {
         let ldf = self
             .ldf
             .clone()
@@ -65,12 +64,12 @@ impl PlRLazyFrame {
         Ok(ldf.into())
     }
 
-    fn filter(&mut self, predicate: &PlRExpr) -> Result<Self> {
+    fn filter(&mut self, predicate: &PlRExpr) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         Ok(ldf.filter(predicate.inner.clone()).into())
     }
 
-    fn select(&mut self, exprs: ListSexp) -> Result<Self> {
+    fn select(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
         Ok(ldf.select(exprs).into())
@@ -128,7 +127,7 @@ impl PlRLazyFrame {
         Ok(df.into())
     }
 
-    fn slice(&self, offset: NumericScalar, len: Option<NumericScalar>) -> Result<Self> {
+    fn slice(&self, offset: NumericScalar, len: Option<NumericScalar>) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let offset = <Wrap<i64>>::try_from(offset)?.0;
         let len = len
@@ -138,13 +137,13 @@ impl PlRLazyFrame {
         Ok(ldf.slice(offset, len.unwrap_or(u32::MAX)).into())
     }
 
-    fn tail(&self, n: NumericScalar) -> Result<Self> {
+    fn tail(&self, n: NumericScalar) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let n = <Wrap<u32>>::try_from(n)?.0;
         Ok(ldf.tail(n).into())
     }
 
-    fn drop(&self, columns: ListSexp, strict: bool) -> Result<Self> {
+    fn drop(&self, columns: ListSexp, strict: bool) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let columns = <Wrap<Vec<Expr>>>::from(columns).0;
         if strict {
@@ -154,14 +153,14 @@ impl PlRLazyFrame {
         }
     }
 
-    fn cast(&self, dtypes: ListSexp, strict: bool) -> Result<Self> {
+    fn cast(&self, dtypes: ListSexp, strict: bool) -> Result<PlRLazyFrame> {
         let dtypes = <Wrap<Vec<Field>>>::try_from(dtypes)?.0;
         let mut cast_map = PlHashMap::with_capacity(dtypes.len());
         cast_map.extend(dtypes.iter().map(|f| (f.name.as_ref(), f.dtype.clone())));
         Ok(self.ldf.clone().cast(cast_map, strict).into())
     }
 
-    fn cast_all(&self, dtype: &PlRDataType, strict: bool) -> Result<Self> {
+    fn cast_all(&self, dtype: &PlRDataType, strict: bool) -> Result<PlRLazyFrame> {
         Ok(self.ldf.clone().cast_all(dtype.dt.clone(), strict).into())
     }
 
@@ -172,7 +171,7 @@ impl PlRLazyFrame {
         nulls_last: LogicalSexp,
         maintain_order: bool,
         multithreaded: bool,
-    ) -> Result<Self> {
+    ) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let by = <Wrap<Vec<Expr>>>::from(by).0;
         Ok(ldf
@@ -188,9 +187,605 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn with_columns(&mut self, exprs: ListSexp) -> Result<Self> {
+    fn with_columns(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
         Ok(ldf.with_columns(exprs).into())
     }
+
+    fn to_dot(&self, optimized: bool) -> Result<String> {
+        let result = self.ldf.to_dot(optimized).map_err(RPolarsErr::from)?;
+        Ok(result)
+    }
+
+    fn sort(
+        &self,
+        by_column: &str,
+        descending: bool,
+        nulls_last: bool,
+        maintain_order: bool,
+        multithreaded: bool,
+    ) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf
+            .sort(
+                [by_column],
+                SortMultipleOptions {
+                    descending: vec![descending],
+                    nulls_last: vec![nulls_last],
+                    multithreaded,
+                    maintain_order,
+                },
+            )
+            .into())
+    }
+
+    fn top_k(&self, k: IdxSize, by: ListSexp, reverse: LogicalSexp) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let exprs = <Wrap<Vec<Expr>>>::from(by).0;
+        let reverse = reverse.to_vec();
+        Ok(ldf
+            .top_k(
+                k,
+                exprs,
+                SortMultipleOptions::new().with_order_descending_multi(reverse),
+            )
+            .into())
+    }
+
+    fn bottom_k(&self, k: IdxSize, by: ListSexp, reverse: LogicalSexp) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let exprs = <Wrap<Vec<Expr>>>::from(by).0;
+        let reverse = reverse.to_vec();
+        Ok(ldf
+            .bottom_k(
+                k,
+                exprs,
+                SortMultipleOptions::new().with_order_descending_multi(reverse),
+            )
+            .into())
+    }
+
+    fn cache(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.cache().into())
+    }
+
+    // fn profile(&self, py: Python) -> Result<(PlRDataFrame, PlRDataFrame)> {
+    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
+    //     // threads we deadlock.
+    //     let (df, time_df) = py.allow_threads(|| {
+    //         let ldf = self.ldf.clone();
+    //         ldf.profile().map_err(RPolarsErr::from)
+    //     })?;
+    //     Ok((df.into(), time_df.into()))
+    // }
+
+    //     fn collect_with_callback(&self, lambda: PyObject) {
+    //         let ldf = self.ldf.clone();
+
+    //         polars_core::POOL.spawn(move || {
+    //             let result = ldf
+    //                 .collect()
+    //                 .map(PlRDataFrame::new)
+    //                 .map_err(RPolarsErr::from);
+
+    //             Python::with_gil(|py| match result {
+    //                 Ok(df) => {
+    //                     lambda.call1(py, (df,)).map_err(|err| err.restore(py)).ok();
+    //                 }
+    //                 Err(err) => {
+    //                     lambda
+    //                         .call1(py, (PyErr::from(err).to_object(py),))
+    //                         .map_err(|err| err.restore(py))
+    //                         .ok();
+    //                 }
+    //             });
+    //         });
+    //     }
+
+    // fn sink_parquet(
+    //     &self,
+    //     py: Python,
+    //     path: PathBuf,
+    //     compression: &str,
+    //     compression_level: Option<i32>,
+    //     statistics: Wrap<StatisticsOptions>,
+    //     row_group_size: Option<usize>,
+    //     data_page_size: Option<usize>,
+    //     maintain_order: bool,
+    // ) -> Result<()> {
+    //     let compression = parse_parquet_compression(compression, compression_level)?;
+
+    //     let options = ParquetWriteOptions {
+    //         compression,
+    //         statistics: statistics.0,
+    //         row_group_size,
+    //         data_page_size,
+    //         maintain_order,
+    //     };
+
+    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
+    //     // threads we deadlock.
+    //     py.allow_threads(|| {
+    //         let ldf = self.ldf.clone();
+    //         ldf.sink_parquet(path, options).map_err(RPolarsErr::from)
+    //     })?;
+    //     Ok(())
+    // }
+
+    // fn sink_ipc(
+    //     &self,
+    //     py: Python,
+    //     path: PathBuf,
+    //     compression: Option<Wrap<IpcCompression>>,
+    //     maintain_order: bool,
+    // ) -> Result<()> {
+    //     let options = IpcWriterOptions {
+    //         compression: compression.map(|c| c.0),
+    //         maintain_order,
+    //     };
+
+    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
+    //     // threads we deadlock.
+    //     py.allow_threads(|| {
+    //         let ldf = self.ldf.clone();
+    //         ldf.sink_ipc(path, options).map_err(RPolarsErr::from)
+    //     })?;
+    //     Ok(())
+    // }
+
+    // fn sink_csv(
+    //     &self,
+    //     py: Python,
+    //     path: PathBuf,
+    //     include_bom: bool,
+    //     include_header: bool,
+    //     separator: u8,
+    //     line_terminator: String,
+    //     quote_char: u8,
+    //     batch_size: NonZeroUsize,
+    //     datetime_format: Option<String>,
+    //     date_format: Option<String>,
+    //     time_format: Option<String>,
+    //     float_scientific: Option<bool>,
+    //     float_precision: Option<usize>,
+    //     null_value: Option<String>,
+    //     quote_style: Option<Wrap<QuoteStyle>>,
+    //     maintain_order: bool,
+    // ) -> Result<()> {
+    //     let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
+    //     let null_value = null_value.unwrap_or(SerializeOptions::default().null);
+
+    //     let serialize_options = SerializeOptions {
+    //         date_format,
+    //         time_format,
+    //         datetime_format,
+    //         float_scientific,
+    //         float_precision,
+    //         separator,
+    //         quote_char,
+    //         null: null_value,
+    //         line_terminator,
+    //         quote_style,
+    //     };
+
+    //     let options = CsvWriterOptions {
+    //         include_bom,
+    //         include_header,
+    //         maintain_order,
+    //         batch_size,
+    //         serialize_options,
+    //     };
+
+    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
+    //     // threads we deadlock.
+    //     py.allow_threads(|| {
+    //         let ldf = self.ldf.clone();
+    //         ldf.sink_csv(path, options).map_err(RPolarsErr::from)
+    //     })?;
+    //     Ok(())
+    // }
+
+    //     fn sink_json(&self, py: Python, path: PathBuf, maintain_order: bool) -> Result<()> {
+    //         let options = JsonWriterOptions { maintain_order };
+
+    //         // if we don't allow threads and we have udfs trying to acquire the gil from different
+    //         // threads we deadlock.
+    //         py.allow_threads(|| {
+    //             let ldf = self.ldf.clone();
+    //             ldf.sink_json(path, options).map_err(RPolarsErr::from)
+    //         })?;
+    //         Ok(())
+    //     }
+
+    fn fetch(&self, py: Python, n_rows: NumericScalar) -> Result<PlRDataFrame> {
+        let ldf = self.ldf.clone();
+        let n_rows = <Wrap<usize>>::try_from(n_rows)?.0;
+        let df = py.allow_threads(|| ldf.fetch(n_rows).map_err(RPolarsErr::from))?;
+        Ok(df.into())
+    }
+
+    fn select_seq(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
+        Ok(ldf.select_seq(exprs).into())
+    }
+
+    fn rolling(
+        &mut self,
+        index_column: &PlRExpr,
+        period: &str,
+        offset: &str,
+        closed: &str,
+        by: ListSexp,
+    ) -> Result<PlRLazyGroupBy> {
+        let closed_window = <Wrap<ClosedWindow>>::try_from(closed)?.0;
+        let ldf = self.ldf.clone();
+        let by = <Wrap<Vec<Expr>>>::from(by).0;
+        let lazy_gb = ldf.rolling(
+            index_column.inner,
+            by,
+            RollingGroupOptions {
+                index_column: "".into(),
+                period: Duration::try_parse(period).map_err(RPolarsErr::from)?,
+                offset: Duration::try_parse(offset).map_err(RPolarsErr::from)?,
+                closed_window,
+            },
+        );
+
+        Ok(PlRLazyGroupBy { lgb: Some(lazy_gb) })
+    }
+
+    fn group_by_dynamic(
+        &mut self,
+        index_column: &PlRExpr,
+        every: &str,
+        period: &str,
+        offset: &str,
+        label: Wrap<Label>,
+        include_boundaries: bool,
+        closed: &str,
+        group_by: ListSexp,
+        start_by: Wrap<StartBy>,
+    ) -> Result<PlRLazyGroupBy> {
+        let closed_window = <Wrap<ClosedWindow>>::try_from(closed)?.0;
+        let group_by = <Wrap<Vec<Expr>>>::from(group_by).0;
+        let ldf = self.ldf.clone();
+        let lazy_gb = ldf.group_by_dynamic(
+            index_column.inner,
+            group_by,
+            DynamicGroupOptions {
+                every: Duration::try_parse(every).map_err(RPolarsErr::from)?,
+                period: Duration::try_parse(period).map_err(RPolarsErr::from)?,
+                offset: Duration::try_parse(offset).map_err(RPolarsErr::from)?,
+                label: label.0,
+                include_boundaries,
+                closed_window,
+                start_by: start_by.0,
+                ..Default::default()
+            },
+        );
+
+        Ok(PlRLazyGroupBy { lgb: Some(lazy_gb) })
+    }
+
+    fn with_context(&self, contexts: Vec<Self>) -> Result<PlRLazyFrame> {
+        let contexts = contexts.into_iter().map(|ldf| ldf.ldf).collect::<Vec<_>>();
+        Ok(self.ldf.clone().with_context(contexts).into())
+    }
+
+    // fn join_asof(
+    //     &self,
+    //     other: &PlRLazyFrame,
+    //     left_on: &PlRExpr,
+    //     right_on: &PlRExpr,
+    //     left_by: Option<Vec<&str>>,
+    //     right_by: Option<Vec<&str>>,
+    //     allow_parallel: bool,
+    //     force_parallel: bool,
+    //     suffix: String,
+    //     strategy: Wrap<AsofStrategy>,
+    //     tolerance: Option<Wrap<AnyValue<'_>>>,
+    //     tolerance_str: Option<String>,
+    //     coalesce: bool,
+    // ) -> Result<PlRLazyFrame> {
+    //     let coalesce = if coalesce {
+    //         JoinCoalesce::CoalesceColumns
+    //     } else {
+    //         JoinCoalesce::KeepColumns
+    //     };
+    //     let ldf = self.ldf.clone();
+    //     let other = other.ldf;
+    //     let left_on = left_on.inner;
+    //     let right_on = right_on.inner;
+    //     Ok(ldf
+    //         .join_builder()
+    //         .with(other)
+    //         .left_on([left_on])
+    //         .right_on([right_on])
+    //         .allow_parallel(allow_parallel)
+    //         .force_parallel(force_parallel)
+    //         .coalesce(coalesce)
+    //         .how(JoinType::AsOf(AsOfOptions {
+    //             strategy: strategy.0,
+    //             left_by: left_by.map(strings_to_pl_smallstr),
+    //             right_by: right_by.map(strings_to_pl_smallstr),
+    //             tolerance: tolerance.map(|t| t.0.into_static()),
+    //             tolerance_str: tolerance_str.map(|s| s.into()),
+    //         }))
+    //         .suffix(suffix)
+    //         .finish()
+    //         .into())
+    // }
+
+    fn join(
+        &self,
+        other: &PlRLazyFrame,
+        left_on: ListSexp,
+        right_on: ListSexp,
+        allow_parallel: bool,
+        force_parallel: bool,
+        join_nulls: bool,
+        how: Wrap<JoinType>,
+        suffix: String,
+        validate: Wrap<JoinValidation>,
+        coalesce: Option<bool>,
+    ) -> Result<PlRLazyFrame> {
+        let coalesce = match coalesce {
+            None => JoinCoalesce::JoinSpecific,
+            Some(true) => JoinCoalesce::CoalesceColumns,
+            Some(false) => JoinCoalesce::KeepColumns,
+        };
+        let ldf = self.ldf.clone();
+        let other = other.ldf;
+        let left_on = left_on
+            .into_iter()
+            .map(|pyexpr| pyexpr.inner)
+            .collect::<Vec<_>>();
+        let right_on = right_on
+            .into_iter()
+            .map(|pyexpr| pyexpr.inner)
+            .collect::<Vec<_>>();
+
+        Ok(ldf
+            .join_builder()
+            .with(other)
+            .left_on(left_on)
+            .right_on(right_on)
+            .allow_parallel(allow_parallel)
+            .force_parallel(force_parallel)
+            .join_nulls(join_nulls)
+            .how(how.0)
+            .coalesce(coalesce)
+            .validate(validate.0)
+            .suffix(suffix)
+            .finish()
+            .into())
+    }
+
+    fn join_where(
+        &self,
+        other: &PlRLazyFrame,
+        predicates: ListSexp,
+        suffix: String,
+    ) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let other = other.ldf;
+
+        let predicates = <Wrap<Vec<Expr>>>::from(predicates).0;
+
+        Ok(ldf
+            .join_builder()
+            .with(other)
+            .suffix(suffix)
+            .join_where(predicates)
+            .into())
+    }
+
+    fn with_columns_seq(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
+        Ok(ldf.with_columns_seq(exprs).into())
+    }
+
+    fn rename(
+        &mut self,
+        existing: Vec<String>,
+        new: Vec<String>,
+        strict: bool,
+    ) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.rename(existing, new, strict).into())
+    }
+
+    fn reverse(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.reverse().into())
+    }
+
+    fn shift(&self, n: &PlRExpr, fill_value: Option<&PlRExpr>) -> Result<PlRLazyFrame> {
+        let lf = self.ldf.clone();
+        let out = match fill_value {
+            Some(v) => lf.shift_and_fill(n.inner, v.inner),
+            None => lf.shift(n.inner),
+        };
+        Ok(out.into())
+    }
+
+    fn fill_nan(&self, fill_value: &PlRExpr) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        ldf.fill_nan(fill_value.inner).into()
+    }
+
+    fn min(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.min();
+        Ok(out.into())
+    }
+
+    fn max(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.max();
+        Ok(out.into())
+    }
+
+    fn sum(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.sum();
+        Ok(out.into())
+    }
+
+    fn mean(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.mean();
+        Ok(out.into())
+    }
+
+    fn std(&self, ddof: u8) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.std(ddof);
+        Ok(out.into())
+    }
+
+    fn var(&self, ddof: u8) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.var(ddof);
+        Ok(out.into())
+    }
+
+    fn median(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.median();
+        Ok(out.into())
+    }
+
+    fn quantile(
+        &self,
+        quantile: &PlRExpr,
+        interpolation: Wrap<QuantileMethod>,
+    ) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let out = ldf.quantile(quantile.inner, interpolation.0);
+        Ok(out.into())
+    }
+
+    fn explode(&self, column: ListSexp) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let column = <Wrap<Vec<Expr>>>::from(column).0;
+        Ok(ldf.explode(column).into())
+    }
+
+    fn null_count(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.null_count().into())
+    }
+
+    fn unique(
+        &self,
+        maintain_order: bool,
+        keep: Wrap<UniqueKeepStrategy>,
+        subset: Option<ListSexp>,
+    ) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let subset = subset.map(|e| <Wrap<Vec<Expr>>>::from(e).0);
+        let out = match maintain_order {
+            true => ldf.unique_stable_generic(subset, keep.0),
+            false => ldf.unique_generic(subset, keep.0),
+        };
+        Ok(out.into())
+    }
+
+    fn drop_nulls(&self, subset: Option<ListSexp>) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        let subset = subset.map(|e| <Wrap<Vec<Expr>>>::from(e).0);
+        Ok(ldf.drop_nulls(subset).into())
+    }
+
+    fn unpivot(
+        &self,
+        on: ListSexp,
+        index: ListSexp,
+        value_name: Option<String>,
+        variable_name: Option<String>,
+    ) -> Result<PlRLazyFrame> {
+        let args = UnpivotArgsDSL {
+            on: on.into_iter().map(|e| e.inner.into()).collect(),
+            index: index.into_iter().map(|e| e.inner.into()).collect(),
+            value_name: value_name.map(|s| s.into()),
+            variable_name: variable_name.map(|s| s.into()),
+        };
+
+        let ldf = self.ldf.clone();
+        Ok(ldf.unpivot(args).into())
+    }
+
+    fn with_row_index(&self, name: &str, offset: Option<IdxSize>) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.with_row_index(name, offset).into())
+    }
+
+    // fn map_batches(
+    //     &self,
+    //     lambda: PyObject,
+    //     predicate_pushdown: bool,
+    //     projection_pushdown: bool,
+    //     slice_pushdown: bool,
+    //     streamable: bool,
+    //     schema: Option<Wrap<Schema>>,
+    //     validate_output: bool,
+    // ) -> Result<PlRLazyFrame> {
+    //     let mut opt = OptFlags::default();
+    //     opt.set(OptFlags::PREDICATE_PUSHDOWN, predicate_pushdown);
+    //     opt.set(OptFlags::PROJECTION_PUSHDOWN, projection_pushdown);
+    //     opt.set(OptFlags::SLICE_PUSHDOWN, slice_pushdown);
+    //     opt.set(OptFlags::STREAMING, streamable);
+
+    //     self.ldf
+    //         .clone()
+    //         .map_python(
+    //             lambda.into(),
+    //             opt,
+    //             schema.map(|s| Arc::new(s.0)),
+    //             validate_output,
+    //         )
+    //         .into()
+    // }
+
+    fn clone(&self) -> Result<PlRLazyFrame> {
+        Ok(self.ldf.clone().into())
+    }
+
+    fn collect_schema(&mut self, py: Python) -> Result<ListSexp> {
+        let schema = py
+            .allow_threads(|| self.ldf.collect_schema())
+            .map_err(RPolarsErr::from)?;
+
+        let schema_dict = PyDict::new_bound(py);
+        schema.iter_fields().for_each(|fld| {
+            schema_dict
+                .set_item(fld.name().as_str(), Wrap(fld.dtype().clone()))
+                .unwrap()
+        });
+        Ok(schema_dict.to_object(py))
+    }
+
+    fn unnest(&self, columns: ListSexp) -> Result<PlRLazyFrame> {
+        let columns = <Wrap<Vec<Expr>>>::from(columns).0;
+        Ok(self.ldf.clone().unnest(columns).into())
+    }
+
+    fn count(&self) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.count().into())
+    }
+
+    fn merge_sorted(&self, other: &PlRLazyFrame, key: &str) -> Result<PlRLazyFrame> {
+        let out = self
+            .ldf
+            .clone()
+            .merge_sorted(other.ldf, key)
+            .map_err(RPolarsErr::from)?;
+        Ok(out.into())
+    }
 }

From 9827498acb183986bf9932d7ad481240737e4318 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 09:44:05 +0100
Subject: [PATCH 02/71] fix compil

---
 src/rust/Cargo.toml               |   2 +
 src/rust/src/conversion/mod.rs    | 120 ++++++++++++++++++++++++-
 src/rust/src/lazyframe/general.rs | 144 ++++++++++++++++--------------
 src/rust/src/lazyframe/mod.rs     |  14 ++-
 4 files changed, 209 insertions(+), 71 deletions(-)

diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml
index e922dbed..9378f879 100644
--- a/src/rust/Cargo.toml
+++ b/src/rust/Cargo.toml
@@ -60,8 +60,10 @@ features = [
     "month_end",
     "offset_by",
     "parquet",
+    "pivot",
     "propagate_nans",
     "range",
+    "semi_anti_join",
     "serde",
     "serde-lazy",
     "streaming",
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index 98fd8dd9..e8b74754 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -1,5 +1,5 @@
 use crate::prelude::*;
-use crate::{PlRDataFrame, PlRDataType, PlRExpr};
+use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame};
 use polars::series::ops::NullBehavior;
 use savvy::{ListSexp, NumericScalar, NumericSexp, NumericTypedSexp, TypedSexp};
 pub mod base_date;
@@ -96,6 +96,21 @@ impl TryFrom<ListSexp> for Wrap<Vec<DataFrame>> {
     }
 }
 
+impl TryFrom<ListSexp> for Wrap<Vec<LazyFrame>> {
+    type Error = savvy::Error;
+
+    fn try_from(list: ListSexp) -> Result<Self, savvy::Error> {
+        let dfs = list
+            .values_iter()
+            .map(|sexp| match sexp.into_typed() {
+                TypedSexp::Environment(e) => Ok(<&PlRLazyFrame>::try_from(e)?.ldf.clone()),
+                _ => Err("Only accept a list of polars data frames".to_string()),
+            })
+            .collect::<Result<Vec<_>, _>>()?;
+        Ok(Wrap(dfs))
+    }
+}
+
 impl From<ListSexp> for Wrap<Vec<Expr>> {
     fn from(list: ListSexp) -> Self {
         let expr_list = list
@@ -427,7 +442,7 @@ impl TryFrom<&str> for Wrap<ClosedWindow> {
             "left" => ClosedWindow::Left,
             "none" => ClosedWindow::None,
             "right" => ClosedWindow::Right,
-            v => return Err(format!("unreachable",)),
+            _ => return Err(format!("unreachable",)),
         };
         Ok(Wrap(parsed))
     }
@@ -441,7 +456,106 @@ impl TryFrom<&str> for Wrap<Roll> {
             "raise" => Roll::Raise,
             "forward" => Roll::Forward,
             "backward" => Roll::Backward,
-            v => return Err(format!("unreachable",)),
+            _ => return Err(format!("unreachable",)),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<QuantileMethod> {
+    type Error = String;
+
+    fn try_from(interpolation: &str) -> Result<Self, String> {
+        let parsed = match interpolation {
+            "nearest" => QuantileMethod::Nearest,
+            "higher" => QuantileMethod::Higher,
+            "lower" => QuantileMethod::Lower,
+            "midpoint" => QuantileMethod::Midpoint,
+            "linear" => QuantileMethod::Linear,
+            "equiprobable" => QuantileMethod::Equiprobable,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<UniqueKeepStrategy> {
+    type Error = String;
+
+    fn try_from(strategy: &str) -> Result<Self, String> {
+        let parsed = match strategy {
+            "first" => UniqueKeepStrategy::First,
+            "last" => UniqueKeepStrategy::Last,
+            "none" => UniqueKeepStrategy::None,
+            "any" => UniqueKeepStrategy::Any,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<JoinType> {
+    type Error = String;
+
+    fn try_from(how: &str) -> Result<Self, String> {
+        let parsed = match how {
+            "cross" => JoinType::Cross,
+            "inner" => JoinType::Inner,
+            "left" => JoinType::Left,
+            "right" => JoinType::Right,
+            "full" => JoinType::Full,
+            "semi" => JoinType::Semi,
+            "anti" => JoinType::Anti,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<JoinValidation> {
+    type Error = String;
+
+    fn try_from(validation: &str) -> Result<Self, String> {
+        let parsed = match validation {
+            "m:m" => JoinValidation::ManyToMany,
+            "1:m" => JoinValidation::OneToMany,
+            "1:1" => JoinValidation::OneToOne,
+            "m:1" => JoinValidation::ManyToOne,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<Label> {
+    type Error = String;
+
+    fn try_from(label: &str) -> Result<Self, String> {
+        let parsed = match label {
+            "left" => Label::Left,
+            "right" => Label::Right,
+            "datapoint" => Label::DataPoint,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<StartBy> {
+    type Error = String;
+
+    fn try_from(start_by: &str) -> Result<Self, String> {
+        let parsed = match start_by {
+            "window" => StartBy::WindowBound,
+            "datapoint" => StartBy::DataPoint,
+            "monday" => StartBy::Monday,
+            "tuesday" => StartBy::Tuesday,
+            "wednesday" => StartBy::Wednesday,
+            "thursday" => StartBy::Thursday,
+            "friday" => StartBy::Friday,
+            "saturday" => StartBy::Saturday,
+            "sunday" => StartBy::Sunday,
+            _ => return Err(format!("unreachable")),
         };
         Ok(Wrap(parsed))
     }
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 86ba11af..85116775 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -1,6 +1,8 @@
 use super::*;
 use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame, PlRLazyGroupBy, RPolarsErr};
-use savvy::{savvy, ListSexp, LogicalSexp, NumericScalar, OwnedStringSexp, Result, Sexp};
+use savvy::{
+    savvy, ListSexp, LogicalSexp, NumericScalar, OwnedStringSexp, Result, Sexp, StringSexp,
+};
 
 #[savvy]
 impl PlRLazyFrame {
@@ -220,8 +222,9 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn top_k(&self, k: IdxSize, by: ListSexp, reverse: LogicalSexp) -> Result<PlRLazyFrame> {
+    fn top_k(&self, k: NumericScalar, by: ListSexp, reverse: LogicalSexp) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
+        let k = <Wrap<u32>>::try_from(k)?.0;
         let exprs = <Wrap<Vec<Expr>>>::from(by).0;
         let reverse = reverse.to_vec();
         Ok(ldf
@@ -233,8 +236,14 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn bottom_k(&self, k: IdxSize, by: ListSexp, reverse: LogicalSexp) -> Result<PlRLazyFrame> {
+    fn bottom_k(
+        &self,
+        k: NumericScalar,
+        by: ListSexp,
+        reverse: LogicalSexp,
+    ) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
+        let k = <Wrap<u32>>::try_from(k)?.0;
         let exprs = <Wrap<Vec<Expr>>>::from(by).0;
         let reverse = reverse.to_vec();
         Ok(ldf
@@ -399,12 +408,12 @@ impl PlRLazyFrame {
     //         Ok(())
     //     }
 
-    fn fetch(&self, py: Python, n_rows: NumericScalar) -> Result<PlRDataFrame> {
-        let ldf = self.ldf.clone();
-        let n_rows = <Wrap<usize>>::try_from(n_rows)?.0;
-        let df = py.allow_threads(|| ldf.fetch(n_rows).map_err(RPolarsErr::from))?;
-        Ok(df.into())
-    }
+    // fn fetch(&self, py: Python, n_rows: NumericScalar) -> Result<PlRDataFrame> {
+    //     let ldf = self.ldf.clone();
+    //     let n_rows = <Wrap<usize>>::try_from(n_rows)?.0;
+    //     let df = py.allow_threads(|| ldf.fetch(n_rows).map_err(RPolarsErr::from))?;
+    //     Ok(df.into())
+    // }
 
     fn select_seq(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
@@ -424,7 +433,7 @@ impl PlRLazyFrame {
         let ldf = self.ldf.clone();
         let by = <Wrap<Vec<Expr>>>::from(by).0;
         let lazy_gb = ldf.rolling(
-            index_column.inner,
+            index_column.inner.clone(),
             by,
             RollingGroupOptions {
                 index_column: "".into(),
@@ -443,26 +452,28 @@ impl PlRLazyFrame {
         every: &str,
         period: &str,
         offset: &str,
-        label: Wrap<Label>,
+        label: &str,
         include_boundaries: bool,
         closed: &str,
         group_by: ListSexp,
-        start_by: Wrap<StartBy>,
+        start_by: &str,
     ) -> Result<PlRLazyGroupBy> {
         let closed_window = <Wrap<ClosedWindow>>::try_from(closed)?.0;
         let group_by = <Wrap<Vec<Expr>>>::from(group_by).0;
         let ldf = self.ldf.clone();
+        let label = <Wrap<Label>>::try_from(label)?.0;
+        let start_by = <Wrap<StartBy>>::try_from(start_by)?.0;
         let lazy_gb = ldf.group_by_dynamic(
-            index_column.inner,
+            index_column.inner.clone(),
             group_by,
             DynamicGroupOptions {
                 every: Duration::try_parse(every).map_err(RPolarsErr::from)?,
                 period: Duration::try_parse(period).map_err(RPolarsErr::from)?,
                 offset: Duration::try_parse(offset).map_err(RPolarsErr::from)?,
-                label: label.0,
+                label,
                 include_boundaries,
                 closed_window,
-                start_by: start_by.0,
+                start_by,
                 ..Default::default()
             },
         );
@@ -470,8 +481,8 @@ impl PlRLazyFrame {
         Ok(PlRLazyGroupBy { lgb: Some(lazy_gb) })
     }
 
-    fn with_context(&self, contexts: Vec<Self>) -> Result<PlRLazyFrame> {
-        let contexts = contexts.into_iter().map(|ldf| ldf.ldf).collect::<Vec<_>>();
+    fn with_context(&self, contexts: ListSexp) -> Result<PlRLazyFrame> {
+        let contexts = <Wrap<Vec<LazyFrame>>>::try_from(contexts)?.0;
         Ok(self.ldf.clone().with_context(contexts).into())
     }
 
@@ -527,9 +538,9 @@ impl PlRLazyFrame {
         allow_parallel: bool,
         force_parallel: bool,
         join_nulls: bool,
-        how: Wrap<JoinType>,
-        suffix: String,
-        validate: Wrap<JoinValidation>,
+        how: &str,
+        suffix: &str,
+        validate: &str,
         coalesce: Option<bool>,
     ) -> Result<PlRLazyFrame> {
         let coalesce = match coalesce {
@@ -538,16 +549,11 @@ impl PlRLazyFrame {
             Some(false) => JoinCoalesce::KeepColumns,
         };
         let ldf = self.ldf.clone();
-        let other = other.ldf;
-        let left_on = left_on
-            .into_iter()
-            .map(|pyexpr| pyexpr.inner)
-            .collect::<Vec<_>>();
-        let right_on = right_on
-            .into_iter()
-            .map(|pyexpr| pyexpr.inner)
-            .collect::<Vec<_>>();
-
+        let other = other.ldf.clone();
+        let left_on = <Wrap<Vec<Expr>>>::from(left_on).0;
+        let right_on = <Wrap<Vec<Expr>>>::from(right_on).0;
+        let how = <Wrap<JoinType>>::try_from(how)?.0;
+        let validate = <Wrap<JoinValidation>>::try_from(validate)?.0;
         Ok(ldf
             .join_builder()
             .with(other)
@@ -556,9 +562,9 @@ impl PlRLazyFrame {
             .allow_parallel(allow_parallel)
             .force_parallel(force_parallel)
             .join_nulls(join_nulls)
-            .how(how.0)
+            .how(how)
             .coalesce(coalesce)
-            .validate(validate.0)
+            .validate(validate)
             .suffix(suffix)
             .finish()
             .into())
@@ -568,10 +574,10 @@ impl PlRLazyFrame {
         &self,
         other: &PlRLazyFrame,
         predicates: ListSexp,
-        suffix: String,
+        suffix: &str,
     ) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
-        let other = other.ldf;
+        let other = other.ldf.clone();
 
         let predicates = <Wrap<Vec<Expr>>>::from(predicates).0;
 
@@ -591,12 +597,12 @@ impl PlRLazyFrame {
 
     fn rename(
         &mut self,
-        existing: Vec<String>,
-        new: Vec<String>,
+        existing: StringSexp,
+        new: StringSexp,
         strict: bool,
     ) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
-        Ok(ldf.rename(existing, new, strict).into())
+        Ok(ldf.rename(existing.to_vec(), new.to_vec(), strict).into())
     }
 
     fn reverse(&self) -> Result<PlRLazyFrame> {
@@ -607,15 +613,15 @@ impl PlRLazyFrame {
     fn shift(&self, n: &PlRExpr, fill_value: Option<&PlRExpr>) -> Result<PlRLazyFrame> {
         let lf = self.ldf.clone();
         let out = match fill_value {
-            Some(v) => lf.shift_and_fill(n.inner, v.inner),
-            None => lf.shift(n.inner),
+            Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
+            None => lf.shift(n.inner.clone()),
         };
         Ok(out.into())
     }
 
     fn fill_nan(&self, fill_value: &PlRExpr) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
-        ldf.fill_nan(fill_value.inner).into()
+        Ok(ldf.fill_nan(fill_value.inner.clone()).into())
     }
 
     fn min(&self) -> Result<PlRLazyFrame> {
@@ -660,13 +666,10 @@ impl PlRLazyFrame {
         Ok(out.into())
     }
 
-    fn quantile(
-        &self,
-        quantile: &PlRExpr,
-        interpolation: Wrap<QuantileMethod>,
-    ) -> Result<PlRLazyFrame> {
+    fn quantile(&self, quantile: &PlRExpr, interpolation: &str) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
-        let out = ldf.quantile(quantile.inner, interpolation.0);
+        let interpolation = <Wrap<QuantileMethod>>::try_from(interpolation)?.0;
+        let out = ldf.quantile(quantile.inner.clone(), interpolation);
         Ok(out.into())
     }
 
@@ -684,14 +687,15 @@ impl PlRLazyFrame {
     fn unique(
         &self,
         maintain_order: bool,
-        keep: Wrap<UniqueKeepStrategy>,
+        keep: &str,
         subset: Option<ListSexp>,
     ) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
+        let keep = <Wrap<UniqueKeepStrategy>>::try_from(keep)?.0;
         let subset = subset.map(|e| <Wrap<Vec<Expr>>>::from(e).0);
         let out = match maintain_order {
-            true => ldf.unique_stable_generic(subset, keep.0),
-            false => ldf.unique_generic(subset, keep.0),
+            true => ldf.unique_stable_generic(subset, keep),
+            false => ldf.unique_generic(subset, keep),
         };
         Ok(out.into())
     }
@@ -706,12 +710,14 @@ impl PlRLazyFrame {
         &self,
         on: ListSexp,
         index: ListSexp,
-        value_name: Option<String>,
-        variable_name: Option<String>,
+        value_name: Option<&str>,
+        variable_name: Option<&str>,
     ) -> Result<PlRLazyFrame> {
+        let on = <Wrap<Vec<Expr>>>::from(on).0;
+        let index = <Wrap<Vec<Expr>>>::from(index).0;
         let args = UnpivotArgsDSL {
-            on: on.into_iter().map(|e| e.inner.into()).collect(),
-            index: index.into_iter().map(|e| e.inner.into()).collect(),
+            on: on.into_iter().map(|e| e.into()).collect(),
+            index: index.into_iter().map(|e| e.into()).collect(),
             value_name: value_name.map(|s| s.into()),
             variable_name: variable_name.map(|s| s.into()),
         };
@@ -720,8 +726,12 @@ impl PlRLazyFrame {
         Ok(ldf.unpivot(args).into())
     }
 
-    fn with_row_index(&self, name: &str, offset: Option<IdxSize>) -> Result<PlRLazyFrame> {
+    fn with_row_index(&self, name: &str, offset: Option<NumericScalar>) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
+        let offset: Option<u32> = match offset {
+            Some(x) => Some(<Wrap<u32>>::try_from(x)?.0),
+            None => None,
+        };
         Ok(ldf.with_row_index(name, offset).into())
     }
 
@@ -756,19 +766,19 @@ impl PlRLazyFrame {
         Ok(self.ldf.clone().into())
     }
 
-    fn collect_schema(&mut self, py: Python) -> Result<ListSexp> {
-        let schema = py
-            .allow_threads(|| self.ldf.collect_schema())
-            .map_err(RPolarsErr::from)?;
+    // fn collect_schema(&mut self, py: Python) -> Result<ListSexp> {
+    //     let schema = py
+    //         .allow_threads(|| self.ldf.collect_schema())
+    //         .map_err(RPolarsErr::from)?;
 
-        let schema_dict = PyDict::new_bound(py);
-        schema.iter_fields().for_each(|fld| {
-            schema_dict
-                .set_item(fld.name().as_str(), Wrap(fld.dtype().clone()))
-                .unwrap()
-        });
-        Ok(schema_dict.to_object(py))
-    }
+    //     let schema_dict = PyDict::new_bound(py);
+    //     schema.iter_fields().for_each(|fld| {
+    //         schema_dict
+    //             .set_item(fld.name().as_str(), Wrap(fld.dtype().clone()))
+    //             .unwrap()
+    //     });
+    //     Ok(schema_dict.to_object(py))
+    // }
 
     fn unnest(&self, columns: ListSexp) -> Result<PlRLazyFrame> {
         let columns = <Wrap<Vec<Expr>>>::from(columns).0;
@@ -784,7 +794,7 @@ impl PlRLazyFrame {
         let out = self
             .ldf
             .clone()
-            .merge_sorted(other.ldf, key)
+            .merge_sorted(other.ldf.clone(), key)
             .map_err(RPolarsErr::from)?;
         Ok(out.into())
     }
diff --git a/src/rust/src/lazyframe/mod.rs b/src/rust/src/lazyframe/mod.rs
index b5b5fc09..20725f28 100644
--- a/src/rust/src/lazyframe/mod.rs
+++ b/src/rust/src/lazyframe/mod.rs
@@ -1,7 +1,7 @@
 mod general;
 
 use crate::prelude::*;
-use savvy::savvy;
+use savvy::{savvy, EnvironmentSexp};
 
 #[savvy]
 #[repr(transparent)]
@@ -15,3 +15,15 @@ impl From<LazyFrame> for PlRLazyFrame {
         PlRLazyFrame { ldf }
     }
 }
+
+impl TryFrom<EnvironmentSexp> for &PlRLazyFrame {
+    type Error = String;
+
+    fn try_from(env: EnvironmentSexp) -> Result<Self, String> {
+        let ptr = env
+            .get(".ptr")
+            .expect("Failed to get `.ptr` from the object")
+            .ok_or("The object is not a valid polars data frame")?;
+        <&PlRLazyFrame>::try_from(ptr).map_err(|e| e.to_string())
+    }
+}

From 88e0db08485b8a262b4351d483a5713c2012ae44 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 10:00:25 +0100
Subject: [PATCH 03/71] more

---
 R/000-wrappers.R    | 247 ++++++++++++++++++++++++++++++++++++++++++++
 R/lazyframe-frame.R |  37 +++++--
 src/init.c          | 204 ++++++++++++++++++++++++++++++++++++
 src/rust/api.h      |  34 ++++++
 4 files changed, 513 insertions(+), 9 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index e55614ec..32e00034 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2300,6 +2300,219 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_to_dot` <- function(self) {
+  function(`optimized`) {
+    .savvy_wrap_String(.Call(savvy_PlRLazyFrame_to_dot__impl, `self`, `optimized`))
+  }
+}
+
+`PlRLazyFrame_sort` <- function(self) {
+  function(`by_column`, `descending`, `nulls_last`, `maintain_order`, `multithreaded`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_sort__impl, `self`, `by_column`, `descending`, `nulls_last`, `maintain_order`, `multithreaded`))
+  }
+}
+
+`PlRLazyFrame_top_k` <- function(self) {
+  function(`k`, `by`, `reverse`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_top_k__impl, `self`, `k`, `by`, `reverse`))
+  }
+}
+
+`PlRLazyFrame_bottom_k` <- function(self) {
+  function(`k`, `by`, `reverse`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_bottom_k__impl, `self`, `k`, `by`, `reverse`))
+  }
+}
+
+`PlRLazyFrame_cache` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_cache__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_select_seq` <- function(self) {
+  function(`exprs`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_select_seq__impl, `self`, `exprs`))
+  }
+}
+
+`PlRLazyFrame_rolling` <- function(self) {
+  function(`index_column`, `period`, `offset`, `closed`, `by`) {
+    `index_column` <- .savvy_extract_ptr(`index_column`, "PlRExpr")
+    .savvy_wrap_PlRLazyGroupBy(.Call(savvy_PlRLazyFrame_rolling__impl, `self`, `index_column`, `period`, `offset`, `closed`, `by`))
+  }
+}
+
+`PlRLazyFrame_group_by_dynamic` <- function(self) {
+  function(`index_column`, `every`, `period`, `offset`, `label`, `include_boundaries`, `closed`, `group_by`, `start_by`) {
+    `index_column` <- .savvy_extract_ptr(`index_column`, "PlRExpr")
+    .savvy_wrap_PlRLazyGroupBy(.Call(savvy_PlRLazyFrame_group_by_dynamic__impl, `self`, `index_column`, `every`, `period`, `offset`, `label`, `include_boundaries`, `closed`, `group_by`, `start_by`))
+  }
+}
+
+`PlRLazyFrame_with_context` <- function(self) {
+  function(`contexts`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_with_context__impl, `self`, `contexts`))
+  }
+}
+
+`PlRLazyFrame_join` <- function(self) {
+  function(`other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `join_nulls`, `how`, `suffix`, `validate`, `coalesce` = NULL) {
+    `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_join__impl, `self`, `other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `join_nulls`, `how`, `suffix`, `validate`, `coalesce`))
+  }
+}
+
+`PlRLazyFrame_join_where` <- function(self) {
+  function(`other`, `predicates`, `suffix`) {
+    `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_join_where__impl, `self`, `other`, `predicates`, `suffix`))
+  }
+}
+
+`PlRLazyFrame_with_columns_seq` <- function(self) {
+  function(`exprs`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_with_columns_seq__impl, `self`, `exprs`))
+  }
+}
+
+`PlRLazyFrame_rename` <- function(self) {
+  function(`existing`, `new`, `strict`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_rename__impl, `self`, `existing`, `new`, `strict`))
+  }
+}
+
+`PlRLazyFrame_reverse` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_reverse__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_shift` <- function(self) {
+  function(`n`, `fill_value` = NULL) {
+    `n` <- .savvy_extract_ptr(`n`, "PlRExpr")
+    `fill_value` <- .savvy_extract_ptr(`fill_value`, "PlRExpr")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_shift__impl, `self`, `n`, `fill_value`))
+  }
+}
+
+`PlRLazyFrame_fill_nan` <- function(self) {
+  function(`fill_value`) {
+    `fill_value` <- .savvy_extract_ptr(`fill_value`, "PlRExpr")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_fill_nan__impl, `self`, `fill_value`))
+  }
+}
+
+`PlRLazyFrame_min` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_min__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_max` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_max__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_sum` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_sum__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_mean` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_mean__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_std` <- function(self) {
+  function(`ddof`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_std__impl, `self`, `ddof`))
+  }
+}
+
+`PlRLazyFrame_var` <- function(self) {
+  function(`ddof`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_var__impl, `self`, `ddof`))
+  }
+}
+
+`PlRLazyFrame_median` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_median__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_quantile` <- function(self) {
+  function(`quantile`, `interpolation`) {
+    `quantile` <- .savvy_extract_ptr(`quantile`, "PlRExpr")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_quantile__impl, `self`, `quantile`, `interpolation`))
+  }
+}
+
+`PlRLazyFrame_explode` <- function(self) {
+  function(`column`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_explode__impl, `self`, `column`))
+  }
+}
+
+`PlRLazyFrame_null_count` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_null_count__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_unique` <- function(self) {
+  function(`maintain_order`, `keep`, `subset` = NULL) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unique__impl, `self`, `maintain_order`, `keep`, `subset`))
+  }
+}
+
+`PlRLazyFrame_drop_nulls` <- function(self) {
+  function(`subset` = NULL) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_drop_nulls__impl, `self`, `subset`))
+  }
+}
+
+`PlRLazyFrame_unpivot` <- function(self) {
+  function(`on`, `index`, `value_name` = NULL, `variable_name` = NULL) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unpivot__impl, `self`, `on`, `index`, `value_name`, `variable_name`))
+  }
+}
+
+`PlRLazyFrame_with_row_index` <- function(self) {
+  function(`name`, `offset` = NULL) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_with_row_index__impl, `self`, `name`, `offset`))
+  }
+}
+
+`PlRLazyFrame_clone` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_clone__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_unnest` <- function(self) {
+  function(`columns`) {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unnest__impl, `self`, `columns`))
+  }
+}
+
+`PlRLazyFrame_count` <- function(self) {
+  function() {
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_count__impl, `self`))
+  }
+}
+
+`PlRLazyFrame_merge_sorted` <- function(self) {
+  function(`other`, `key`) {
+    `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_merge_sorted__impl, `self`, `other`, `key`))
+  }
+}
+
 `.savvy_wrap_PlRLazyFrame` <- function(ptr) {
   e <- new.env(parent = emptyenv())
   e$.ptr <- ptr
@@ -2319,6 +2532,40 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`cast_all` <- `PlRLazyFrame_cast_all`(ptr)
   e$`sort_by_exprs` <- `PlRLazyFrame_sort_by_exprs`(ptr)
   e$`with_columns` <- `PlRLazyFrame_with_columns`(ptr)
+  e$`to_dot` <- `PlRLazyFrame_to_dot`(ptr)
+  e$`sort` <- `PlRLazyFrame_sort`(ptr)
+  e$`top_k` <- `PlRLazyFrame_top_k`(ptr)
+  e$`bottom_k` <- `PlRLazyFrame_bottom_k`(ptr)
+  e$`cache` <- `PlRLazyFrame_cache`(ptr)
+  e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
+  e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
+  e$`group_by_dynamic` <- `PlRLazyFrame_group_by_dynamic`(ptr)
+  e$`with_context` <- `PlRLazyFrame_with_context`(ptr)
+  e$`join` <- `PlRLazyFrame_join`(ptr)
+  e$`join_where` <- `PlRLazyFrame_join_where`(ptr)
+  e$`with_columns_seq` <- `PlRLazyFrame_with_columns_seq`(ptr)
+  e$`rename` <- `PlRLazyFrame_rename`(ptr)
+  e$`reverse` <- `PlRLazyFrame_reverse`(ptr)
+  e$`shift` <- `PlRLazyFrame_shift`(ptr)
+  e$`fill_nan` <- `PlRLazyFrame_fill_nan`(ptr)
+  e$`min` <- `PlRLazyFrame_min`(ptr)
+  e$`max` <- `PlRLazyFrame_max`(ptr)
+  e$`sum` <- `PlRLazyFrame_sum`(ptr)
+  e$`mean` <- `PlRLazyFrame_mean`(ptr)
+  e$`std` <- `PlRLazyFrame_std`(ptr)
+  e$`var` <- `PlRLazyFrame_var`(ptr)
+  e$`median` <- `PlRLazyFrame_median`(ptr)
+  e$`quantile` <- `PlRLazyFrame_quantile`(ptr)
+  e$`explode` <- `PlRLazyFrame_explode`(ptr)
+  e$`null_count` <- `PlRLazyFrame_null_count`(ptr)
+  e$`unique` <- `PlRLazyFrame_unique`(ptr)
+  e$`drop_nulls` <- `PlRLazyFrame_drop_nulls`(ptr)
+  e$`unpivot` <- `PlRLazyFrame_unpivot`(ptr)
+  e$`with_row_index` <- `PlRLazyFrame_with_row_index`(ptr)
+  e$`clone` <- `PlRLazyFrame_clone`(ptr)
+  e$`unnest` <- `PlRLazyFrame_unnest`(ptr)
+  e$`count` <- `PlRLazyFrame_count`(ptr)
+  e$`merge_sorted` <- `PlRLazyFrame_merge_sorted`(ptr)
 
   class(e) <- c("PlRLazyFrame", "savvy_neopolars__sealed")
   e
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index c00fb7cb..10e42ffd 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -477,9 +477,15 @@ lazyframe__std <- function(ddof = 1) {
 #' @inherit as_polars_lf return
 #' @examples
 #' as_polars_lf(mtcars)$quantile(.4)$collect()
-lazyframe__quantile <- function(quantile, interpolation = "nearest") {
+lazyframe__quantile <- function(
+    quantile,
+    interpolation = c("nearest", "higher", "lower", "midpoint", "linear")) {
   wrap({
-    self$`_rexpr`$quantile(wrap_e_result(quantile), interpolation)
+    interpolation <- arg_match0(
+      interpolation,
+      values = c("nearest", "higher", "lower", "midpoint", "linear")
+    )
+    self$`_rexpr`$quantile(as_polars_expr(quantile, as_lit = TRUE)$`_rexpr`, interpolation)
   })
 }
 
@@ -803,7 +809,7 @@ lazyframe__join <- function(
 #' `"x<suffix>"`.
 #' @param suffix Suffix to append to columns with a duplicate name.
 #'
-#' @return A LazyFrame
+#' @inherit as_polars_lf return
 #'
 #' @examples
 #' east <- pl$LazyFrame(
@@ -1017,7 +1023,7 @@ lazyframe__join_asof <- function(
 #'
 #'
 #'
-#' @return A LazyFrame
+#' @inherit as_polars_lf return
 #'
 #' @examples
 #' lf <- pl$LazyFrame(
@@ -1320,7 +1326,7 @@ lazyframe__explode <- function(...) {
 #' but this can be useful when dealing with attributes (see examples).
 #'
 #'
-#' @return A LazyFrame
+#' @inherit as_polars_lf return
 #' @examples
 #' df1 <- pl$LazyFrame(iris)
 #'
@@ -1354,7 +1360,7 @@ lazyframe__clone <- function() {
 #'
 #' @inheritParams DataFrame_unnest
 #'
-#' @return A LazyFrame where some or all columns of datatype Struct are unnested.
+#' @inherit as_polars_lf return where some or all columns of datatype Struct are unnested.
 #' @examples
 #' lf <- pl$LazyFrame(
 #'   a = 1:5,
@@ -1391,7 +1397,7 @@ lazyframe__unnest <- function(...) {
 #'
 #' @param other Data/LazyFrame to have access to. This can be a list of DataFrames
 #' and LazyFrames.
-#' @return A LazyFrame
+#' @inherit as_polars_lf return
 #'
 #' @examples
 #' lf <- pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA))
@@ -1727,7 +1733,7 @@ lazyframe__sql <- function(query, ..., table_name = NULL, envir = parent.frame()
 #' @param n Gather every `n`-th row.
 #' @param offset Starting index.
 #'
-#' @return A LazyFrame
+#' @inherit as_polars_lf return
 #'
 #' @examples
 #' lf <- pl$LazyFrame(a = 1:4, b = 5:8)
@@ -1751,7 +1757,7 @@ lazyframe__gather_every <- function(n, offset = 0) {
 #' @param strict If `TRUE` (default), throw an error if a cast could not be done
 #' (for instance, due to an overflow). Otherwise, return `null`.
 #'
-#' @return A LazyFrame
+#' @inherit as_polars_lf return
 #'
 #' @examples
 #' lf <- pl$LazyFrame(
@@ -1774,3 +1780,16 @@ lazyframe__cast <- function(dtypes, ..., strict = TRUE) {
       unwrap("in $cast():")
   }
 }
+
+#' Return the number of non-null elements for each column
+#'
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4))
+#' lf$count()$collect()
+lazyframe__count <- function() {
+  wrap({
+    self$`_rexpr`$count()
+  })
+}
diff --git a/src/init.c b/src/init.c
index a6d3f075..38be6b57 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1549,6 +1549,176 @@ SEXP savvy_PlRLazyFrame_with_columns__impl(SEXP self__, SEXP c_arg__exprs) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_to_dot__impl(SEXP self__, SEXP c_arg__optimized) {
+    SEXP res = savvy_PlRLazyFrame_to_dot__ffi(self__, c_arg__optimized);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_sort__impl(SEXP self__, SEXP c_arg__by_column, SEXP c_arg__descending, SEXP c_arg__nulls_last, SEXP c_arg__maintain_order, SEXP c_arg__multithreaded) {
+    SEXP res = savvy_PlRLazyFrame_sort__ffi(self__, c_arg__by_column, c_arg__descending, c_arg__nulls_last, c_arg__maintain_order, c_arg__multithreaded);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_top_k__impl(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse) {
+    SEXP res = savvy_PlRLazyFrame_top_k__ffi(self__, c_arg__k, c_arg__by, c_arg__reverse);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_bottom_k__impl(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse) {
+    SEXP res = savvy_PlRLazyFrame_bottom_k__ffi(self__, c_arg__k, c_arg__by, c_arg__reverse);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_cache__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_cache__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_select_seq__impl(SEXP self__, SEXP c_arg__exprs) {
+    SEXP res = savvy_PlRLazyFrame_select_seq__ffi(self__, c_arg__exprs);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_rolling__impl(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by) {
+    SEXP res = savvy_PlRLazyFrame_rolling__ffi(self__, c_arg__index_column, c_arg__period, c_arg__offset, c_arg__closed, c_arg__by);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_group_by_dynamic__impl(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by) {
+    SEXP res = savvy_PlRLazyFrame_group_by_dynamic__ffi(self__, c_arg__index_column, c_arg__every, c_arg__period, c_arg__offset, c_arg__label, c_arg__include_boundaries, c_arg__closed, c_arg__group_by, c_arg__start_by);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_with_context__impl(SEXP self__, SEXP c_arg__contexts) {
+    SEXP res = savvy_PlRLazyFrame_with_context__ffi(self__, c_arg__contexts);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_join__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__join_nulls, SEXP c_arg__how, SEXP c_arg__suffix, SEXP c_arg__validate, SEXP c_arg__coalesce) {
+    SEXP res = savvy_PlRLazyFrame_join__ffi(self__, c_arg__other, c_arg__left_on, c_arg__right_on, c_arg__allow_parallel, c_arg__force_parallel, c_arg__join_nulls, c_arg__how, c_arg__suffix, c_arg__validate, c_arg__coalesce);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_join_where__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__predicates, SEXP c_arg__suffix) {
+    SEXP res = savvy_PlRLazyFrame_join_where__ffi(self__, c_arg__other, c_arg__predicates, c_arg__suffix);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_with_columns_seq__impl(SEXP self__, SEXP c_arg__exprs) {
+    SEXP res = savvy_PlRLazyFrame_with_columns_seq__ffi(self__, c_arg__exprs);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_rename__impl(SEXP self__, SEXP c_arg__existing, SEXP c_arg__new, SEXP c_arg__strict) {
+    SEXP res = savvy_PlRLazyFrame_rename__ffi(self__, c_arg__existing, c_arg__new, c_arg__strict);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_reverse__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_reverse__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_shift__impl(SEXP self__, SEXP c_arg__n, SEXP c_arg__fill_value) {
+    SEXP res = savvy_PlRLazyFrame_shift__ffi(self__, c_arg__n, c_arg__fill_value);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_fill_nan__impl(SEXP self__, SEXP c_arg__fill_value) {
+    SEXP res = savvy_PlRLazyFrame_fill_nan__ffi(self__, c_arg__fill_value);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_min__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_min__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_max__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_max__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_sum__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_sum__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_mean__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_mean__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_std__impl(SEXP self__, SEXP c_arg__ddof) {
+    SEXP res = savvy_PlRLazyFrame_std__ffi(self__, c_arg__ddof);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_var__impl(SEXP self__, SEXP c_arg__ddof) {
+    SEXP res = savvy_PlRLazyFrame_var__ffi(self__, c_arg__ddof);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_median__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_median__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_quantile__impl(SEXP self__, SEXP c_arg__quantile, SEXP c_arg__interpolation) {
+    SEXP res = savvy_PlRLazyFrame_quantile__ffi(self__, c_arg__quantile, c_arg__interpolation);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_explode__impl(SEXP self__, SEXP c_arg__column) {
+    SEXP res = savvy_PlRLazyFrame_explode__ffi(self__, c_arg__column);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_null_count__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_null_count__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_unique__impl(SEXP self__, SEXP c_arg__maintain_order, SEXP c_arg__keep, SEXP c_arg__subset) {
+    SEXP res = savvy_PlRLazyFrame_unique__ffi(self__, c_arg__maintain_order, c_arg__keep, c_arg__subset);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_drop_nulls__impl(SEXP self__, SEXP c_arg__subset) {
+    SEXP res = savvy_PlRLazyFrame_drop_nulls__ffi(self__, c_arg__subset);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_unpivot__impl(SEXP self__, SEXP c_arg__on, SEXP c_arg__index, SEXP c_arg__value_name, SEXP c_arg__variable_name) {
+    SEXP res = savvy_PlRLazyFrame_unpivot__ffi(self__, c_arg__on, c_arg__index, c_arg__value_name, c_arg__variable_name);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_with_row_index__impl(SEXP self__, SEXP c_arg__name, SEXP c_arg__offset) {
+    SEXP res = savvy_PlRLazyFrame_with_row_index__ffi(self__, c_arg__name, c_arg__offset);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_clone__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_clone__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_unnest__impl(SEXP self__, SEXP c_arg__columns) {
+    SEXP res = savvy_PlRLazyFrame_unnest__ffi(self__, c_arg__columns);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_count__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_count__ffi(self__);
+    return handle_result(res);
+}
+
+SEXP savvy_PlRLazyFrame_merge_sorted__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__key) {
+    SEXP res = savvy_PlRLazyFrame_merge_sorted__ffi(self__, c_arg__other, c_arg__key);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyGroupBy_agg__impl(SEXP self__, SEXP c_arg__aggs) {
     SEXP res = savvy_PlRLazyGroupBy_agg__ffi(self__, c_arg__aggs);
     return handle_result(res);
@@ -2049,6 +2219,40 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_cast_all__impl", (DL_FUNC) &savvy_PlRLazyFrame_cast_all__impl, 3},
     {"savvy_PlRLazyFrame_sort_by_exprs__impl", (DL_FUNC) &savvy_PlRLazyFrame_sort_by_exprs__impl, 6},
     {"savvy_PlRLazyFrame_with_columns__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_columns__impl, 2},
+    {"savvy_PlRLazyFrame_to_dot__impl", (DL_FUNC) &savvy_PlRLazyFrame_to_dot__impl, 2},
+    {"savvy_PlRLazyFrame_sort__impl", (DL_FUNC) &savvy_PlRLazyFrame_sort__impl, 6},
+    {"savvy_PlRLazyFrame_top_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_top_k__impl, 4},
+    {"savvy_PlRLazyFrame_bottom_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_bottom_k__impl, 4},
+    {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1},
+    {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
+    {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
+    {"savvy_PlRLazyFrame_group_by_dynamic__impl", (DL_FUNC) &savvy_PlRLazyFrame_group_by_dynamic__impl, 10},
+    {"savvy_PlRLazyFrame_with_context__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_context__impl, 2},
+    {"savvy_PlRLazyFrame_join__impl", (DL_FUNC) &savvy_PlRLazyFrame_join__impl, 11},
+    {"savvy_PlRLazyFrame_join_where__impl", (DL_FUNC) &savvy_PlRLazyFrame_join_where__impl, 4},
+    {"savvy_PlRLazyFrame_with_columns_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_columns_seq__impl, 2},
+    {"savvy_PlRLazyFrame_rename__impl", (DL_FUNC) &savvy_PlRLazyFrame_rename__impl, 4},
+    {"savvy_PlRLazyFrame_reverse__impl", (DL_FUNC) &savvy_PlRLazyFrame_reverse__impl, 1},
+    {"savvy_PlRLazyFrame_shift__impl", (DL_FUNC) &savvy_PlRLazyFrame_shift__impl, 3},
+    {"savvy_PlRLazyFrame_fill_nan__impl", (DL_FUNC) &savvy_PlRLazyFrame_fill_nan__impl, 2},
+    {"savvy_PlRLazyFrame_min__impl", (DL_FUNC) &savvy_PlRLazyFrame_min__impl, 1},
+    {"savvy_PlRLazyFrame_max__impl", (DL_FUNC) &savvy_PlRLazyFrame_max__impl, 1},
+    {"savvy_PlRLazyFrame_sum__impl", (DL_FUNC) &savvy_PlRLazyFrame_sum__impl, 1},
+    {"savvy_PlRLazyFrame_mean__impl", (DL_FUNC) &savvy_PlRLazyFrame_mean__impl, 1},
+    {"savvy_PlRLazyFrame_std__impl", (DL_FUNC) &savvy_PlRLazyFrame_std__impl, 2},
+    {"savvy_PlRLazyFrame_var__impl", (DL_FUNC) &savvy_PlRLazyFrame_var__impl, 2},
+    {"savvy_PlRLazyFrame_median__impl", (DL_FUNC) &savvy_PlRLazyFrame_median__impl, 1},
+    {"savvy_PlRLazyFrame_quantile__impl", (DL_FUNC) &savvy_PlRLazyFrame_quantile__impl, 3},
+    {"savvy_PlRLazyFrame_explode__impl", (DL_FUNC) &savvy_PlRLazyFrame_explode__impl, 2},
+    {"savvy_PlRLazyFrame_null_count__impl", (DL_FUNC) &savvy_PlRLazyFrame_null_count__impl, 1},
+    {"savvy_PlRLazyFrame_unique__impl", (DL_FUNC) &savvy_PlRLazyFrame_unique__impl, 4},
+    {"savvy_PlRLazyFrame_drop_nulls__impl", (DL_FUNC) &savvy_PlRLazyFrame_drop_nulls__impl, 2},
+    {"savvy_PlRLazyFrame_unpivot__impl", (DL_FUNC) &savvy_PlRLazyFrame_unpivot__impl, 5},
+    {"savvy_PlRLazyFrame_with_row_index__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_row_index__impl, 3},
+    {"savvy_PlRLazyFrame_clone__impl", (DL_FUNC) &savvy_PlRLazyFrame_clone__impl, 1},
+    {"savvy_PlRLazyFrame_unnest__impl", (DL_FUNC) &savvy_PlRLazyFrame_unnest__impl, 2},
+    {"savvy_PlRLazyFrame_count__impl", (DL_FUNC) &savvy_PlRLazyFrame_count__impl, 1},
+    {"savvy_PlRLazyFrame_merge_sorted__impl", (DL_FUNC) &savvy_PlRLazyFrame_merge_sorted__impl, 3},
     {"savvy_PlRLazyGroupBy_agg__impl", (DL_FUNC) &savvy_PlRLazyGroupBy_agg__impl, 2},
     {"savvy_PlRLazyGroupBy_head__impl", (DL_FUNC) &savvy_PlRLazyGroupBy_head__impl, 2},
     {"savvy_PlRLazyGroupBy_tail__impl", (DL_FUNC) &savvy_PlRLazyGroupBy_tail__impl, 2},
diff --git a/src/rust/api.h b/src/rust/api.h
index 5a9d8b98..6be30108 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -313,6 +313,40 @@ SEXP savvy_PlRLazyFrame_cast__ffi(SEXP self__, SEXP c_arg__dtypes, SEXP c_arg__s
 SEXP savvy_PlRLazyFrame_cast_all__ffi(SEXP self__, SEXP c_arg__dtype, SEXP c_arg__strict);
 SEXP savvy_PlRLazyFrame_sort_by_exprs__ffi(SEXP self__, SEXP c_arg__by, SEXP c_arg__descending, SEXP c_arg__nulls_last, SEXP c_arg__maintain_order, SEXP c_arg__multithreaded);
 SEXP savvy_PlRLazyFrame_with_columns__ffi(SEXP self__, SEXP c_arg__exprs);
+SEXP savvy_PlRLazyFrame_to_dot__ffi(SEXP self__, SEXP c_arg__optimized);
+SEXP savvy_PlRLazyFrame_sort__ffi(SEXP self__, SEXP c_arg__by_column, SEXP c_arg__descending, SEXP c_arg__nulls_last, SEXP c_arg__maintain_order, SEXP c_arg__multithreaded);
+SEXP savvy_PlRLazyFrame_top_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
+SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
+SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
+SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
+SEXP savvy_PlRLazyFrame_group_by_dynamic__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by);
+SEXP savvy_PlRLazyFrame_with_context__ffi(SEXP self__, SEXP c_arg__contexts);
+SEXP savvy_PlRLazyFrame_join__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__join_nulls, SEXP c_arg__how, SEXP c_arg__suffix, SEXP c_arg__validate, SEXP c_arg__coalesce);
+SEXP savvy_PlRLazyFrame_join_where__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__predicates, SEXP c_arg__suffix);
+SEXP savvy_PlRLazyFrame_with_columns_seq__ffi(SEXP self__, SEXP c_arg__exprs);
+SEXP savvy_PlRLazyFrame_rename__ffi(SEXP self__, SEXP c_arg__existing, SEXP c_arg__new, SEXP c_arg__strict);
+SEXP savvy_PlRLazyFrame_reverse__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_shift__ffi(SEXP self__, SEXP c_arg__n, SEXP c_arg__fill_value);
+SEXP savvy_PlRLazyFrame_fill_nan__ffi(SEXP self__, SEXP c_arg__fill_value);
+SEXP savvy_PlRLazyFrame_min__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_max__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_sum__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_mean__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_std__ffi(SEXP self__, SEXP c_arg__ddof);
+SEXP savvy_PlRLazyFrame_var__ffi(SEXP self__, SEXP c_arg__ddof);
+SEXP savvy_PlRLazyFrame_median__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_quantile__ffi(SEXP self__, SEXP c_arg__quantile, SEXP c_arg__interpolation);
+SEXP savvy_PlRLazyFrame_explode__ffi(SEXP self__, SEXP c_arg__column);
+SEXP savvy_PlRLazyFrame_null_count__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_unique__ffi(SEXP self__, SEXP c_arg__maintain_order, SEXP c_arg__keep, SEXP c_arg__subset);
+SEXP savvy_PlRLazyFrame_drop_nulls__ffi(SEXP self__, SEXP c_arg__subset);
+SEXP savvy_PlRLazyFrame_unpivot__ffi(SEXP self__, SEXP c_arg__on, SEXP c_arg__index, SEXP c_arg__value_name, SEXP c_arg__variable_name);
+SEXP savvy_PlRLazyFrame_with_row_index__ffi(SEXP self__, SEXP c_arg__name, SEXP c_arg__offset);
+SEXP savvy_PlRLazyFrame_clone__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_unnest__ffi(SEXP self__, SEXP c_arg__columns);
+SEXP savvy_PlRLazyFrame_count__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_merge_sorted__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__key);
 
 // methods and associated functions for PlRLazyGroupBy
 SEXP savvy_PlRLazyGroupBy_agg__ffi(SEXP self__, SEXP c_arg__aggs);

From 1f8fd9b27d6897e55eabcf503d64d9b4eaa57d46 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 10:56:38 +0100
Subject: [PATCH 04/71] more

---
 R/lazyframe-frame.R               | 478 ++++++++++++++++--------------
 src/rust/src/lazyframe/general.rs |   6 +-
 2 files changed, 266 insertions(+), 218 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 10e42ffd..7c0a8b18 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -57,7 +57,7 @@ wrap.PlRLazyFrame <- function(x, ...) {
 #' `$select()` call. For instance, if you create a variable `x`, you will only
 #' be able to use it in another `$select()` or `$with_columns()` call.
 #'
-#' @inherit pl__LazyFrame return
+#' @inherit as_polars_lf return
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]>
 #' Name-value pairs of objects to be converted to polars [expressions][Expr]
 #' by the [as_polars_expr()] function.
@@ -185,6 +185,38 @@ lazyframe__collect <- function(
   })
 }
 
+#' Create a string representation of the query plan
+#'
+#' The query plan is read from bottom to top. When `optimized = FALSE`, the
+#' query as it was written by the user is shown. This is not what Polars runs.
+#' Instead, it applies optimizations that are displayed by default by `$explain()`.
+#' One classic example is the predicate pushdown, which applies the filter as
+#' early as possible (i.e. at the bottom of the plan).
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @inheritParams lazyframe__collect
+#' @param format The format to use for displaying the logical plan. Must be
+#' either `"plain"` (default) or `"tree"`.
+#' @param optimized Return an optimized query plan. If `TRUE` (default), the
+#' subsequent optimization flags control which optimizations run.
+#'
+#' @return A character value containing the query plan.
+#' @examples
+#' lazy_frame <- as_polars_lf(iris)
+#'
+#' # Prepare your query
+#' lazy_query <- lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa")
+#'
+#' # This is the query that was written by the user, without any optimizations
+#' # (use cat() for better printing)
+#' lazy_query$explain(optimized = FALSE) |> cat()
+#'
+#' # This is the query after `polars` optimizes it: instead of sorting first and
+#' # then filtering, it is faster to filter first and then sort the rest.
+#' lazy_query$explain() |> cat()
+#'
+#' # Also possible to see this as tree format
+#' lazy_query$explain(format = "tree") |> cat()
 lazyframe__explain <- function(
     ...,
     format = c("plain", "tree"),
@@ -200,7 +232,6 @@ lazyframe__explain <- function(
     streaming = FALSE) {
   wrap({
     check_dots_empty0(...)
-
     format <- arg_match0(format, c("plain", "tree"))
 
     if (isTRUE(optimized)) {
@@ -232,6 +263,32 @@ lazyframe__explain <- function(
   })
 }
 
+#' Cast LazyFrame column(s) to the specified dtype(s)
+#'
+#' This allows to convert all columns to a datatype or to convert only specific
+#' columns. Contrarily to the Python implementation, it is not possible to
+#' convert all columns of a specific datatype to another datatype.
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Either a datatype to which
+#' all columns will be cast, or a list where the names are column names and the
+#' values are the datatypes to convert to.
+#' @param strict If `TRUE` (default), throw an error if a cast could not be done
+#' (for instance, due to an overflow). Otherwise, return `null`.
+#'
+#' @return A LazyFrame
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = c(6, 7, 8),
+#'   ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06"))
+#' )
+#'
+#' # Cast only some columns
+#' lf$cast(foo = pl$Float32, bar = pl$UInt8)$collect()
+#'
+#' # Cast all columns to the same type
+#' lf$cast(pl$String)$collect()
 lazyframe__cast <- function(..., .strict = TRUE) {
   wrap({
     check_bool(.strict)
@@ -285,7 +342,7 @@ lazyframe__sort <- function(
 #' variable `x`, you will only be able to use it in another `$with_columns()`
 #' or `$select()` call.
 #'
-#' @inherit pl__LazyFrame return
+#' @inherit as_polars_lf return
 #' @inheritParams lazyframe__select
 #' @examples
 #' # Pass an expression to add it as a new column.
@@ -331,10 +388,29 @@ lazyframe__with_columns <- function(...) {
   })
 }
 
+#' Remove columns from the DataFrame
+#'
+#' @param  <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that should
+#' be removed from the dataframe. Accepts column selector input.
+#' @param strict Validate that all column names exist in the current schema,
+#' and throw an exception if any do not.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' # Drop columns by passing the name of those columns
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = c(6, 7, 8),
+#'   ham = c("a", "b", "c")
+#' )
+#' lf$drop("ham")$collect()
+#' lf$drop("ham", "bar")$collect()
+#'
+#' # Drop multiple columns by passing a selector
+#' lf$drop(cs$all())$collect()
 lazyframe__drop <- function(..., strict = TRUE) {
   wrap({
     check_dots_unnamed()
-
     parse_into_list_of_expressions(...) |>
       self$`_ldf`$drop(strict)
   })
@@ -367,7 +443,7 @@ lazyframe__tail <- function(n = 5) {
 #' as_polars_lf(mtcars)$first()$collect()
 lazyframe__first <- function() {
   wrap({
-    self$`_rexpr`$first()
+    self$`_ldf`$first()
   })
 }
 
@@ -376,107 +452,109 @@ lazyframe__first <- function() {
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$last()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$last()$collect()
 lazyframe__last <- function() {
   wrap({
-    self$`_rexpr`$last()
+    self$`_ldf`$last()
   })
 }
 
-#' Max
-#' @description Aggregate the columns in the LazyFrame to their maximum value.
+#' Aggregate the columns in the LazyFrame to their maximum value
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$max()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$max()$collect()
 lazyframe__max <- function() {
   wrap({
-    self$`_rexpr`$max()
+    self$`_ldf`$max()
   })
 }
 
-#' Mean
-#' @description Aggregate the columns in the LazyFrame to their mean value.
+#' Aggregate the columns in the LazyFrame to their mean value
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$mean()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$mean()$collect()
 lazyframe__mean <- function() {
   wrap({
-    self$`_rexpr`$mean()
+    self$`_ldf`$mean()
   })
 }
 
-#' Median
-#' @description Aggregate the columns in the LazyFrame to their median value.
+#' Aggregate the columns in the LazyFrame to their median value
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$median()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$median()$collect()
 lazyframe__median <- function() {
   wrap({
-    self$`_rexpr`$median()
+    self$`_ldf`$median()
   })
 }
 
-#' Min
-#' @description Aggregate the columns in the LazyFrame to their minimum value.
+#' Aggregate the columns in the LazyFrame to their minimum value
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$min()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$min()$collect()
 lazyframe__min <- function() {
   wrap({
-    self$`_rexpr`$min()
+    self$`_ldf`$min()
   })
 }
 
-#' Sum
-#' @description Aggregate the columns of this LazyFrame to their sum values.
+#' Aggregate the columns of this LazyFrame to their sum values
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$sum()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$sum()$collect()
 lazyframe__sum <- function() {
   wrap({
-    self$`_rexpr`$sum()
+    self$`_ldf`$sum()
   })
 }
 
-#' Var
-#' @description Aggregate the columns of this LazyFrame to their variance values.
+#' Aggregate the columns in the LazyFrame to their variance value
 #'
 #' @inheritParams DataFrame_var
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$var()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$var()$collect()
+#' lf$var(ddof = 0)$collect()
 lazyframe__var <- function(ddof = 1) {
   wrap({
-    self$`_rexpr`$var(ddof)
+    self$`_ldf`$var(ddof)
   })
 }
 
-#' Std
-#' @description Aggregate the columns of this LazyFrame to their standard
-#' deviation values.
+#' Aggregate the columns of this LazyFrame to their standard deviation values
 #'
 #' @inheritParams DataFrame_std
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$std()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$std()$collect()
+#' lf$std(ddof = 0)$collect()
 lazyframe__std <- function(ddof = 1) {
   wrap({
-    self$`_rexpr`$std(ddof)
+    self$`_ldf`$std(ddof)
   })
 }
 
-#' Quantile
-#' @description Aggregate the columns in the DataFrame to a unique quantile
-#' value. Use `$describe()` to specify several quantiles.
+#' Aggregate the columns in the DataFrame to a unique quantile value
+#'
 #' @inheritParams DataFrame_quantile
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$quantile(.4)$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$quantile(0.7)$collect()
 lazyframe__quantile <- function(
     quantile,
     interpolation = c("nearest", "higher", "lower", "midpoint", "linear")) {
@@ -485,7 +563,7 @@ lazyframe__quantile <- function(
       interpolation,
       values = c("nearest", "higher", "lower", "midpoint", "linear")
     )
-    self$`_rexpr`$quantile(as_polars_expr(quantile, as_lit = TRUE)$`_rexpr`, interpolation)
+    self$`_ldf`$quantile(as_polars_expr(quantile, as_lit = TRUE)$`_rexpr`, interpolation)
   })
 }
 
@@ -500,7 +578,7 @@ lazyframe__quantile <- function(
 #' df$fill_nan(99)$collect()
 lazyframe__fill_nan <- function(value) {
   wrap({
-    self$`_rexpr`$fill_nan(value)
+    self$`_ldf`$fill_nan(value)
   })
 }
 
@@ -515,7 +593,7 @@ lazyframe__fill_nan <- function(value) {
 #' df$fill_null(99)$collect()
 lazyframe__fill_null <- function(fill_value) {
   wrap({
-    self$`_rexpr`$fill_null(wrap_e_result(fill_value))
+    self$`_ldf`$fill_null(wrap_e_result(fill_value))
   })
 }
 
@@ -533,26 +611,7 @@ lazyframe__fill_null <- function(fill_value) {
 #'
 #' lf$shift(-2, fill_value = 100)$collect()
 lazyframe__shift <- function(n = 1, fill_value = NULL) {
-  self$`_rexpr`$shift(n, fill_value)
-}
-
-#' Drop columns of a LazyFrame
-#'
-#' @inheritParams DataFrame_drop
-#'
-#' @inherit as_polars_lf return
-#' @examples
-#' as_polars_lf(mtcars)$drop(c("mpg", "hp"))$collect()
-#'
-#' # equivalent
-#' as_polars_lf(mtcars)$drop("mpg", "hp")$collect()
-lazyframe__drop <- function(..., strict = TRUE) {
-  cols <- unpack_list(..., .context = "in $drop():") |>
-    unlist()
-  if (length(cols) == 0) {
-    return(self)
-  }
-  self$`_rexpr`$drop(cols, strict)
+  self$`_ldf`$shift(n, fill_value)
 }
 
 #' Reverse
@@ -563,7 +622,7 @@ lazyframe__drop <- function(..., strict = TRUE) {
 #' as_polars_lf(mtcars)$reverse()$collect()
 lazyframe__reverse <- function() {
   wrap({
-    self$`_rexpr`$reverse()
+    self$`_ldf`$reverse()
   })
 }
 
@@ -577,7 +636,7 @@ lazyframe__reverse <- function() {
 #' mtcars[2:6, ]
 lazyframe__slice <- function(offset, length = NULL) {
   wrap({
-    self$`_rexpr`$slice(offset, length)
+    self$`_ldf`$slice(offset, length)
   })
 }
 
@@ -594,28 +653,38 @@ lazyframe__slice <- function(offset, length = NULL) {
 #' lf$tail(2)$collect()
 lazyframe__tail <- function(n = 5L) {
   wrap({
-    self$`_rexpr`$tail(n)
+    self$`_ldf`$tail(n)
   })
 }
 
-#' @inherit DataFrame_drop_nulls title description params
+#' Drop all rows that contain null values
+#'
+#' The original order of the remaining rows is preserved.
+#'
+#' @param subset Column name(s) for which null values are considered. If `NULL`
+#' (default), use all columns.
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' tmp <- mtcars
-#' tmp[1:3, "mpg"] <- NA
-#' tmp[4, "hp"] <- NA
-#' tmp <- pl$LazyFrame(tmp)
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = c(6, NA, 8),
+#'   ham = c("a", "b", NA)
+#' )
 #'
-#' # number of rows in `tmp` before dropping nulls
-#' tmp$collect()$height
+#' # The default behavior of this method is to drop rows where any single value
+#' # of the row is null.
+#' lf$drop_nulls()$collect()
 #'
-#' tmp$drop_nulls()$collect()$height
-#' tmp$drop_nulls("mpg")$collect()$height
-#' tmp$drop_nulls(c("mpg", "hp"))$collect()$height
+#' # This behaviour can be constrained to consider only a subset of columns, as
+#' # defined by name or with a selector. For example, dropping rows if there is
+#' # a null in any of the integer columns:
+#' lf$drop_nulls(subset = cs$integer())$collect()
 lazyframe__drop_nulls <- function(subset = NULL) {
-  if (!is.null(subset)) subset <- as.list(subset)
-  self$`_rexpr`$drop_nulls(subset)
+  wrap({
+    subset <- parse_into_list_of_expressions(!!!subset)
+    self$`_ldf`$drop_nulls(subset)
+  })
 }
 
 #' @inherit DataFrame_unique title description params
@@ -641,7 +710,7 @@ lazyframe__unique <- function(
     keep = "any",
     maintain_order = FALSE) {
   wrap({
-    self$`_rexpr`$unique(subset, keep, maintain_order)
+    self$`_ldf`$unique(subset, keep, maintain_order)
   })
 }
 
@@ -683,7 +752,7 @@ lazyframe__unique <- function(
 #'   pl$col("c")$mean()
 #' )$collect()
 lazyframe__group_by <- function(..., maintain_order = polars_options()$maintain_order) {
-  self$`_rexpr`$group_by(unpack_list(..., .context = "in $group_by():"), maintain_order)
+  self$`_ldf`$group_by(unpack_list(..., .context = "in $group_by():"), maintain_order)
 }
 
 #' Join LazyFrames
@@ -785,7 +854,7 @@ lazyframe__join <- function(
     }
   }
 
-  self$`_rexpr`$join(
+  self$`_ldf`$join(
     lf, other, rexprs_left, rexprs_right, how, validate, join_nulls, suffix,
     allow_parallel, force_parallel, coalesce
   ) |>
@@ -835,62 +904,12 @@ lazyframe__join_where <- function(
     other,
     ...,
     suffix = "_right") {
-  uw <- \(res) wrap({
-    res
+  wrap({
+    check_polars_lf(other)
+    self$`_ldf`$join_where(other, unpack_list(..., .context = "in $join_where():"), suffix)
   })
-
-
-  if (!is_polars_lf(other)) {
-    Err_plain("`other` must be a LazyFrame.") |> uw()
-  }
-
-  self$`_rexpr`$join_where(lf, other, unpack_list(..., .context = "in $join_where():"), suffix) |>
-    uw()
 }
 
-
-
-#' Sort the LazyFrame by the given columns
-#'
-#' @inheritParams Series_sort
-#' @param by Column(s) to sort by. Can be character vector of column names,
-#' a list of Expr(s) or a list with a mix of Expr(s) and column names.
-#' @param ... More columns to sort by as above but provided one Expr per argument.
-#' @param descending Logical. Sort in descending order (default is `FALSE`). This must be
-#' either of length 1 or a logical vector of the same length as the number of
-#' Expr(s) specified in `by` and `...`.
-#' @param nulls_last A logical or logical vector of the same length as the number of columns.
-#' If `TRUE`, place `null` values last insead of first.
-#' @param maintain_order Whether the order should be maintained if elements are
-#' equal. If `TRUE`, streaming is not possible and performance might be worse
-#' since this requires a stable search.
-#' @inherit as_polars_lf return
-#' @keywords  LazyFrame
-#' @examples
-#' df <- mtcars
-#' df$mpg[1] <- NA
-#' df <- pl$LazyFrame(df)
-#' df$sort("mpg")$collect()
-#' df$sort("mpg", nulls_last = TRUE)$collect()
-#' df$sort("cyl", "mpg")$collect()
-#' df$sort(c("cyl", "mpg"))$collect()
-#' df$sort(c("cyl", "mpg"), descending = TRUE)$collect()
-#' df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE))$collect()
-#' df$sort(pl$col("cyl"), pl$col("mpg"))$collect()
-lazyframe__sort <- function(
-    by,
-    ...,
-    descending = FALSE,
-    nulls_last = FALSE,
-    maintain_order = FALSE,
-    multithreaded = TRUE) {
-  self$`_rexpr`$sort_by_exprs(
-    lf, wrap_elist_result(by, str_to_lit = FALSE), err_on_named_args(...),
-    descending, nulls_last, maintain_order, multithreaded
-  )
-}
-
-
 #' Perform joins on nearest keys
 #'
 #' This is similar to a left-join except that we match on nearest key rather
@@ -984,7 +1003,7 @@ lazyframe__join_asof <- function(
   tolerance_str <- if (is.character(tolerance)) tolerance else NULL
   tolerance_num <- if (!is.character(tolerance)) tolerance else NULL
 
-  self$`_rexpr`$join_asof(
+  self$`_ldf`$join_asof(
     lf = self,
     other = other,
     left_on = left_on,
@@ -1038,7 +1057,7 @@ lazyframe__unpivot <- function(
     index = NULL,
     variable_name = NULL,
     value_name = NULL) {
-  self$`_rexpr`$unpivot(
+  self$`_ldf`$unpivot(
     lf, on %||% character(), index %||% character(),
     value_name, variable_name
   ) |> unwrap("in $unpivot( ): ")
@@ -1092,7 +1111,7 @@ lazyframe__rename <- function(...) {
     new <- unname(unlist(mapping))
     existing <- names(mapping)
   }
-  self$`_rexpr`$rename(existing, new) |>
+  self$`_ldf`$rename(existing, new) |>
     uw()
 }
 
@@ -1123,11 +1142,11 @@ lazyframe__rename <- function(...) {
 #'
 #' @examples
 #' # fetch 3 rows
-#' pl$LazyFrame(iris)$fetch(3)
+#' as_polars_lf(iris)$fetch(3)
 #'
 #' # this fetch-query returns 4 rows, because we started with 3 and appended one
 #' # row in the query (see section 'Details')
-#' pl$LazyFrame(iris)$
+#' as_polars_lf(iris)$
 #'   select(pl$col("Species")$append("flora gigantica, alien"))$
 #'   fetch(3)
 lazyframe__fetch <- function(
@@ -1157,7 +1176,7 @@ lazyframe__fetch <- function(
   }
 
   lf <- self |>
-    self$`_rexpr`$optimization_toggle(
+    self$`_ldf`$optimization_toggle(
       pe_coercion = type_coercion,
       predicate_pushdown = predicate_pushdown,
       projection_pushdown = projection_pushdown,
@@ -1170,7 +1189,7 @@ lazyframe__fetch <- function(
       eager = FALSE
     )
 
-  self$`_rexpr`$fetch(n_rows)
+  self$`_ldf`$fetch(n_rows)
 }
 
 #' Collect and profile a lazy query.
@@ -1205,7 +1224,7 @@ lazyframe__fetch <- function(
 #' ## Use $profile() to compare two queries
 #'
 #' # -1-  map each Species-group with native polars, takes ~120us only
-#' pl$LazyFrame(iris)$
+#' as_polars_lf(iris)$
 #'   sort("Sepal.Length")$
 #'   group_by("Species", maintain_order = TRUE)$
 #'   agg(pl$col(pl$Float64)$first() + 5)$
@@ -1219,7 +1238,7 @@ lazyframe__fetch <- function(
 #'   s$to_r()[1] + 5
 #' }
 #'
-#' pl$LazyFrame(iris)$
+#' as_polars_lf(iris)$
 #'   sort("Sepal.Length")$
 #'   group_by("Species", maintain_order = TRUE)$
 #'   agg(pl$col(pl$Float64)$map_elements(r_func))$
@@ -1252,7 +1271,7 @@ lazyframe__profile <- function(
   }
 
   lf <- self |>
-    self$`_rexpr`$optimization_toggle(
+    self$`_ldf`$optimization_toggle(
       pe_coercion = type_coercion,
       predicate_pushdown = predicate_pushdown,
       projection_pushdown = projection_pushdown,
@@ -1266,7 +1285,7 @@ lazyframe__profile <- function(
     )
 
   out <- lf |>
-    self$`_rexpr`$profile() >
+    self$`_ldf`$profile() >
     unwrap("in $profile()")
 
   if (isTRUE(show_plot)) {
@@ -1278,44 +1297,26 @@ lazyframe__profile <- function(
   out
 }
 
-#' Explode columns containing a list of values
-#' @description This will take every element of a list column and add it on an
-#' additional row.
-#'
+#' Explode the DataFrame to long format by exploding the given columns
 #'
-#'
-#' @param ... Column(s) to be exploded as individual `Into<Expr>` or list/vector
-#' of `Into<Expr>`. In a handful of places in rust-polars, only the plain variant
-#' `Expr::Column` is accepted. This is currenly one of such places. Therefore
-#' `pl$col("name")` and `pl$all()` is allowed, not `pl$col("name")$alias("newname")`.
-#' `"name"` is implicitly converted to `pl$col("name")`.
-#'
-#' @details
-#' Only columns of DataType `List` or `Array` can be exploded.
-#'
-#' Named expressions like `$explode(a = pl$col("b"))` will not implicitly trigger
-#' `$alias("a")` here, due to only variant `Expr::Column` is supported in
-#' rust-polars.
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column names, expressions, or
+#' a selector defining them. The underlying columns being exploded must be of
+#' the `List` or `Array` data type.
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' df <- pl$LazyFrame(
-#'   letters = c("aa", "aa", "bb", "cc"),
-#'   numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)),
-#'   numbers_2 = list(0, c(1, 2), c(3, 4), c(5, 6, 7)) # same structure as numbers
+#' lf <- pl$LazyFrame(
+#'   letters = c("a", "a", "b", "c"),
+#'   numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8))
 #' )
-#' df
 #'
-#' # explode a single column, append others
-#' df$explode("numbers")$collect()
-#'
-#' # explode two columns of same nesting structure, by names or the common dtype
-#' # "List(Float64)"
-#' df$explode("numbers", "numbers_2")$collect()
-#' df$explode(pl$col(pl$List(pl$Float64)))$collect()
+#' lf$explode("numbers")$collect()
 lazyframe__explode <- function(...) {
-  dotdotdot_args <- unpack_list(..., .context = "in explode():")
-  self$`_rexpr`$explode(dotdotdot_args)
+  wrap({
+    check_dots_unnamed()
+    by <- parse_into_list_of_expressions(...)
+    self$`_ldf`$explode(by)
+  })
 }
 
 #' Clone a LazyFrame
@@ -1328,7 +1329,7 @@ lazyframe__explode <- function(...) {
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' df1 <- pl$LazyFrame(iris)
+#' df1 <- as_polars_lf(iris)
 #'
 #' # Make a function to take a LazyFrame, add an attribute, and return a LazyFrame
 #' give_attr <- function(data) {
@@ -1346,13 +1347,13 @@ lazyframe__explode <- function(...) {
 #'   attr(data, "created_on") <- "2024-01-29"
 #'   data
 #' }
-#' df1 <- pl$LazyFrame(iris)
+#' df1 <- as_polars_lf(iris)
 #' df2 <- give_attr(df1)
 #'
 #' # now, the original LazyFrame doesn't get this attribute
 #' attributes(df1)
 lazyframe__clone <- function() {
-  self$`_rexpr`$clone_in_rust()
+  self$`_ldf`$clone_in_rust()
 }
 
 
@@ -1381,12 +1382,12 @@ lazyframe__clone <- function() {
 lazyframe__unnest <- function(...) {
   columns <- unpack_list(..., .context = "in $unnest():")
   if (length(columns) == 0) {
-    columns <- names(which(dtypes_are_struct(self$`_rexpr`$schema(ok))))
+    columns <- names(which(dtypes_are_struct(self$`_ldf`$schema(ok))))
   } else {
     columns <- unlist(columns)
   }
   wrap({
-    self$`_rexpr`$unnest(columns)
+    self$`_ldf`$unnest(columns)
   })
 }
 
@@ -1419,7 +1420,7 @@ lazyframe__unnest <- function(...) {
 #'   pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median())
 #' )$collect()
 lazyframe__with_context <- function(other) {
-  self$`_rexpr`$with_context(other)
+  self$`_ldf`$with_context(other)
 }
 
 
@@ -1466,7 +1467,7 @@ lazyframe__rolling <- function(
     group_by = NULL) {
   period <- parse_as_polars_duration_string(period)
   offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(period)
-  self$`_rexpr`$rolling(
+  self$`_ldf`$rolling(
     lf, index_column, period, offset, closed,
     wrap_elist_result(group_by, str_to_lit = FALSE)
   )
@@ -1576,7 +1577,7 @@ lazyframe__group_by_dynamic <- function(
   offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(every)
   period <- parse_as_polars_duration_string(period) %||% every
 
-  self$`_rexpr`$group_by_dynamic(
+  self$`_ldf`$group_by_dynamic(
     lf, index_column, every, period, offset, label, include_boundaries, closed,
     wrap_elist_result(group_by, str_to_lit = FALSE), start_by
   )
@@ -1623,7 +1624,7 @@ lazyframe__to_dot <- function(
     cluster_with_columns = TRUE,
     streaming = FALSE) {
   lf <- self |>
-    self$`_rexpr`$optimization_toggle(
+    self$`_ldf`$optimization_toggle(
       pe_coercion = type_coercion,
       predicate_pushdown = predicate_pushdown,
       projection_pushdown = projection_pushdown,
@@ -1636,7 +1637,7 @@ lazyframe__to_dot <- function(
       eager = FALSE
     )
 
-  self$`_rexpr`$to_dot(optimized)
+  self$`_ldf`$to_dot(optimized)
 }
 
 #' Create an empty or n-row null-filled copy of the LazyFrame
@@ -1644,7 +1645,7 @@ lazyframe__to_dot <- function(
 #' Returns a n-row null-filled LazyFrame with an identical schema. `n` can be
 #' greater than the current number of rows in the LazyFrame.
 #'
-#' @inheritParams DataFrame_clear
+#' @param n Number of (empty) rows to return in the cleared frame.
 #'
 #' @return A n-row null-filled LazyFrame with an identical schema
 #'
@@ -1744,52 +1745,97 @@ lazyframe__gather_every <- function(n, offset = 0) {
   self$select(pl$col("*")$gather_every(n, offset))
 }
 
+#' Return the number of non-null elements for each column
+#'
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4))
+#' lf$count()$collect()
+lazyframe__count <- function() {
+  wrap({
+    self$`_ldf`$count()
+  })
+}
 
-#' Cast LazyFrame column(s) to the specified dtype
+#' Return the number of null elements for each column
 #'
-#' This allows to convert all columns to a datatype or to convert only specific
-#' columns. Contrarily to the Python implementation, it is not possible to
-#' convert all columns of a specific datatype to another datatype.
+#' @inherit as_polars_lf return
 #'
-#' @param dtypes Either a datatype or a list where the names are column names and
-#' the values are the datatypes to convert to.
-#' @param ... Ignored.
-#' @param strict If `TRUE` (default), throw an error if a cast could not be done
-#' (for instance, due to an overflow). Otherwise, return `null`.
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4))
+#' lf$null_count()$collect()
+lazyframe__null_count <- function() {
+  wrap({
+    self$`_ldf`$null_count()
+  })
+}
+
+#' Return the `k` smallest rows
+#'
+#' @description
+#' Non-null elements are always preferred over null elements, regardless of the
+#' value of `reverse`. The output is not guaranteed to be in any particular
+#' order, call `sort()` after this function if you wish the output to be sorted.
+#'
+#' @inheritParams rlang::check_dots_empty
+#' @param k Number of rows to return.
+#' @param by Column(s) used to determine the bottom rows. Accepts expression
+#' input. Strings are parsed as column names.
+#' @param reverse Consider the `k` largest elements of the by column(s)
+#' (instead of the k smallest). This can be specified per column by passing a
+#' sequence of booleans.
 #'
 #' @inherit as_polars_lf return
 #'
 #' @examples
 #' lf <- pl$LazyFrame(
-#'   foo = 1:3,
-#'   bar = c(6, 7, 8),
-#'   ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06"))
+#'   a = c("a", "b", "a", "b", "b", "c"),
+#'   b = c(2, 1, 1, 3, 2, 1)
 #' )
 #'
-#' # Cast only some columns
-#' lf$cast(list(foo = pl$Float32, bar = pl$UInt8))$collect()
+#' # Get the rows which contain the 4 smallest values in column b.
+#' lf$bottom_k(4, by = "b")$collect()
 #'
-#' # Cast all columns to the same type
-#' lf$cast(pl$String)$collect()
-lazyframe__cast <- function(dtypes, ..., strict = TRUE) {
-  if (!is.list(dtypes)) {
-    self$`_rexpr`$cast_all(dtype = dtypes, strict = strict) |>
-      unwrap("in $cast():")
-  } else {
-    self$`_rexpr`$cast(dtypes = dtypes, strict = strict) |>
-      unwrap("in $cast():")
-  }
+#' # Get the rows which contain the 4 smallest values when sorting on column a
+#' # and b$
+#' lf$bottom_k(4, by = c("a", "b"))$collect()
+lazyframe__bottom_k <- function(k, ..., by, reverse = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+    by <- parse_into_list_of_expressions(!!!by)
+    reverse <- extend_bool(reverse, length(by), "reverse", "...")
+    self$`_ldf`$bottom_k(k, by, reverse)
+  })
 }
 
-#' Return the number of non-null elements for each column
+#' Return the `k` largest rows
 #'
+#' @inherit lazyframe__bottom_k description params
+#' @inheritParams rlang::check_dots_empty0
+#' @param reverse Consider the `k` smallest elements of the `by` column(s)
+#' (instead of the `k` largest). This can be specified per column by passing a
+#' sequence of booleans.
+
 #' @inherit as_polars_lf return
 #'
 #' @examples
-#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4))
-#' lf$count()$collect()
-lazyframe__count <- function() {
+#' lf <- pl$LazyFrame(
+#'   a = c("a", "b", "a", "b", "b", "c"),
+#'   b = c(2, 1, 1, 3, 2, 1)
+#' )
+#'
+#' # Get the rows which contain the 4 largest values in column b.
+#' lf$top_k(4, by = "b")$collect()
+#'
+#' # Get the rows which contain the 4 largest values when sorting on column a
+#' # and b$
+#' lf$top_k(4, by = c("a", "b"))$collect()
+lazyframe__top_k <- function(k, ..., by, reverse = FALSE) {
   wrap({
-    self$`_rexpr`$count()
+    check_dots_empty0(...)
+    by <- parse_into_list_of_expressions(!!!by)
+    reverse <- extend_bool(reverse, length(by), "reverse", "...")
+    self$`_ldf`$top_k(k, by, reverse)
   })
 }
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 85116775..095c822b 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -648,13 +648,15 @@ impl PlRLazyFrame {
         Ok(out.into())
     }
 
-    fn std(&self, ddof: u8) -> Result<PlRLazyFrame> {
+    fn std(&self, ddof: NumericScalar) -> Result<PlRLazyFrame> {
+        let ddof = <Wrap<u8>>::try_from(ddof)?.0;
         let ldf = self.ldf.clone();
         let out = ldf.std(ddof);
         Ok(out.into())
     }
 
-    fn var(&self, ddof: u8) -> Result<PlRLazyFrame> {
+    fn var(&self, ddof: NumericScalar) -> Result<PlRLazyFrame> {
+        let ddof = <Wrap<u8>>::try_from(ddof)?.0;
         let ldf = self.ldf.clone();
         let out = ldf.var(ddof);
         Ok(out.into())

From c89f35c8a3157a3765a317cfb7bd0098ee0dd6ed Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 10:59:54 +0100
Subject: [PATCH 05/71] more

---
 R/lazyframe-frame.R | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 7c0a8b18..23e7cf27 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -567,18 +567,18 @@ lazyframe__quantile <- function(
   })
 }
 
-#' @inherit Expr_fill_nan title params
+#' @inherit expr__fill_nan title params
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' df <- pl$LazyFrame(
+#' lf <- pl$LazyFrame(
 #'   a = c(1.5, 2, NaN, 4),
 #'   b = c(1.5, NaN, NaN, 4)
 #' )
-#' df$fill_nan(99)$collect()
+#' lf$fill_nan(99)$collect()
 lazyframe__fill_nan <- function(value) {
   wrap({
-    self$`_ldf`$fill_nan(value)
+    self$`_ldf`$fill_nan(as_polars_expr(value)$`_rexpr`)
   })
 }
 
@@ -586,14 +586,14 @@ lazyframe__fill_nan <- function(value) {
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' df <- pl$LazyFrame(
+#' lf <- pl$LazyFrame(
 #'   a = c(1.5, 2, NA, 4),
 #'   b = c(1.5, NA, NA, 4)
 #' )
-#' df$fill_null(99)$collect()
+#' lf$fill_null(99)$collect()
 lazyframe__fill_null <- function(fill_value) {
   wrap({
-    self$`_ldf`$fill_null(wrap_e_result(fill_value))
+    self$`_ldf`$fill_null(as_polars_expr(fill_value)$`_rexpr`)
   })
 }
 
@@ -720,7 +720,7 @@ lazyframe__unique <- function(
 #' (`$agg()`, `$filter()`, etc.).
 #'
 #' @param ... Column(s) to group by.
-#' Accepts [expression][Expr_class] input. Characters are parsed as column names.
+#' Accepts [expression][expr__class] input. Characters are parsed as column names.
 #' @param maintain_order Ensure that the order of the groups is consistent with the input data.
 #' This is slower than a default group by.
 #' Setting this to `TRUE` blocks the possibility to run on the streaming engine.
@@ -1426,7 +1426,7 @@ lazyframe__with_context <- function(other) {
 
 #' Create rolling groups based on a date/time or integer column
 #'
-#' @inherit Expr_rolling description details params
+#' @inherit expr__rolling description details params
 #' @param index_column Column used to group based on the time window. Often of
 #' type Date/Datetime. This column must be sorted in ascending order (or, if `by`
 #' is specified, then it must be sorted in ascending order within each group). In

From eb018394a9806c7ecfc8b0d476998d75e31d8f6b Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 11:44:32 +0100
Subject: [PATCH 06/71] more

---
 R/lazyframe-frame.R | 121 ++++++++++++++++++++++++++------------------
 src/rust/Cargo.toml |   1 +
 2 files changed, 73 insertions(+), 49 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 23e7cf27..5b2222db 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -102,6 +102,38 @@ lazyframe__select <- function(...) {
   })
 }
 
+#' Start a group by operation
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column(s) to group by.
+#' Accepts expression input. Strings are parsed as column names.
+#' @param .maintain_order Ensure that the order of the groups is consistent with
+#' the input data. This is slower than a default group by. Setting this to
+#' `TRUE` blocks the possibility to run on the streaming engine.
+#'
+# TODO: need a proper definition to link to
+#' @return A lazy groupby
+#' @examples
+#' # Group by one column and call agg() to compute the grouped sum of another
+#' # column.
+#' lf <- pl$LazyFrame(
+#'   a = c("a", "b", "a", "b", "c"),
+#'   b = c(1, 2, 1, 3, 3),
+#'   c = c(5, 4, 3, 2, 1)
+#' )
+#' lf$group_by("a")$agg(pl$col("b")$sum())$collect()
+#'
+#' # Set .maintain_order = TRUE to ensure the order of the groups is consistent
+#' # with the input.
+#' lf$group_by("a", .maintain_order = TRUE)$agg(pl$col("b")$sum())$collect()
+#'
+#' # Group by multiple columns by passing a vector of column names.
+#' lf$group_by(c("a", "b"))$agg(pl$col("c")$max())$collect()
+#' 
+#' # Or use positional arguments to group by multiple columns in the same way. 
+#' # Expressions are also accepted.
+#' lf$
+#'   group_by("a", pl$col("b") / 2)$
+#'   agg(pl$col("c")$mean())$collect()
 lazyframe__group_by <- function(..., .maintain_order = FALSE) {
   wrap({
     exprs <- parse_into_list_of_expressions(...)
@@ -302,6 +334,38 @@ lazyframe__cast <- function(..., .strict = TRUE) {
   })
 }
 
+#' Filter the rows in the LazyFrame based on a predicate expression
+#'
+#' The original order of the remaining rows is preserved. Rows where the filter
+#' does not evaluate to `TRUE` are discarded, including nulls.
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Expression that evaluates to
+#' a boolean Series.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = c(1, 2, 3, NA, 4, NA, 0),
+#'   bar = c(6, 7, 8, NA, NA, 9, 0),
+#'   ham = c("a", "b", "c", NA, "d", "e", "f")
+#' )
+#'
+#' # Filter on one condition
+#' lf$filter(pl$col("foo") > 1)$collect()
+#'
+#' # Filter on multiple conditions
+#' lf$filter((pl$col("foo") < 3) & (pl$col("ham") == "a"))$collect()
+#'
+#' # Filter on an OR condition
+#' lf$filter((pl$col("foo") == 1) | (pl$col("ham") == " c"))$collect()
+#'
+#' # Filter by comparing two columns against each other
+#' lf$filter(pl$col("foo") == pl$col("bar"))$collect()
+#' lf$filter(pl$col("foo") != pl$col("bar"))$collect()
+#'
+#' # Notice how the row with null values is filtered out$ In order to keep the
+#' # rows with nulls, use:
+#' lf$filter(pl$col("foo")$ne_missing(pl$col("bar")))$collect()
 lazyframe__filter <- function(...) {
   parse_predicates_constraints_into_expression(...) |>
     self$`_ldf`$filter() |>
@@ -390,8 +454,8 @@ lazyframe__with_columns <- function(...) {
 
 #' Remove columns from the DataFrame
 #'
-#' @param  <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that should
-#' be removed from the dataframe. Accepts column selector input.
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that
+#' should be removed from the dataframe. Accepts column selector input.
 #' @param strict Validate that all column names exist in the current schema,
 #' and throw an exception if any do not.
 #'
@@ -436,19 +500,19 @@ lazyframe__tail <- function(n = 5) {
 }
 
 
-#' Get the first row of a LazyFrame
+#' Get the first row of the LazyFrame
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$first()$collect()
+#' lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+#' lf$first()$collect()
 lazyframe__first <- function() {
   wrap({
-    self$`_ldf`$first()
+    self$slice(0, 1)
   })
 }
 
-#' Get the last row of a LazyFrame
-#' @description Aggregate the columns in the LazyFrame to their maximum value.
+#' Get the last row of the LazyFrame
 #'
 #' @inherit as_polars_lf return
 #' @examples
@@ -456,7 +520,7 @@ lazyframe__first <- function() {
 #' lf$last()$collect()
 lazyframe__last <- function() {
   wrap({
-    self$`_ldf`$last()
+    self$tail(1)
   })
 }
 
@@ -714,47 +778,6 @@ lazyframe__unique <- function(
   })
 }
 
-#' Group a LazyFrame
-#' @description This doesn't modify the data but only stores information about
-#' the group structure. This structure can then be used by several functions
-#' (`$agg()`, `$filter()`, etc.).
-#'
-#' @param ... Column(s) to group by.
-#' Accepts [expression][expr__class] input. Characters are parsed as column names.
-#' @param maintain_order Ensure that the order of the groups is consistent with the input data.
-#' This is slower than a default group by.
-#' Setting this to `TRUE` blocks the possibility to run on the streaming engine.
-#' The default value can be changed with `options(polars.maintain_order = TRUE)`.
-#' @return [LazyGroupBy][LazyGroupBy_class] (a LazyFrame with special groupby methods like `$agg()`)
-#' @examples
-#' lf <- pl$LazyFrame(
-#'   a = c("a", "b", "a", "b", "c"),
-#'   b = c(1, 2, 1, 3, 3),
-#'   c = c(5, 4, 3, 2, 1)
-#' )
-#'
-#' lf$group_by("a")$agg(pl$col("b")$sum())$collect()
-#'
-#' # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input.
-#' lf$group_by("a", maintain_order = TRUE)$agg(pl$col("c"))$collect()
-#'
-#' # Group by multiple columns by passing a list of column names.
-#' lf$group_by(c("a", "b"))$agg(pl$max("c"))$collect()
-#'
-#' # Or pass some arguments to group by multiple columns in the same way.
-#' # Expressions are also accepted.
-#' lf$group_by("a", pl$col("b") %/% 2)$agg(
-#'   pl$col("c")$mean()
-#' )$collect()
-#'
-#' # The columns will be renamed to the argument names.
-#' lf$group_by(d = "a", e = pl$col("b") %/% 2)$agg(
-#'   pl$col("c")$mean()
-#' )$collect()
-lazyframe__group_by <- function(..., maintain_order = polars_options()$maintain_order) {
-  self$`_ldf`$group_by(unpack_list(..., .context = "in $group_by():"), maintain_order)
-}
-
 #' Join LazyFrames
 #'
 #' This function can do both mutating joins (adding columns based on matching
diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml
index 9378f879..e596ceae 100644
--- a/src/rust/Cargo.toml
+++ b/src/rust/Cargo.toml
@@ -63,6 +63,7 @@ features = [
     "pivot",
     "propagate_nans",
     "range",
+    "round_series",
     "semi_anti_join",
     "serde",
     "serde-lazy",

From c56a28c04491cd5a00aa2c5004aef1e52b528c22 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 14:27:48 +0100
Subject: [PATCH 07/71] joins

---
 R/check_polars.R    |   2 +-
 R/lazyframe-frame.R | 308 +++++++++++++++++++++++++++++++-------------
 src/rust/Cargo.toml |   1 +
 3 files changed, 220 insertions(+), 91 deletions(-)

diff --git a/R/check_polars.R b/R/check_polars.R
index 4a5a95fe..9cd32a8f 100644
--- a/R/check_polars.R
+++ b/R/check_polars.R
@@ -296,7 +296,7 @@ check_date_or_datetime <- function(
     arg = caller_arg(x),
     call = caller_env()) {
   if (!missing(x)) {
-    if (inherits(x, c("Date", "POSIXct", "polars_expr"))) {
+    if (inherits(x, c("Date", "POSIXct", "POSIXlt", "polars_expr"))) {
       return(invisible(NULL))
     }
     if (allow_null && is_null(x)) {
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 5b2222db..e88e4b69 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -128,8 +128,8 @@ lazyframe__select <- function(...) {
 #'
 #' # Group by multiple columns by passing a vector of column names.
 #' lf$group_by(c("a", "b"))$agg(pl$col("c")$max())$collect()
-#' 
-#' # Or use positional arguments to group by multiple columns in the same way. 
+#'
+#' # Or use positional arguments to group by multiple columns in the same way.
 #' # Expressions are also accepted.
 #' lf$
 #'   group_by("a", pl$col("b") / 2)$
@@ -489,11 +489,27 @@ lazyframe__slice <- function(offset, length = NULL) {
   })
 }
 
+#' Get the first `n` rows
+#'
+#' @param n Number of rows to return.
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+#' lf$head()$collect()
+#' lf$head(2)$collect()
 lazyframe__head <- function(n = 5) {
   self$slice(0, n) |>
     wrap()
 }
 
+#' Get the last `n` rows
+#'
+#' @inheritParams lazyframe__head
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+#' lf$tail()$collect()
+#' lf$tail(2)$collect()
 lazyframe__tail <- function(n = 5) {
   self$`_ldf`$tail(n) |>
     wrap()
@@ -785,13 +801,24 @@ lazyframe__unique <- function(
 #' observations based on matching observations, for example with `how =
 #' "inner"`).
 #'
+#' @inheritParams rlang::check_dots_empty0
 #' @param other LazyFrame to join with.
 #' @param on Either a vector of column names or a list of expressions and/or
 #'   strings. Use `left_on` and `right_on` if the column names to match on are
 #'   different between the two DataFrames.
-#' @param how One of the following methods: "inner", "left", "right", "full",
-#'   "semi", "anti", "cross".
-#' @param ... Ignored.
+#' @param how One of the following methods:
+#' * "inner": returns rows that have matching values in both tables
+#' * "left": returns all rows from the left table, and the matched rows from
+#'   the right table
+#' * "right": returns all rows from the right table, and the matched rows from
+#'   the left table
+#' * "full": returns all rows when there is a match in either left or right
+#'   table
+#' * "cross": returns the Cartesian product of rows from both tables
+#' * "semi": returns rows from the left table that have a match in the right
+#'   table.
+#' * "anti": returns rows from the left table that have no match in the right
+#'   table.
 #' @param left_on,right_on Same as `on` but only for the left or the right
 #'   DataFrame. They must have the same length.
 #' @param suffix Suffix to add to duplicated column names.
@@ -802,8 +829,7 @@ lazyframe__unique <- function(
 #' * `"1:m"`: one-to-many, check if join keys are unique in left dataset
 #' * `"m:1"`: many-to-one, check if join keys are unique in right dataset
 #'
-#' Note that this is currently not supported by the streaming engine, and is
-#' only supported when joining by single columns.
+#' Note that this is currently not supported by the streaming engine.
 #'
 #' @param join_nulls Join on null values. By default null values will never
 #'   produce matches.
@@ -815,29 +841,29 @@ lazyframe__unique <- function(
 #' - `NULL`: join specific.
 #' - `TRUE`: Always coalesce join columns.
 #' - `FALSE`: Never coalesce join columns.
+#' Note that joining on any other expressions than `col` will turn off
+#' coalescing.
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' # inner join by default
-#' df1 <- pl$LazyFrame(list(key = 1:3, payload = c("f", "i", NA)))
-#' df2 <- pl$LazyFrame(list(key = c(3L, 4L, 5L, NA_integer_)))
-#' df1$join(other = df2, on = "key")
-#'
-#' # cross join
-#' df1 <- pl$LazyFrame(x = letters[1:3])
-#' df2 <- pl$LazyFrame(y = 1:4)
-#' df1$join(other = df2, how = "cross")
-#'
-#' # use "validate" to ensure join keys are not duplicated
-#' df1 <- pl$LazyFrame(x = letters[1:5], y = 1:5)
-#' df2 <- pl$LazyFrame(x = c("a", letters[1:4]), y2 = 6:10)
-#'
-#' # this throws an error because there are two keys in df2 that match the key
-#' # in df1
-#' tryCatch(
-#'   df1$join(df2, on = "x", validate = "1:1")$collect(),
-#'   error = function(e) print(e)
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = c(6, 7, 8),
+#'   ham = c("a", "b", "c")
+#' )
+#' other_lf <- pl$LazyFrame(
+#'   apple = c("x", "y", "z"),
+#'   ham = c("a", "b", "d")
 #' )
+#' lf$join(other_lf, on = "ham")$collect()
+#'
+#' lf$join(other_lf, on = "ham", how = "full")$collect()
+#'
+#' lf$join(other_lf, on = "ham", how = "left", coalesce = TRUE)$collect()
+#'
+#' lf$join(other_lf, on = "ham", how = "semi")$collect()
+#'
+#' lf$join(other_lf, on = "ham", how = "anti")$collect()
 lazyframe__join <- function(
     other,
     on = NULL,
@@ -851,42 +877,65 @@ lazyframe__join <- function(
     allow_parallel = TRUE,
     force_parallel = FALSE,
     coalesce = NULL) {
-  uw <- \(res) wrap({
-    res
-  })
-
-
-  if (!is_polars_lf(other)) {
-    Err_plain("`other` must be a LazyFrame.") |> uw()
-  }
-
-  if (how == "cross") {
-    if (!is.null(on) || !is.null(left_on) || !is.null(right_on)) {
-      Err_plain("cross join should not pass join keys.") |> uw()
+  wrap({
+    check_dots_empty0(...)
+    check_polars_lf(other)
+    how <- arg_match0(
+      how,
+      values = c("inner", "full", "left", "right", "semi", "anti", "cross")
+    )
+    validate <- arg_match0(validate, values = c("m:m", "1:m", "m:1", "1:1"))
+    uses_on <- !is.null(on)
+    uses_left_on <- !is.null(left_on)
+    uses_right_on <- !is.null(right_on)
+    uses_lr_on <- uses_left_on | uses_right_on
+    if (uses_on && uses_lr_on) {
+      abort("cannot use 'on' in conjunction with 'left_on' or 'right_on'.")
     }
-    rexprs_left <- as.list(NULL)
-    rexprs_right <- as.list(NULL)
-  } else {
-    if (!is.null(on)) {
-      rexprs_right <- rexprs_left <- as.list(on)
-    } else if ((!is.null(left_on) && !is.null(right_on))) {
-      rexprs_left <- as.list(left_on)
-      rexprs_right <- as.list(right_on)
-    } else {
-      Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw()
+    if (uses_left_on && !uses_right_on) {
+      abort("'left_on' requires corresponding 'right_on'")
+    }
+    if (!uses_left_on && uses_right_on) {
+      abort("'right_on' requires corresponding 'left_on'")
+    }
+    if (how == "cross") {
+      if (uses_on | uses_lr_on) {
+        abort("cross join should not pass join keys.")
+      }
+      return(
+        self$`_ldf`$join(
+          other$`_ldf`, as.list(NULL), as.list(NULL),
+          how = how, validate = validate,
+          join_nulls = join_nulls, suffix = suffix,
+          allow_parallel = allow_parallel, force_parallel = force_parallel,
+          coalesce = coalesce
+        )
+      )
     }
-  }
 
-  self$`_ldf`$join(
-    lf, other, rexprs_left, rexprs_right, how, validate, join_nulls, suffix,
-    allow_parallel, force_parallel, coalesce
-  ) |>
-    uw()
+    if (uses_on) {
+      rexprs_right <- rexprs_left <- parse_into_list_of_expressions(!!!on)
+    } else if (uses_lr_on) {
+      rexprs_left <- parse_into_list_of_expressions(!!!left_on)
+      rexprs_right <- parse_into_list_of_expressions(!!!right_on)
+    } else {
+      abort("must specify either `on`, or `left_on` and `right_on`.")
+    }
+    self$`_ldf`$join(
+      other$`_ldf`, rexprs_left, rexprs_right,
+      how = how, validate = validate,
+      join_nulls = join_nulls, suffix = suffix,
+      allow_parallel = allow_parallel, force_parallel = force_parallel,
+      coalesce = coalesce
+    )
+  })
 }
 
 #' Perform a join based on one or multiple (in)equality predicates
 #'
 #' @description
+#' `r lifecycle::badge("experimental")`
+#'
 #' This performs an inner join, so only rows where all predicates are true are
 #' included in the result, and a row from either LazyFrame may be included
 #' multiple times in the result.
@@ -894,11 +943,11 @@ lazyframe__join <- function(
 #' Note that the row order of the input LazyFrames is not preserved.
 #'
 #' @param other LazyFrame to join with.
-#' @param ... (In)Equality condition to join the two tables on. When a column
-#' name occurs in both tables, the proper suffix must be applied in the
-#' predicate. For example, if both tables have a column `"x"` that you want to
-#' use in the conditions, you must refer to the column of the right table as
-#' `"x<suffix>"`.
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> (In)Equality condition to
+#' join the two tables on. When a column name occurs in both tables, the proper
+#' suffix must be applied in the predicate. For example, if both tables have a
+#' column `"x"` that you want to use in the conditions, you must refer to the
+#' column of the right table as `"x<suffix>"`.
 #' @param suffix Suffix to append to columns with a duplicate name.
 #'
 #' @inherit as_polars_lf return
@@ -929,7 +978,8 @@ lazyframe__join_where <- function(
     suffix = "_right") {
   wrap({
     check_polars_lf(other)
-    self$`_ldf`$join_where(other, unpack_list(..., .context = "in $join_where():"), suffix)
+    by <- parse_into_list_of_expressions(...)
+    self$`_ldf`$join_where(other$`_ldf`, by, suffix)
   })
 }
 
@@ -1499,12 +1549,35 @@ lazyframe__rolling <- function(
 
 #' Group based on a date/time or integer column
 #'
-#' @inherit lazyframe__rolling description details params
+#' Time windows are calculated and rows are assigned to windows. Different from
+#' a normal group by is that a row can be member of multiple groups. By
+#' default, the windows look like:
+#' * [start, start + period)
+#' * [start + every, start + every + period)
+#' * [start + 2*every, start + 2*every + period)
+#' * …
+#'
+#' where `start` is determined by `start_by`, `offset`, `every`, and the
+#' earliest datapoint. See the `start_by` argument description for details.
 #'
+#' @inheritParams rlang::check_dots_empty0
+#' @param index_column Column used to group based on the time window. Often of
+#' type Date/Datetime. This column must be sorted in ascending order (or, if
+#' `group_by` is specified, then it must be sorted in ascending order within
+#' each group).
+#' In case of a dynamic group by on indices, the data type needs to be either
+#' Int32 or In64. Note that Int32 gets temporarily cast to Int64, so if
+#' performance matters, use an Int64 column.
 #' @param every Interval of the window.
+#' @param period Length of the window. If `NULL` (default), it will equal
+#' `every`.
+#' @param offset Offset of the window, does not take effect if
+#' `start_by = "datapoint"`. Defaults to zero.
 #' @param include_boundaries Add two columns `"_lower_boundary"` and
 #' `"_upper_boundary"` columns that show the boundaries of the window. This will
 #' impact performance because it’s harder to parallelize.
+#' @param closed Define which sides of the interval are closed (inclusive).
+#' Default is `"left"`.
 #' @param label Define which label to use for the window:
 #' * `"left"`: lower boundary of the window
 #' * `"right"`: upper boundary of the window
@@ -1512,49 +1585,72 @@ lazyframe__rolling <- function(
 #' you don’t need the label to be at one of the boundaries, choose this option
 #' for maximum performance.
 #' @param start_by The strategy to determine the start of the first window by:
-#' * `"window"`: start by taking the earliest timestamp, truncating it with `every`,
-#'   and then adding `offset`. Note that weekly windows start on Monday.
+#' * `"window"`: start by taking the earliest timestamp, truncating it with
+#'   `every`, and then adding `offset`. Note that weekly windows start on
+#'   Monday.
 #' * `"datapoint"`: start from the first encountered data point.
 #' * a day of the week (only takes effect if `every` contains `"w"`): `"monday"`
 #'   starts the window on the Monday before the first data point, etc.
 #'
+#' @details
+#' The `every`, `period`, and `offset` arguments are created with the following
+#' string language:
+#' - 1ns # 1 nanosecond
+#' - 1us # 1 microsecond
+#' - 1ms # 1 millisecond
+#' - 1s  # 1 second
+#' - 1m  # 1 minute
+#' - 1h  # 1 hour
+#' - 1d  # 1 day
+#' - 1w  # 1 calendar week
+#' - 1mo # 1 calendar month
+#' - 1y  # 1 calendar year
+#' These strings can be combined:
+#'   - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds
+#'
+#' In case of a `group_by_dynamic` on an integer column, the windows are
+#' defined by:
+#' - 1i # length 1
+#' - 10i # length 10
+#'
 #' @return A [LazyGroupBy][LazyGroupBy_class] object
 #' @seealso
 #' - [`<LazyFrame>$rolling()`][lazyframe__rolling]
+#'
 #' @examples
-#' lf <- pl$LazyFrame(
+#' lf <- pl$select(
 #'   time = pl$datetime_range(
 #'     start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
 #'     end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
 #'     interval = "30m"
 #'   ),
 #'   n = 0:6
-#' )
+#' )$lazy()
 #' lf$collect()
 #'
-#' # get the sum in the following hour relative to the "time" column
-#' lf$group_by_dynamic("time", every = "1h")$agg(
-#'   vals = pl$col("n"),
-#'   sum = pl$col("n")$sum()
+#' # Group by windows of 1 hour.
+#' lf$group_by_dynamic("time", every = "1h", closed = "right")$agg(
+#'   vals = pl$col("n")
 #' )$collect()
 #'
-#' # using "include_boundaries = TRUE" is helpful to see the period considered
-#' lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg(
-#'   vals = pl$col("n")
+#' # The window boundaries can also be added to the aggregation result
+#' lf$group_by_dynamic(
+#'   "time",
+#'   every = "1h", include_boundaries = TRUE, closed = "right"
+#' )$agg(
+#'   pl$col("n")$mean()
 #' )$collect()
 #'
-#' # in the example above, the values didn't include the one *exactly* 1h after
-#' # the start because "closed = 'left'" by default.
-#' # Changing it to "right" includes values that are exactly 1h after. Note that
-#' # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00],
-#' # even if this interval wasn't there originally
-#' lf$group_by_dynamic("time", every = "1h", closed = "right")$agg(
-#'   vals = pl$col("n")
+#' # When closed = "left", the window excludes the right end of interval:
+#' # [lower_bound, upper_bound)
+#' lf$group_by_dynamic("time", every = "1h", closed = "left")$agg(
+#'   pl$col("n")
 #' )$collect()
-#' # To keep both boundaries, we use "closed = 'both'". Some values now belong to
-#' # several groups:
+#'
+#' # When closed = "both" the time values at the window boundaries belong to 2
+#' # groups.
 #' lf$group_by_dynamic("time", every = "1h", closed = "both")$agg(
-#'   vals = pl$col("n")
+#'   pl$col("n")
 #' )$collect()
 #'
 #' # Dynamic group bys can also be combined with grouping on normal keys
@@ -1596,14 +1692,27 @@ lazyframe__group_by_dynamic <- function(
     label = "left",
     group_by = NULL,
     start_by = "window") {
-  every <- parse_as_polars_duration_string(every)
-  offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(every)
-  period <- parse_as_polars_duration_string(period) %||% every
-
-  self$`_ldf`$group_by_dynamic(
-    lf, index_column, every, period, offset, label, include_boundaries, closed,
-    wrap_elist_result(group_by, str_to_lit = FALSE), start_by
-  )
+  wrap({
+    check_dots_empty0(...)
+    closed <- arg_match0(closed, values = c("both", "left", "right", "none"))
+    start_by <- arg_match0(
+      start_by,
+      values = c(
+        "window", "datapoint", "monday", "tuesday", "wednesday", "thursday",
+        "friday", "saturday", "sunday"
+      )
+    )
+    every <- parse_as_polars_duration_string(every)
+    offset <- parse_as_polars_duration_string(offset) %||% "0ns"
+    period <- parse_as_polars_duration_string(period) %||% every
+    group_by <- parse_into_list_of_expressions(!!!group_by)
+
+    self$`_ldf`$group_by_dynamic(
+      as_polars_expr(index_column)$`_rexpr`, every, period, offset, label,
+      include_boundaries, closed,
+      group_by, start_by
+    )
+  })
 }
 
 #' Plot the query plan
@@ -1862,3 +1971,22 @@ lazyframe__top_k <- function(k, ..., by, reverse = FALSE) {
     self$`_ldf`$top_k(k, by, reverse)
   })
 }
+
+#' Interpolate intermediate values
+#'
+#' The interpolation method is linear.
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = c(1, NA, 9, 10),
+#'   bar = c(6, 7, 9, NA),
+#'   ham = c(1, NA, NA, 9)
+#' )
+#'
+#' lf$interpolate()$collect()
+lazyframe__interpolate <- function() {
+  wrap({
+    self$select(pl$col("*")$interpolate())
+  })
+}
diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml
index e596ceae..07df66bc 100644
--- a/src/rust/Cargo.toml
+++ b/src/rust/Cargo.toml
@@ -42,6 +42,7 @@ features = [
     "extract_jsonpath",
     "find_many",
     "fused",
+    "iejoin",
     "ipc",
     "is_in",
     "json",

From a0610d0a0ad5b3f9ae752336ac2303cdb19748fc Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 14:45:53 +0100
Subject: [PATCH 08/71] more

---
 R/lazyframe-frame.R | 113 +++++++++++++++++++++++++++++---------------
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index e88e4b69..9f18f022 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -502,6 +502,22 @@ lazyframe__head <- function(n = 5) {
     wrap()
 }
 
+#' Get the first `n` rows
+#'
+#' Alias for [`<LazyFrame>$head()`][lazyframe__head].
+#'
+#' @inheritParams lazyframe__head
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+#' lf$limit()$collect()
+#' lf$limit(2)$collect()
+lazyframe__limit <- function(n = 5) {
+  wrap({
+    self$head(n)
+  })
+}
+
 #' Get the last `n` rows
 #'
 #' @inheritParams lazyframe__head
@@ -694,12 +710,12 @@ lazyframe__shift <- function(n = 1, fill_value = NULL) {
   self$`_ldf`$shift(n, fill_value)
 }
 
-#' Reverse
-#' @description Reverse the LazyFrame (the last row becomes the first one, etc.).
+#' Reverse the LazyFrame
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' as_polars_lf(mtcars)$reverse()$collect()
+#' lf <- pl$LazyFrame(key = c("a", "b", "c"), val = 1:3)
+#' lf$reverse()$collect()
 lazyframe__reverse <- function() {
   wrap({
     self$`_ldf`$reverse()
@@ -1136,17 +1152,22 @@ lazyframe__unpivot <- function(
   ) |> unwrap("in $unpivot( ): ")
 }
 
-#' Rename column names of a LazyFrame
+#' Rename column names
+#'
+#' @param mapping Either a function that takes a character vector as input and
+#' returns one as input, or a named list where names are old column names and
+#' values are the new ones.
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> If `mapping` is missing,
+#' those values are used.
+#' @param strict Validate that all column names exist in the current schema,
+#' and throw an error if any do not. (Note that this parameter is a no-op when
+#' passing a function to `mapping`).
 #'
 #' @details
-#' If existing names are swapped (e.g. `A` points to `B` and `B` points to `A`),
-#' polars will block projection and predicate pushdowns at this node.
-#' @inherit pl_LazyFrame return
-#' @param ... One of the following:
-#' - Key value pairs that map from old name to new name, like `old_name = "new_name"`.
-#' - As above but with params wrapped in a list
-#' - An R function that takes the old names character vector as input and
-#'   returns the new names character vector.
+#' If existing names are swapped (e.g. 'A' points to 'B' and 'B' points to
+#' 'A'), polars will block projection and predicate pushdowns at this node.
+#'
+#' @inherit as_polars_lf return
 #' @examples
 #' lf <- pl$LazyFrame(
 #'   foo = 1:3,
@@ -1159,33 +1180,20 @@ lazyframe__unpivot <- function(
 #' lf$rename(
 #'   \(column_name) paste0("c", substr(column_name, 2, 100))
 #' )$collect()
-lazyframe__rename <- function(...) {
-  uw <- \(res) wrap({
-    res
-  })
-
-
-  if (!nargs()) {
-    Err_plain("No arguments provided for `$rename()`.") |>
-      uw()
-  }
-
-  mapping <- list2(...)
-  if (is.function(mapping[[1L]])) {
-    result({
-      existing <- names(self)
-      new <- mapping[[1L]](existing)
-    }) |>
-      uw()
-  } else {
-    if (is.list(mapping[[1L]])) {
-      mapping <- mapping[[1L]]
+lazyframe__rename <- function(mapping, ..., strict = TRUE) {
+  wrap({
+    if (!missing(mapping) && is_function(mapping)) {
+      check_dots_empty0(...)
+      self$select(pl$all()$name$map(mapping))
+    } else {
+      if (missing(mapping) || !is.list(mapping)) {
+        mapping <- list2(...)
+      }
+      existing <- names(mapping)
+      new <- unlist(mapping)
+      self$`_ldf`$rename(existing, new, strict)
     }
-    new <- unname(unlist(mapping))
-    existing <- names(mapping)
-  }
-  self$`_ldf`$rename(existing, new) |>
-    uw()
+  })
 }
 
 #' Fetch `n` rows of a LazyFrame
@@ -1990,3 +1998,32 @@ lazyframe__interpolate <- function() {
     self$select(pl$col("*")$interpolate())
   })
 }
+
+#' Take two sorted DataFrames and merge them by the sorted key
+#'
+#' The output of this operation will also be sorted. It is the callers
+#' responsibility that the frames are sorted by that key, otherwise the output
+#' will not make sense. The schemas of both LazyFrames must be equal.
+#'
+#' @param other Other DataFrame that must be merged.
+#' @param key Key that is sorted.
+#'
+#' @inherit as_polars_lf return
+#'
+#' @examples
+#' lf1 <- pl$LazyFrame(
+#'   name = c("steve", "elise", "bob"),
+#'   age = c(42, 44, 18)
+#' )$sort("age")
+#'
+#' lf2 <- pl$LazyFrame(
+#'   name = c("anna", "megan", "steve", "thomas"),
+#'   age = c(21, 33, 42, 20)
+#' )$sort("age")
+#'
+#' lf1$merge_sorted(lf2, key = "age")$collect()
+lazyframe__merge_sorted <- function(other, key) {
+  wrap({
+    self$`_ldf`$merge_sorted(other$`_ldf`, key)
+  })
+}

From 8b6e5826b18cf3b2bc76043c719788c4aaedeeb6 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 17:18:52 +0100
Subject: [PATCH 09/71] more

---
 R/lazyframe-frame.R | 236 ++++++++++++++++++++++++++++++++------------
 1 file changed, 173 insertions(+), 63 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 9f18f022..1ff3b591 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -45,7 +45,6 @@ wrap.PlRLazyFrame <- function(x, ...) {
   self
 }
 
-# TODO: link to pl__select
 #' Select and modify columns of a LazyFrame
 #'
 #' @description
@@ -58,11 +57,11 @@ wrap.PlRLazyFrame <- function(x, ...) {
 #' be able to use it in another `$select()` or `$with_columns()` call.
 #'
 #' @inherit as_polars_lf return
-#' @param ... <[`dynamic-dots`][rlang::dyn-dots]>
-#' Name-value pairs of objects to be converted to polars [expressions][Expr]
-#' by the [as_polars_expr()] function.
-#' Characters are parsed as column names, other non-expression inputs are parsed as [literals][pl__lit].
-#' Each name will be used as the expression name.
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name-value pairs of objects
+#' to be converted to polars [expressions][Expr] by the [as_polars_expr()]
+#' function. Characters are parsed as column names, other non-expression inputs
+#' are parsed as [literals][pl__lit]. Each name will be used as the expression
+#' name.
 #' @examples
 #' # Pass the name of a column to select that column.
 #' lf <- pl$LazyFrame(
@@ -96,12 +95,34 @@ wrap.PlRLazyFrame <- function(x, ...) {
 lazyframe__select <- function(...) {
   wrap({
     structify <- parse_env_auto_structify()
-
     parse_into_list_of_expressions(..., `__structify` = structify) |>
       self$`_ldf`$select()
   })
 }
 
+#' Select columns from this LazyFrame
+#'
+#' This will run all expression sequentially instead of in parallel. Use this
+#' when the work per expression is cheap.
+#'
+#' @inherit as_polars_lf return
+#' @inheritParams lazyframe__select
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = 6:8,
+#'   ham = letters[1:3]
+#' )
+#' lf$select_seq("foo")$collect()
+lazyframe__select_seq <- function(...) {
+  wrap({
+    structify <- parse_env_auto_structify()
+    parse_into_list_of_expressions(..., `__structify` = structify) |>
+      self$`_ldf`$select_seq()
+  })
+}
+
 #' Start a group by operation
 #'
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column(s) to group by.
@@ -372,6 +393,38 @@ lazyframe__filter <- function(...) {
     wrap()
 }
 
+#' Sort the LazyFrame by the given columns
+#'
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column(s) to sort by. Can be
+#' character values indicating column names or Expr(s).
+#' @param descending Sort in descending order. When sorting by multiple
+#' columns, this can be specified per column by passing a logical vector.
+#' @param nulls_last Place null values last. When sorting by multiple
+#' columns, this can be specified per column by passing a logical vector.
+#' @param maintain_order Whether the order should be maintained if elements are
+#' equal. If `TRUE`, streaming is not possible and performance might be worse
+#' since this requires a stable search.
+#' @param multithreaded Sort using multiple threads.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   a = c(1, 2, NA, 4),
+#'   b = c(6, 5, 4, 3),
+#'   c = c("a", "c", "b", "a")
+#' )
+#'
+#' # Pass a single column name to sort by that column.
+#' lf$sort("a")$collect()
+#'
+#' # Sorting by expressions is also supported
+#' lf$sort(pl$col("a") + pl$col("b") * 2, nulls_last = TRUE)$collect()
+#'
+#' # Sort by multiple columns by passing a vector of columns
+#' lf$sort(c("c", "a"), descending = TRUE)$collect()
+#'
+#' # Or use positional arguments to sort by multiple columns in the same way
+#' lf$sort("c", "a", descending = c(FALSE, TRUE))$collect()
 lazyframe__sort <- function(
     ...,
     descending = FALSE,
@@ -693,21 +746,31 @@ lazyframe__fill_null <- function(fill_value) {
   })
 }
 
-#' Shift a LazyFrame
+#' Shift values by the given number of indices
 #'
-#' @inherit DataFrame_shift description params
+#' @inheritParams rlang::check_dots_empty0
+#' @param n Number of indices to shift forward. If a negative value is passed,
+#' values are shifted in the opposite direction instead.
+#' @param fill_value Fill the resulting null values with this value. Accepts
+#' expression input. Non-expression inputs are parsed as literals.
 #'
 #' @inherit as_polars_lf return
 #' @examples
 #' lf <- pl$LazyFrame(a = 1:4, b = 5:8)
 #'
-#' lf$shift(2)$collect()
+#' # By default, values are shifted forward by one index.
+#' lf$shift()$collect()
 #'
+#' # Pass a negative value to shift in the opposite direction instead.
 #' lf$shift(-2)$collect()
 #'
+#' # Specify fill_value to fill the resulting null values.
 #' lf$shift(-2, fill_value = 100)$collect()
-lazyframe__shift <- function(n = 1, fill_value = NULL) {
-  self$`_ldf`$shift(n, fill_value)
+lazyframe__shift <- function(n = 1, ..., fill_value = NULL) {
+  wrap({
+    check_dots_empty0(...)
+    self$`_ldf`$shift(as_polars_expr(n)$`_rexpr`, as_polars_expr(fill_value)$`_rexpr`)
+  })
 }
 
 #' Reverse the LazyFrame
@@ -722,14 +785,16 @@ lazyframe__reverse <- function() {
   })
 }
 
-#' Slice
-#' @description Get a slice of the LazyFrame.
-#' @inheritParams DataFrame_slice
+#' Get a slice of the LazyFrame.
+#'
+#' @param offset Start index. Negative indexing is supported.
+#' @param length Length of the slice. If `NULL` (default), all rows starting at
+#' the offset will be selected.
+#'
 #' @return A [LazyFrame][lazyframe__class]
 #' @examples
-#' as_polars_lf(mtcars)$slice(2, 4)$collect()
-#' as_polars_lf(mtcars)$slice(30)$collect()
-#' mtcars[2:6, ]
+#' lf <- pl$LazyFrame(x = c("a", "b", "c"), y = 1:3, z = 4:6)
+#' lf$slice(1, 2)$collect()
 lazyframe__slice <- function(offset, length = NULL) {
   wrap({
     self$`_ldf`$slice(offset, length)
@@ -783,30 +848,45 @@ lazyframe__drop_nulls <- function(subset = NULL) {
   })
 }
 
-#' @inherit DataFrame_unique title description params
+#' Drop duplicate rows from this DataFrame
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param subset Column name(s) or selector(s), to consider when identifying
+#' duplicate rows. If `NULL` (default), use all columns.
+#' @param keep Which of the duplicate rows to keep. Must be one of:
+#' * `"any"`: does not give any guarantee of which row is kept. This allows
+#'   more optimizations.
+#' * `"none"`: don’t keep duplicate rows.
+#' * `"first"`: keep first unique row.
+#' * `"last"`: keep last unique row.
+#' @param maintain_order Keep the same order as the original LazyFrame. This is
+#' more expensive to compute. Setting this to `TRUE` blocks the possibility to
+#' run on the streaming engine.
 #'
 #' @inherit as_polars_lf return
 #' @examples
-#' df <- pl$LazyFrame(
-#'   x = sample(10, 100, rep = TRUE),
-#'   y = sample(10, 100, rep = TRUE)
+#' lf <- pl$LazyFrame(
+#'   foo = c(1, 2, 3, 1),
+#'   bar = c("a", "a", "a", "a"),
+#'   ham = c("b", "b", "b", "b"),
 #' )
-#' df$collect()$height
+#' lf$unique(maintain_order = TRUE)$collect()
 #'
-#' df$unique()$collect()$height
-#' df$unique(subset = "x")$collect()$height
+#' lf$unique(subset = c("bar", "ham"), maintain_order = TRUE)$collect()
 #'
-#' df$unique(keep = "last")
-#'
-#' # only keep unique rows
-#' df$unique(keep = "none")
+#' lf$unique(keep = "last", maintain_order = TRUE)$collect()
 lazyframe__unique <- function(
     subset = NULL,
     ...,
-    keep = "any",
+    keep = c("any", "none", "first", "last"),
     maintain_order = FALSE) {
   wrap({
-    self$`_ldf`$unique(subset, keep, maintain_order)
+    check_dots_empty0(...)
+    keep <- arg_match0(keep, values = c("any", "none", "first", "last"))
+    if (!is.null(subset)) {
+      subset <- parse_into_list_of_expressions(!!!subset)
+    }
+    self$`_ldf`$unique(subset = subset, keep = keep, maintain_order = maintain_order)
   })
 }
 
@@ -1438,11 +1518,15 @@ lazyframe__clone <- function() {
 }
 
 
-#' Unnest the Struct columns of a LazyFrame
+#' Decompose struct columns into separate columns for each of their fields
+#'
+#' The new columns will be inserted into the LazyFrame at the location of the
+#' struct column.
 #'
-#' @inheritParams DataFrame_unnest
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name of the struct column(s)
+#' that should be unnested.
 #'
-#' @inherit as_polars_lf return where some or all columns of datatype Struct are unnested.
+#' @inherit as_polars_lf return
 #' @examples
 #' lf <- pl$LazyFrame(
 #'   a = 1:5,
@@ -1455,19 +1539,10 @@ lazyframe__clone <- function() {
 #' )
 #' lf$collect()
 #'
-#' # by default, all struct columns are unnested
-#' lf$unnest()$collect()
-#'
-#' # we can specify specific columns to unnest
 #' lf$unnest("a_and_c")$collect()
 lazyframe__unnest <- function(...) {
-  columns <- unpack_list(..., .context = "in $unnest():")
-  if (length(columns) == 0) {
-    columns <- names(which(dtypes_are_struct(self$`_ldf`$schema(ok))))
-  } else {
-    columns <- unlist(columns)
-  }
   wrap({
+    columns <- parse_into_list_of_expressions(...)
     self$`_ldf`$unnest(columns)
   })
 }
@@ -1507,16 +1582,30 @@ lazyframe__with_context <- function(other) {
 
 #' Create rolling groups based on a date/time or integer column
 #'
-#' @inherit expr__rolling description details params
-#' @param index_column Column used to group based on the time window. Often of
-#' type Date/Datetime. This column must be sorted in ascending order (or, if `by`
-#' is specified, then it must be sorted in ascending order within each group). In
-#' case of a rolling group by on indices, dtype needs to be either Int32 or Int64.
-#' Note that Int32 gets temporarily cast to Int64, so if performance matters use
-#' an Int64 column.
-#' @param group_by Also group by this column/these columns.
+#' @description
+#' Different from `group_by_dynamic`, the windows are now determined by the
+#' individual values and are not of constant intervals. For constant intervals
+#' use [`<LazyFrame>$group_by_dynamic()`][lazyframe__group_by_dynamic].
+#'
+#' If you have a time series `<t_0, t_1, ..., t_n>`, then by default the
+#' windows created will be:
+#' * `(t_0 - period, t_0]`
+#' * `(t_1 - period, t_1]`
+#' * …
+#' * `(t_n - period, t_n]`
 #'
-#' @inheritSection polars_duration_string  Polars duration string language
+#' whereas if you pass a non-default `offset`, then the windows will be:
+#' * `(t_0 + offset, t_0 + offset + period]`
+#' * `(t_1 + offset, t_1 + offset + period]`
+#' * …
+#' * `(t_n + offset, t_n + offset + period]`
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @inheritParams lazyframe__group_by_dynamic
+#' @param period Length of the window - must be non-negative.
+#' @param offset Offset of the window. Default is `-period`.
+#'
+#' @inherit expr__rolling_max params details
 #' @return A [LazyGroupBy][LazyGroupBy_class] object
 #' @seealso
 #' - [`<LazyFrame>$group_by_dynamic()`][lazyframe__group_by_dynamic]
@@ -1531,13 +1620,13 @@ lazyframe__with_context <- function(other) {
 #' )
 #'
 #' df <- pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns(
-#'   pl$col("dt")$str$strptime(pl$Datetime())$set_sorted()
+#'   pl$col("dt")$str$strptime(pl$Datetime())
 #' )
 #'
 #' df$rolling(index_column = "dt", period = "2d")$agg(
-#'   sum_a = pl$sum("a"),
-#'   min_a = pl$min("a"),
-#'   max_a = pl$max("a")
+#'   sum_a = pl$col("a")$sum(),
+#'   min_a = pl$col("a")$min(),
+#'   max_a = pl$col("a")$max()
 #' )$collect()
 lazyframe__rolling <- function(
     index_column,
@@ -1546,12 +1635,16 @@ lazyframe__rolling <- function(
     offset = NULL,
     closed = "right",
     group_by = NULL) {
-  period <- parse_as_polars_duration_string(period)
-  offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(period)
-  self$`_ldf`$rolling(
-    lf, index_column, period, offset, closed,
-    wrap_elist_result(group_by, str_to_lit = FALSE)
-  )
+  wrap({
+    check_dots_empty0(...)
+    closed <- arg_match0(closed, values = c("both", "left", "right", "none"))
+    period <- parse_as_polars_duration_string(period)
+    offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(period)
+    by <- parse_into_list_of_expressions(!!!group_by)
+    self$`_ldf`$rolling(
+      as_polars_expr(index_column)$`_rexpr`, period, offset, closed, by
+    )
+  })
 }
 
 
@@ -2027,3 +2120,20 @@ lazyframe__merge_sorted <- function(other, key) {
     self$`_ldf`$merge_sorted(other$`_ldf`, key)
   })
 }
+
+#' Indicate that one or multiple columns are sorted
+#'
+#' This can speed up future operations, but it can lead to incorrect results if
+#' the data is **not** sorted! Use with care!
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param column Columns that are sorted.
+#' @param descending Whether the columns are sorted in descending order.
+#'
+#' @inherit as_polars_lf return
+lazyframe__set_sorted <- function(column, ..., descending = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+    self$with_columns(pl$col(column)$set_sorted(descending = descending))
+  })
+}

From 215f2a237ccd45fb60f8e7edeef80475e70d4888 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 5 Nov 2024 18:36:21 +0100
Subject: [PATCH 10/71] collect_schema(), redoc

---
 R/000-wrappers.R                   |   7 +
 R/lazyframe-frame.R                | 205 ++++++++++++++++++++++++-----
 man/as_polars_df.Rd                |  33 +++--
 man/dataframe__cast.Rd             |  12 +-
 man/dataframe__select.Rd           |  10 +-
 man/dataframe__with_columns.Rd     |  10 +-
 man/lazyframe__bottom_k.Rd         |  39 ++++++
 man/lazyframe__cast.Rd             |  37 ++++++
 man/lazyframe__clear.Rd            |  29 ++++
 man/lazyframe__clone.Rd            |  42 ++++++
 man/lazyframe__collect.Rd          |  54 +++++---
 man/lazyframe__collect_schema.Rd   |  28 ++++
 man/lazyframe__count.Rd            |  18 +++
 man/lazyframe__drop.Rd             |  34 +++++
 man/lazyframe__drop_nulls.Rd       |  34 +++++
 man/lazyframe__explain.Rd          |  85 ++++++++++++
 man/lazyframe__explode.Rd          |  27 ++++
 man/lazyframe__fetch.Rd            |  89 +++++++++++++
 man/lazyframe__filter.Rd           |  43 ++++++
 man/lazyframe__first.Rd            |  18 +++
 man/lazyframe__gather_every.Rd     |  25 ++++
 man/lazyframe__group_by.Rd         |  45 +++++++
 man/lazyframe__group_by_dynamic.Rd | 178 +++++++++++++++++++++++++
 man/lazyframe__head.Rd             |  22 ++++
 man/lazyframe__interpolate.Rd      |  23 ++++
 man/lazyframe__join.Rd             | 108 +++++++++++++++
 man/lazyframe__join_asof.Rd        | 134 +++++++++++++++++++
 man/lazyframe__join_where.Rd       |  52 ++++++++
 man/lazyframe__last.Rd             |  18 +++
 man/lazyframe__limit.Rd            |  22 ++++
 man/lazyframe__max.Rd              |  18 +++
 man/lazyframe__mean.Rd             |  18 +++
 man/lazyframe__median.Rd           |  18 +++
 man/lazyframe__merge_sorted.Rd     |  34 +++++
 man/lazyframe__min.Rd              |  18 +++
 man/lazyframe__null_count.Rd       |  18 +++
 man/lazyframe__profile.Rd          | 107 +++++++++++++++
 man/lazyframe__quantile.Rd         |  21 +++
 man/lazyframe__rename.Rd           |  43 ++++++
 man/lazyframe__reverse.Rd          |  18 +++
 man/lazyframe__rolling.Rd          |  83 ++++++++++++
 man/lazyframe__select.Rd           |  10 +-
 man/lazyframe__select_seq.Rd       |  30 +++++
 man/lazyframe__set_sorted.Rd       |  22 ++++
 man/lazyframe__shift.Rd            |  35 +++++
 man/lazyframe__slice.Rd            |  24 ++++
 man/lazyframe__sort.Rd             |  55 ++++++++
 man/lazyframe__sql.Rd              |  64 +++++++++
 man/lazyframe__std.Rd              |  19 +++
 man/lazyframe__sum.Rd              |  18 +++
 man/lazyframe__tail.Rd             |  34 +++++
 man/lazyframe__to_dot.Rd           |  78 +++++++++++
 man/lazyframe__top_k.Rd            |  41 ++++++
 man/lazyframe__unique.Rd           |  50 +++++++
 man/lazyframe__unnest.Rd           |  33 +++++
 man/lazyframe__unpivot.Rd          |  45 +++++++
 man/lazyframe__var.Rd              |  19 +++
 man/lazyframe__with_columns.Rd     |  10 +-
 man/lazyframe__with_columns_seq.Rd |  66 ++++++++++
 man/lazyframe__with_context.Rd     |  39 ++++++
 man/lazyframe__with_row_index.Rd   |  32 +++++
 man/pl__struct.Rd                  |  10 +-
 src/init.c                         |   6 +
 src/rust/api.h                     |   1 +
 src/rust/src/lazyframe/general.rs  |  63 +++++++--
 65 files changed, 2574 insertions(+), 107 deletions(-)
 create mode 100644 man/lazyframe__bottom_k.Rd
 create mode 100644 man/lazyframe__cast.Rd
 create mode 100644 man/lazyframe__clear.Rd
 create mode 100644 man/lazyframe__clone.Rd
 create mode 100644 man/lazyframe__collect_schema.Rd
 create mode 100644 man/lazyframe__count.Rd
 create mode 100644 man/lazyframe__drop.Rd
 create mode 100644 man/lazyframe__drop_nulls.Rd
 create mode 100644 man/lazyframe__explain.Rd
 create mode 100644 man/lazyframe__explode.Rd
 create mode 100644 man/lazyframe__fetch.Rd
 create mode 100644 man/lazyframe__filter.Rd
 create mode 100644 man/lazyframe__first.Rd
 create mode 100644 man/lazyframe__gather_every.Rd
 create mode 100644 man/lazyframe__group_by.Rd
 create mode 100644 man/lazyframe__group_by_dynamic.Rd
 create mode 100644 man/lazyframe__head.Rd
 create mode 100644 man/lazyframe__interpolate.Rd
 create mode 100644 man/lazyframe__join.Rd
 create mode 100644 man/lazyframe__join_asof.Rd
 create mode 100644 man/lazyframe__join_where.Rd
 create mode 100644 man/lazyframe__last.Rd
 create mode 100644 man/lazyframe__limit.Rd
 create mode 100644 man/lazyframe__max.Rd
 create mode 100644 man/lazyframe__mean.Rd
 create mode 100644 man/lazyframe__median.Rd
 create mode 100644 man/lazyframe__merge_sorted.Rd
 create mode 100644 man/lazyframe__min.Rd
 create mode 100644 man/lazyframe__null_count.Rd
 create mode 100644 man/lazyframe__profile.Rd
 create mode 100644 man/lazyframe__quantile.Rd
 create mode 100644 man/lazyframe__rename.Rd
 create mode 100644 man/lazyframe__reverse.Rd
 create mode 100644 man/lazyframe__rolling.Rd
 create mode 100644 man/lazyframe__select_seq.Rd
 create mode 100644 man/lazyframe__set_sorted.Rd
 create mode 100644 man/lazyframe__shift.Rd
 create mode 100644 man/lazyframe__slice.Rd
 create mode 100644 man/lazyframe__sort.Rd
 create mode 100644 man/lazyframe__sql.Rd
 create mode 100644 man/lazyframe__std.Rd
 create mode 100644 man/lazyframe__sum.Rd
 create mode 100644 man/lazyframe__tail.Rd
 create mode 100644 man/lazyframe__to_dot.Rd
 create mode 100644 man/lazyframe__top_k.Rd
 create mode 100644 man/lazyframe__unique.Rd
 create mode 100644 man/lazyframe__unnest.Rd
 create mode 100644 man/lazyframe__unpivot.Rd
 create mode 100644 man/lazyframe__var.Rd
 create mode 100644 man/lazyframe__with_columns_seq.Rd
 create mode 100644 man/lazyframe__with_context.Rd
 create mode 100644 man/lazyframe__with_row_index.Rd

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 32e00034..281d9633 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2494,6 +2494,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_collect_schema` <- function(self) {
+  function() {
+    .Call(savvy_PlRLazyFrame_collect_schema__impl, `self`)
+  }
+}
+
 `PlRLazyFrame_unnest` <- function(self) {
   function(`columns`) {
     .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_unnest__impl, `self`, `columns`))
@@ -2563,6 +2569,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`unpivot` <- `PlRLazyFrame_unpivot`(ptr)
   e$`with_row_index` <- `PlRLazyFrame_with_row_index`(ptr)
   e$`clone` <- `PlRLazyFrame_clone`(ptr)
+  e$`collect_schema` <- `PlRLazyFrame_collect_schema`(ptr)
   e$`unnest` <- `PlRLazyFrame_unnest`(ptr)
   e$`count` <- `PlRLazyFrame_count`(ptr)
   e$`merge_sorted` <- `PlRLazyFrame_merge_sorted`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 1ff3b591..2f95cfba 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -162,28 +162,45 @@ lazyframe__group_by <- function(..., .maintain_order = FALSE) {
   })
 }
 
-# TODO: see also section
 #' Materialize this LazyFrame into a DataFrame
 #'
 #' By default, all query optimizations are enabled.
-#' Individual optimizations may be disabled by setting the corresponding parameter to `FALSE`.
-#' @inherit pl__DataFrame return
-#' @inheritParams rlang::args_dots_empty
-#' @param type_coercion A logical, indicats type coercion optimization.
-#' @param predicate_pushdown A logical, indicats predicate pushdown optimization.
-#' @param projection_pushdown A logical, indicats projection pushdown optimization.
-#' @param simplify_expression A logical, indicats simplify expression optimization.
-#' @param slice_pushdown A logical, indicats slice pushdown optimization.
-#' @param comm_subplan_elim A logical, indicats tring to cache branching subplans that occur on self-joins or unions.
-#' @param comm_subexpr_elim A logical, indicats tring to cache common subexpressions.
-#' @param cluster_with_columns A logical, indicats to combine sequential independent calls to with_columns.
-#' @param no_optimization A logical. If `TRUE`, turn off (certain) optimizations.
-#' @param streaming A logical. If `TRUE`, process the query in batches to handle larger-than-memory data.
-#' If `FALSE` (default), the entire query is processed in a single batch.
-#' Note that streaming mode is considered unstable.
-#' It may be changed at any point without it being considered a breaking change.
-#' @param _eager A logical, indicates to turn off multi-node optimizations and the other optimizations.
-#' This option is intended for internal use only.
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param type_coercion Logical. Coerce types such that operations succeed and
+#' run on minimal required memory.
+#' @param predicate_pushdown Logical. Applies filters as early as possible at
+#' scan level.
+#' @param projection_pushdown Logical. Select only the columns that are needed
+#' at the scan level.
+#' @param simplify_expression Logical. Various optimizations, such as constant
+#' folding and replacing expensive operations with faster alternatives.
+#' @param slice_pushdown Logical. Only load the required slice from the scan
+#' level. Don't materialize sliced outputs (e.g. `join$head(10)`).
+#' @param comm_subplan_elim Logical. Will try to cache branching subplans that
+#'  occur on self-joins or unions.
+#' @param comm_subexpr_elim Logical. Common subexpressions will be cached and
+#' reused.
+#' @param cluster_with_columns Combine sequential independent calls to
+#' [`with_columns()`][lazyframe__with_columns].
+#' @param streaming `r lifecycle::badge("experimental")` Logical. Process the
+#' query in batches to handle larger-than-memory data. If `FALSE` (default),
+#' the entire query is processed in a single batch.
+#' @param _eager A logical, indicates to turn off multi-node optimizations and
+#' the other optimizations. This option is intended for internal use only.
+#'
+#' @inherit as_polars_lf return
+#'
+#' @seealso
+#'  - [`$fetch()`][lazyframe__fetch] - fast limited query check
+#'  - [`$profile()`][lazyframe__profile] - same as `$collect()` but also returns
+#'    a table with each operation profiled.
+#'  - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
+#'    collect returns a future handle. Can also just be used via
+#'    `$collect(collect_in_background = TRUE)`.
+#'  - [`$sink_parquet()`][lazyframe__sink_parquet()] streams query to a parquet file.
+#'  - [`$sink_ipc()`][lazyframe__sink_ipc()] streams query to a arrow file.
+#'
 #' @examples
 #' lf <- pl$LazyFrame(
 #'   a = c("a", "b", "a", "b", "b", "c"),
@@ -238,6 +255,35 @@ lazyframe__collect <- function(
   })
 }
 
+#' Resolve the schema of this LazyFrame
+#'
+#' This resolves the query plan but does not trigger computations.
+#'
+#' @return A named list with names indicating column names and values indicating
+#' column data types.
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = 6:8,
+#'   ham = c("a", "b", "c")
+#' )
+#'
+#' lf$collect_schema()
+#'
+#' lf$with_columns(
+#'   baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String),
+#'   pl$col("bar")$cast(pl$Int64)
+#' )$collect_schema()
+lazyframe__collect_schema <- function() {
+  wrap({
+    lapply(self$`_ldf`$collect_schema(), function(x) {
+      .savvy_wrap_PlRDataType(x) |>
+        wrap()
+    })
+  })
+}
+
 #' Create a string representation of the query plan
 #'
 #' The query plan is read from bottom to top. When `optimized = FALSE`, the
@@ -505,6 +551,66 @@ lazyframe__with_columns <- function(...) {
   })
 }
 
+#' Modify/append column(s) of a LazyFrame
+#'
+#' @description
+#' This will run all expression sequentially instead of in parallel. Use this
+#' only when the work per expression is cheap.
+#'
+#' Add columns or modify existing ones with expressions. This is similar to
+#' `dplyr::mutate()` as it keeps unmentioned columns (unlike `$select()`).
+#'
+#' However, unlike `dplyr::mutate()`, one cannot use new variables in subsequent
+#' expressions in the same `$with_columns_seq()`call. For instance, if you create a
+#' variable `x`, you will only be able to use it in another `$with_columns_seq()`
+#' or `$select()` call.
+#'
+#' @inherit as_polars_lf return
+#' @inheritParams lazyframe__select
+#' @examples
+#' # Pass an expression to add it as a new column.
+#' lf <- pl$LazyFrame(
+#'   a = 1:4,
+#'   b = c(0.5, 4, 10, 13),
+#'   c = c(TRUE, TRUE, FALSE, TRUE),
+#' )
+#' lf$with_columns_seq((pl$col("a")^2)$alias("a^2"))$collect()
+#'
+#' # Added columns will replace existing columns with the same name.
+#' lf$with_columns_seq(a = pl$col("a")$cast(pl$Float64))$collect()
+#'
+#' # Multiple columns can be added
+#' lf$with_columns_seq(
+#'   (pl$col("a")^2)$alias("a^2"),
+#'   (pl$col("b") / 2)$alias("b/2"),
+#'   (pl$col("c")$not())$alias("not c"),
+#' )$collect()
+#'
+#' # Name expression instead of `$alias()`
+#' lf$with_columns_seq(
+#'   `a^2` = pl$col("a")^2,
+#'   `b/2` = pl$col("b") / 2,
+#'   `not c` = pl$col("c")$not(),
+#' )$collect()
+#'
+#' # Expressions with multiple outputs can automatically be instantiated
+#' # as Structs by enabling the experimental setting `POLARS_AUTO_STRUCTIFY`:
+#' if (requireNamespace("withr", quietly = TRUE)) {
+#'   withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
+#'     lf$drop("c")$with_columns_seq(
+#'       diffs = pl$col("a", "b")$diff()$name$suffix("_diff"),
+#'     )$collect()
+#'   })
+#' }
+lazyframe__with_columns_seq <- function(...) {
+  wrap({
+    structify <- parse_env_auto_structify()
+
+    parse_into_list_of_expressions(..., `__structify` = structify) |>
+      self$`_ldf`$with_columns_seq()
+  })
+}
+
 #' Remove columns from the DataFrame
 #'
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that
@@ -1190,27 +1296,22 @@ lazyframe__join_asof <- function(
 }
 
 
-#' Unpivot a Frame from wide to long format
+#' Unpivot a LazyFrame from wide to long format
 #'
+#' This function is useful to massage a LazyFrame into a format where one or
+#' more columns are identifier variables (`index`) while all other columns,
+#' considered measured variables (`on`), are “unpivoted” to the row axis
+#' leaving just two non-identifier columns, "variable" and "value".
+#'
+#' @inheritParams rlang::check_dots_empty0
 #' @param on Values to use as identifier variables. If `value_vars` is
 #' empty all columns that are not in `id_vars` will be used.
-#' @param ... Not used.
 #' @param index Columns to use as identifier variables.
 #' @param variable_name Name to give to the new column containing the names of
 #' the melted columns. Defaults to "variable".
 #' @param value_name Name to give to the new column containing the values of
 #' the melted columns. Defaults to `"value"`.
 #'
-#' @details
-#' Optionally leaves identifiers set.
-#'
-#' This function is useful to massage a Frame into a format where one or more
-#' columns are identifier variables (id_vars), while all other columns, considered
-#' measured variables (value_vars), are "unpivoted" to the row axis, leaving just
-#' two non-identifier columns, 'variable' and 'value'.
-#'
-#'
-#'
 #' @inherit as_polars_lf return
 #'
 #' @examples
@@ -1226,10 +1327,16 @@ lazyframe__unpivot <- function(
     index = NULL,
     variable_name = NULL,
     value_name = NULL) {
-  self$`_ldf`$unpivot(
-    lf, on %||% character(), index %||% character(),
-    value_name, variable_name
-  ) |> unwrap("in $unpivot( ): ")
+  wrap({
+    check_dots_empty0(...)
+    if (!is.null(on)) {
+      on <- parse_into_list_of_expressions(!!!on)
+    }
+    if (!is.null(index)) {
+      index <- parse_into_list_of_expressions(!!!index)
+    }
+    self$`_ldf`$unpivot(on, index, value_name, variable_name)
+  })
 }
 
 #' Rename column names
@@ -2137,3 +2244,31 @@ lazyframe__set_sorted <- function(column, ..., descending = FALSE) {
     self$with_columns(pl$col(column)$set_sorted(descending = descending))
   })
 }
+
+#' Add a row index as the first column in the LazyFrame
+#'
+#' @description
+#' Using this function can have a negative effect on query performance. This
+#' may, for instance, block predicate pushdown optimization.
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param name Name of the index column.
+#' @param offset Start the index at this offset. Cannot be negative.
+#'
+#' @inherit as_polars_lf return
+#' @examples
+#' lf <- pl$LazyFrame(x = c(1, 3, 5), y = c(2, 4, 6))
+#' lf$with_row_index()$collect()
+#'
+#' lf$with_row_index("id", offset = 1000)$collect()
+#'
+#' # An index column can also be created using the expressions int_range()
+#' # and len()$
+#' lf$with_columns(
+#'   index = pl$int_range(pl$len(), dtype = pl$UInt32)
+#' )$collect()
+lazyframe__with_row_index <- function(name = "index", offset = 0) {
+  wrap({
+    self$`_ldf`$with_row_index(name, offset)
+  })
+}
diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd
index e3986253..d94243f5 100644
--- a/man/as_polars_df.Rd
+++ b/man/as_polars_df.Rd
@@ -56,28 +56,33 @@ If \code{NULL}, the column name is taken from the \link{Series} name.}
 the \code{\link[=series_struct_unnest]{<Series>$struct$unnest()}} method is used to create a \link{DataFrame}
 from the struct \link{Series}. In this case, the \code{column_name} argument is ignored.}
 
-\item{type_coercion}{A logical, indicats type coercion optimization.}
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
 
-\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
 
-\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
 
-\item{simplify_expression}{A logical, indicats simplify expression optimization.}
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
 
-\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
 
-\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
+occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
+\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
+reused.}
 
-\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{Combine sequential independent calls to
+\code{\link[=lazyframe__with_columns]{with_columns()}}.}
 
-\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
-
-\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
-If \code{FALSE} (default), the entire query is processed in a single batch.
-Note that streaming mode is considered unstable.
-It may be changed at any point without it being considered a breaking change.}
+\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
+query in batches to handle larger-than-memory data. If \code{FALSE} (default),
+the entire query is processed in a single batch.}
 }
 \value{
 A polars \link{DataFrame}
diff --git a/man/dataframe__cast.Rd b/man/dataframe__cast.Rd
index 10011b1c..f168d5a9 100644
--- a/man/dataframe__cast.Rd
+++ b/man/dataframe__cast.Rd
@@ -6,11 +6,21 @@
 \usage{
 dataframe__cast(..., .strict = TRUE)
 }
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Either a datatype to which
+all columns will be cast, or a list where the names are column names and the
+values are the datatypes to convert to.}
+
+\item{.strict}{If \code{TRUE} (default), throw an error if a cast could not be done
+(for instance, due to an overflow). Otherwise, return \code{null}.}
+}
 \value{
 A polars \link{DataFrame}
 }
 \description{
-Cast DataFrame column(s) to the specified dtype
+This allows to convert all columns to a datatype or to convert only specific
+columns. Contrarily to the Python implementation, it is not possible to
+convert all columns of a specific datatype to another datatype.
 }
 \examples{
 df <- pl$DataFrame(
diff --git a/man/dataframe__select.Rd b/man/dataframe__select.Rd
index 9d570adc..0fbbc140 100644
--- a/man/dataframe__select.Rd
+++ b/man/dataframe__select.Rd
@@ -7,11 +7,11 @@
 dataframe__select(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
-Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
-by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
-Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
-Each name will be used as the expression name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
 }
 \value{
 A polars \link{DataFrame}
diff --git a/man/dataframe__with_columns.Rd b/man/dataframe__with_columns.Rd
index 677dc3c0..53e6ae05 100644
--- a/man/dataframe__with_columns.Rd
+++ b/man/dataframe__with_columns.Rd
@@ -7,11 +7,11 @@
 dataframe__with_columns(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
-Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
-by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
-Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
-Each name will be used as the expression name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
 }
 \value{
 A polars \link{DataFrame}
diff --git a/man/lazyframe__bottom_k.Rd b/man/lazyframe__bottom_k.Rd
new file mode 100644
index 00000000..903ffd46
--- /dev/null
+++ b/man/lazyframe__bottom_k.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__bottom_k}
+\alias{lazyframe__bottom_k}
+\title{Return the \code{k} smallest rows}
+\usage{
+lazyframe__bottom_k(k, ..., by, reverse = FALSE)
+}
+\arguments{
+\item{k}{Number of rows to return.}
+
+\item{by}{Column(s) used to determine the bottom rows. Accepts expression
+input. Strings are parsed as column names.}
+
+\item{reverse}{Consider the \code{k} largest elements of the by column(s)
+(instead of the k smallest). This can be specified per column by passing a
+sequence of booleans.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Non-null elements are always preferred over null elements, regardless of the
+value of \code{reverse}. The output is not guaranteed to be in any particular
+order, call \code{sort()} after this function if you wish the output to be sorted.
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = c("a", "b", "a", "b", "b", "c"),
+  b = c(2, 1, 1, 3, 2, 1)
+)
+
+# Get the rows which contain the 4 smallest values in column b.
+lf$bottom_k(4, by = "b")$collect()
+
+# Get the rows which contain the 4 smallest values when sorting on column a
+# and b$
+lf$bottom_k(4, by = c("a", "b"))$collect()
+}
diff --git a/man/lazyframe__cast.Rd b/man/lazyframe__cast.Rd
new file mode 100644
index 00000000..56958008
--- /dev/null
+++ b/man/lazyframe__cast.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__cast}
+\alias{lazyframe__cast}
+\title{Cast LazyFrame column(s) to the specified dtype(s)}
+\usage{
+lazyframe__cast(..., .strict = TRUE)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Either a datatype to which
+all columns will be cast, or a list where the names are column names and the
+values are the datatypes to convert to.}
+
+\item{strict}{If \code{TRUE} (default), throw an error if a cast could not be done
+(for instance, due to an overflow). Otherwise, return \code{null}.}
+}
+\value{
+A LazyFrame
+}
+\description{
+This allows to convert all columns to a datatype or to convert only specific
+columns. Contrarily to the Python implementation, it is not possible to
+convert all columns of a specific datatype to another datatype.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = c(6, 7, 8),
+  ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06"))
+)
+
+# Cast only some columns
+lf$cast(foo = pl$Float32, bar = pl$UInt8)$collect()
+
+# Cast all columns to the same type
+lf$cast(pl$String)$collect()
+}
diff --git a/man/lazyframe__clear.Rd b/man/lazyframe__clear.Rd
new file mode 100644
index 00000000..91e4d73c
--- /dev/null
+++ b/man/lazyframe__clear.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__clear}
+\alias{lazyframe__clear}
+\title{Create an empty or n-row null-filled copy of the LazyFrame}
+\usage{
+lazyframe__clear(n = 0)
+}
+\arguments{
+\item{n}{Number of (empty) rows to return in the cleared frame.}
+}
+\value{
+A n-row null-filled LazyFrame with an identical schema
+}
+\description{
+Returns a n-row null-filled LazyFrame with an identical schema. \code{n} can be
+greater than the current number of rows in the LazyFrame.
+}
+\examples{
+df <- pl$LazyFrame(
+  a = c(NA, 2, 3, 4),
+  b = c(0.5, NA, 2.5, 13),
+  c = c(TRUE, TRUE, FALSE, NA)
+)
+
+df$clear()
+
+df$clear(n = 5)
+}
diff --git a/man/lazyframe__clone.Rd b/man/lazyframe__clone.Rd
new file mode 100644
index 00000000..4e51d5f7
--- /dev/null
+++ b/man/lazyframe__clone.Rd
@@ -0,0 +1,42 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__clone}
+\alias{lazyframe__clone}
+\title{Clone a LazyFrame}
+\usage{
+lazyframe__clone()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This makes a very cheap deep copy/clone of an existing
+\code{\link[=lazyframe__class]{LazyFrame}}. Rarely useful as \code{LazyFrame}s are nearly 100\%
+immutable. Any modification of a \code{LazyFrame} should lead to a clone anyways,
+but this can be useful when dealing with attributes (see examples).
+}
+\examples{
+df1 <- as_polars_lf(iris)
+
+# Make a function to take a LazyFrame, add an attribute, and return a LazyFrame
+give_attr <- function(data) {
+  attr(data, "created_on") <- "2024-01-29"
+  data
+}
+df2 <- give_attr(df1)
+
+# Problem: the original LazyFrame also gets the attribute while it shouldn't!
+attributes(df1)
+
+# Use $clone() inside the function to avoid that
+give_attr <- function(data) {
+  data <- data$clone()
+  attr(data, "created_on") <- "2024-01-29"
+  data
+}
+df1 <- as_polars_lf(iris)
+df2 <- give_attr(df1)
+
+# now, the original LazyFrame doesn't get this attribute
+attributes(df1)
+}
diff --git a/man/lazyframe__collect.Rd b/man/lazyframe__collect.Rd
index 3ccae6b3..21b4a51b 100644
--- a/man/lazyframe__collect.Rd
+++ b/man/lazyframe__collect.Rd
@@ -20,40 +20,44 @@ lazyframe__collect(
 )
 }
 \arguments{
-\item{...}{These dots are for future extensions and must be empty.}
+\item{...}{Dots which should be empty.}
 
-\item{type_coercion}{A logical, indicats type coercion optimization.}
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
 
-\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
 
-\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
 
-\item{simplify_expression}{A logical, indicats simplify expression optimization.}
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
 
-\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
 
-\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
+occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
+\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
+reused.}
 
-\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{Combine sequential independent calls to
+\code{\link[=lazyframe__with_columns]{with_columns()}}.}
 
-\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
+\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
+query in batches to handle larger-than-memory data. If \code{FALSE} (default),
+the entire query is processed in a single batch.}
 
-\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
-If \code{FALSE} (default), the entire query is processed in a single batch.
-Note that streaming mode is considered unstable.
-It may be changed at any point without it being considered a breaking change.}
-
-\item{_eager}{A logical, indicates to turn off multi-node optimizations and the other optimizations.
-This option is intended for internal use only.}
+\item{_eager}{A logical, indicates to turn off multi-node optimizations and
+the other optimizations. This option is intended for internal use only.}
 }
 \value{
-A polars \link{DataFrame}
+A polars \link{LazyFrame}
 }
 \description{
 By default, all query optimizations are enabled.
-Individual optimizations may be disabled by setting the corresponding parameter to \code{FALSE}.
 }
 \examples{
 lf <- pl$LazyFrame(
@@ -68,3 +72,15 @@ lf$group_by("a")$agg(pl$all()$sum())$collect(
   streaming = TRUE
 )
 }
+\seealso{
+\itemize{
+\item \code{\link[=lazyframe__fetch]{$fetch()}} - fast limited query check
+\item \code{\link[=lazyframe__profile]{$profile()}} - same as \verb{$collect()} but also returns
+a table with each operation profiled.
+\item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking
+collect returns a future handle. Can also just be used via
+\verb{$collect(collect_in_background = TRUE)}.
+\item \code{\link[=lazyframe__sink_parquet]{$sink_parquet()}} streams query to a parquet file.
+\item \code{\link[=lazyframe__sink_ipc]{$sink_ipc()}} streams query to a arrow file.
+}
+}
diff --git a/man/lazyframe__collect_schema.Rd b/man/lazyframe__collect_schema.Rd
new file mode 100644
index 00000000..4d46c94c
--- /dev/null
+++ b/man/lazyframe__collect_schema.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__collect_schema}
+\alias{lazyframe__collect_schema}
+\title{Resolve the schema of this LazyFrame}
+\usage{
+lazyframe__collect_schema()
+}
+\value{
+A named list with names indicating column names and values indicating
+column data types.
+}
+\description{
+This resolves the query plan but does not trigger computations.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = 6:8,
+  ham = c("a", "b", "c")
+)
+
+lf$collect_schema()
+
+lf$with_columns(
+  baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String)
+)$collect_schema()
+}
diff --git a/man/lazyframe__count.Rd b/man/lazyframe__count.Rd
new file mode 100644
index 00000000..a8d52e21
--- /dev/null
+++ b/man/lazyframe__count.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__count}
+\alias{lazyframe__count}
+\title{Return the number of non-null elements for each column}
+\usage{
+lazyframe__count()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Return the number of non-null elements for each column
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4))
+lf$count()$collect()
+}
diff --git a/man/lazyframe__drop.Rd b/man/lazyframe__drop.Rd
new file mode 100644
index 00000000..95552531
--- /dev/null
+++ b/man/lazyframe__drop.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__drop}
+\alias{lazyframe__drop}
+\title{Remove columns from the DataFrame}
+\usage{
+lazyframe__drop(..., strict = TRUE)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Names of the columns that
+should be removed from the dataframe. Accepts column selector input.}
+
+\item{strict}{Validate that all column names exist in the current schema,
+and throw an exception if any do not.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Remove columns from the DataFrame
+}
+\examples{
+# Drop columns by passing the name of those columns
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = c(6, 7, 8),
+  ham = c("a", "b", "c")
+)
+lf$drop("ham")$collect()
+lf$drop("ham", "bar")$collect()
+
+# Drop multiple columns by passing a selector
+lf$drop(cs$all())$collect()
+}
diff --git a/man/lazyframe__drop_nulls.Rd b/man/lazyframe__drop_nulls.Rd
new file mode 100644
index 00000000..dd3fcc78
--- /dev/null
+++ b/man/lazyframe__drop_nulls.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__drop_nulls}
+\alias{lazyframe__drop_nulls}
+\title{Drop all rows that contain null values}
+\usage{
+lazyframe__drop_nulls(subset = NULL)
+}
+\arguments{
+\item{subset}{Column name(s) for which null values are considered. If \code{NULL}
+(default), use all columns.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+The original order of the remaining rows is preserved.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = c(6, NA, 8),
+  ham = c("a", "b", NA)
+)
+
+# The default behavior of this method is to drop rows where any single value
+# of the row is null.
+lf$drop_nulls()$collect()
+
+# This behaviour can be constrained to consider only a subset of columns, as
+# defined by name or with a selector. For example, dropping rows if there is
+# a null in any of the integer columns:
+lf$drop_nulls(subset = cs$integer())$collect()
+}
diff --git a/man/lazyframe__explain.Rd b/man/lazyframe__explain.Rd
new file mode 100644
index 00000000..f902d783
--- /dev/null
+++ b/man/lazyframe__explain.Rd
@@ -0,0 +1,85 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__explain}
+\alias{lazyframe__explain}
+\title{Create a string representation of the query plan}
+\usage{
+lazyframe__explain(
+  ...,
+  format = c("plain", "tree"),
+  optimized = TRUE,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  comm_subplan_elim = TRUE,
+  comm_subexpr_elim = TRUE,
+  cluster_with_columns = TRUE,
+  streaming = FALSE
+)
+}
+\arguments{
+\item{...}{Dots which should be empty.}
+
+\item{format}{The format to use for displaying the logical plan. Must be
+either \code{"plain"} (default) or \code{"tree"}.}
+
+\item{optimized}{Return an optimized query plan. If \code{TRUE} (default), the
+subsequent optimization flags control which optimizations run.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
+occur on self-joins or unions.}
+
+\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
+reused.}
+
+\item{cluster_with_columns}{Combine sequential independent calls to
+\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+
+\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
+query in batches to handle larger-than-memory data. If \code{FALSE} (default),
+the entire query is processed in a single batch.}
+}
+\value{
+A character value containing the query plan.
+}
+\description{
+The query plan is read from bottom to top. When \code{optimized = FALSE}, the
+query as it was written by the user is shown. This is not what Polars runs.
+Instead, it applies optimizations that are displayed by default by \verb{$explain()}.
+One classic example is the predicate pushdown, which applies the filter as
+early as possible (i.e. at the bottom of the plan).
+}
+\examples{
+lazy_frame <- as_polars_lf(iris)
+
+# Prepare your query
+lazy_query <- lazy_frame$sort("Species")$filter(pl$col("Species") != "setosa")
+
+# This is the query that was written by the user, without any optimizations
+# (use cat() for better printing)
+lazy_query$explain(optimized = FALSE) |> cat()
+
+# This is the query after `polars` optimizes it: instead of sorting first and
+# then filtering, it is faster to filter first and then sort the rest.
+lazy_query$explain() |> cat()
+
+# Also possible to see this as tree format
+lazy_query$explain(format = "tree") |> cat()
+}
diff --git a/man/lazyframe__explode.Rd b/man/lazyframe__explode.Rd
new file mode 100644
index 00000000..e438e172
--- /dev/null
+++ b/man/lazyframe__explode.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__explode}
+\alias{lazyframe__explode}
+\title{Explode the DataFrame to long format by exploding the given columns}
+\usage{
+lazyframe__explode(...)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Column names, expressions, or
+a selector defining them. The underlying columns being exploded must be of
+the \code{List} or \code{Array} data type.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Explode the DataFrame to long format by exploding the given columns
+}
+\examples{
+lf <- pl$LazyFrame(
+  letters = c("a", "a", "b", "c"),
+  numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8))
+)
+
+lf$explode("numbers")$collect()
+}
diff --git a/man/lazyframe__fetch.Rd b/man/lazyframe__fetch.Rd
new file mode 100644
index 00000000..21345e1b
--- /dev/null
+++ b/man/lazyframe__fetch.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__fetch}
+\alias{lazyframe__fetch}
+\title{Fetch \code{n} rows of a LazyFrame}
+\usage{
+lazyframe__fetch(
+  n_rows = 500,
+  ...,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  comm_subplan_elim = TRUE,
+  comm_subexpr_elim = TRUE,
+  cluster_with_columns = TRUE,
+  streaming = FALSE,
+  no_optimization = FALSE
+)
+}
+\arguments{
+\item{n_rows}{Integer. Maximum number of rows to fetch.}
+
+\item{...}{Dots which should be empty.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
+occur on self-joins or unions.}
+
+\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
+reused.}
+
+\item{cluster_with_columns}{Combine sequential independent calls to
+\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+
+\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
+query in batches to handle larger-than-memory data. If \code{FALSE} (default),
+the entire query is processed in a single batch.}
+}
+\value{
+A DataFrame of maximum n_rows
+}
+\description{
+This is similar to \verb{$collect()} but limit the number of rows to collect. It
+is mostly useful to check that a query works as expected.
+}
+\details{
+\verb{$fetch()} does not guarantee the final number of rows in the DataFrame output.
+It only guarantees that \code{n} rows are used at the beginning of the query.
+Filters, join operations and a lower number of rows available in the scanned
+file influence the final number of rows.
+}
+\examples{
+# fetch 3 rows
+as_polars_lf(iris)$fetch(3)
+
+# this fetch-query returns 4 rows, because we started with 3 and appended one
+# row in the query (see section 'Details')
+as_polars_lf(iris)$
+  select(pl$col("Species")$append("flora gigantica, alien"))$
+  fetch(3)
+}
+\seealso{
+\itemize{
+\item \code{\link[=lazyframe__collect]{$collect()}} - regular collect.
+\item \code{\link[=lazyframe__profile]{$profile()}} - same as \verb{$collect()} but also returns
+a table with each operation profiled.
+\item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking
+collect returns a future handle. Can also just be used via
+\verb{$collect(collect_in_background = TRUE)}.
+\item \code{\link[=lazyframe__sink_parquet]{$sink_parquet()}} streams query to a parquet file.
+\item \code{\link[=lazyframe__sink_ipc]{$sink_ipc()}} streams query to a arrow file.
+}
+}
diff --git a/man/lazyframe__filter.Rd b/man/lazyframe__filter.Rd
new file mode 100644
index 00000000..22a94990
--- /dev/null
+++ b/man/lazyframe__filter.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__filter}
+\alias{lazyframe__filter}
+\title{Filter the rows in the LazyFrame based on a predicate expression}
+\usage{
+lazyframe__filter(...)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Expression that evaluates to
+a boolean Series.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+The original order of the remaining rows is preserved. Rows where the filter
+does not evaluate to \code{TRUE} are discarded, including nulls.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = c(1, 2, 3, NA, 4, NA, 0),
+  bar = c(6, 7, 8, NA, NA, 9, 0),
+  ham = c("a", "b", "c", NA, "d", "e", "f")
+)
+
+# Filter on one condition
+lf$filter(pl$col("foo") > 1)$collect()
+
+# Filter on multiple conditions
+lf$filter((pl$col("foo") < 3) & (pl$col("ham") == "a"))$collect()
+
+# Filter on an OR condition
+lf$filter((pl$col("foo") == 1) | (pl$col("ham") == " c"))$collect()
+
+# Filter by comparing two columns against each other
+lf$filter(pl$col("foo") == pl$col("bar"))$collect()
+lf$filter(pl$col("foo") != pl$col("bar"))$collect()
+
+# Notice how the row with null values is filtered out$ In order to keep the
+# rows with nulls, use:
+lf$filter(pl$col("foo")$ne_missing(pl$col("bar")))$collect()
+}
diff --git a/man/lazyframe__first.Rd b/man/lazyframe__first.Rd
new file mode 100644
index 00000000..20d03d55
--- /dev/null
+++ b/man/lazyframe__first.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__first}
+\alias{lazyframe__first}
+\title{Get the first row of the LazyFrame}
+\usage{
+lazyframe__first()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Get the first row of the LazyFrame
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$first()$collect()
+}
diff --git a/man/lazyframe__gather_every.Rd b/man/lazyframe__gather_every.Rd
new file mode 100644
index 00000000..7eaf654d
--- /dev/null
+++ b/man/lazyframe__gather_every.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__gather_every}
+\alias{lazyframe__gather_every}
+\title{Take every nth row in the LazyFrame}
+\usage{
+lazyframe__gather_every(n, offset = 0)
+}
+\arguments{
+\item{n}{Gather every \code{n}-th row.}
+
+\item{offset}{Starting index.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Take every nth row in the LazyFrame
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = 5:8)
+lf$gather_every(2)$collect()
+
+lf$gather_every(2, offset = 1)$collect()
+}
diff --git a/man/lazyframe__group_by.Rd b/man/lazyframe__group_by.Rd
new file mode 100644
index 00000000..51afd1ca
--- /dev/null
+++ b/man/lazyframe__group_by.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__group_by}
+\alias{lazyframe__group_by}
+\title{Start a group by operation}
+\usage{
+lazyframe__group_by(..., .maintain_order = FALSE)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Column(s) to group by.
+Accepts expression input. Strings are parsed as column names.}
+
+\item{.maintain_order}{Ensure that the order of the groups is consistent with
+the input data. This is slower than a default group by. Setting this to
+\code{TRUE} blocks the possibility to run on the streaming engine.}
+}
+\value{
+A lazy groupby
+}
+\description{
+Start a group by operation
+}
+\examples{
+# Group by one column and call agg() to compute the grouped sum of another
+# column.
+lf <- pl$LazyFrame(
+  a = c("a", "b", "a", "b", "c"),
+  b = c(1, 2, 1, 3, 3),
+  c = c(5, 4, 3, 2, 1)
+)
+lf$group_by("a")$agg(pl$col("b")$sum())$collect()
+
+# Set .maintain_order = TRUE to ensure the order of the groups is consistent
+# with the input.
+lf$group_by("a", .maintain_order = TRUE)$agg(pl$col("b")$sum())$collect()
+
+# Group by multiple columns by passing a vector of column names.
+lf$group_by(c("a", "b"))$agg(pl$col("c")$max())$collect()
+
+# Or use positional arguments to group by multiple columns in the same way.
+# Expressions are also accepted.
+lf$
+  group_by("a", pl$col("b") / 2)$
+  agg(pl$col("c")$mean())$collect()
+}
diff --git a/man/lazyframe__group_by_dynamic.Rd b/man/lazyframe__group_by_dynamic.Rd
new file mode 100644
index 00000000..624890ad
--- /dev/null
+++ b/man/lazyframe__group_by_dynamic.Rd
@@ -0,0 +1,178 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__group_by_dynamic}
+\alias{lazyframe__group_by_dynamic}
+\title{Group based on a date/time or integer column}
+\usage{
+lazyframe__group_by_dynamic(
+  index_column,
+  ...,
+  every,
+  period = NULL,
+  offset = NULL,
+  include_boundaries = FALSE,
+  closed = "left",
+  label = "left",
+  group_by = NULL,
+  start_by = "window"
+)
+}
+\arguments{
+\item{index_column}{Column used to group based on the time window. Often of
+type Date/Datetime. This column must be sorted in ascending order (or, if
+\code{group_by} is specified, then it must be sorted in ascending order within
+each group).
+In case of a dynamic group by on indices, the data type needs to be either
+Int32 or In64. Note that Int32 gets temporarily cast to Int64, so if
+performance matters, use an Int64 column.}
+
+\item{...}{Dots which should be empty.}
+
+\item{every}{Interval of the window.}
+
+\item{period}{Length of the window. If \code{NULL} (default), it will equal
+\code{every}.}
+
+\item{offset}{Offset of the window, does not take effect if
+\code{start_by = "datapoint"}. Defaults to zero.}
+
+\item{include_boundaries}{Add two columns \code{"_lower_boundary"} and
+\code{"_upper_boundary"} columns that show the boundaries of the window. This will
+impact performance because it’s harder to parallelize.}
+
+\item{closed}{Define which sides of the interval are closed (inclusive).
+Default is \code{"left"}.}
+
+\item{label}{Define which label to use for the window:
+\itemize{
+\item \code{"left"}: lower boundary of the window
+\item \code{"right"}: upper boundary of the window
+\item \code{"datapoint"}: the first value of the index column in the given window. If
+you don’t need the label to be at one of the boundaries, choose this option
+for maximum performance.
+}}
+
+\item{start_by}{The strategy to determine the start of the first window by:
+\itemize{
+\item \code{"window"}: start by taking the earliest timestamp, truncating it with
+\code{every}, and then adding \code{offset}. Note that weekly windows start on
+Monday.
+\item \code{"datapoint"}: start from the first encountered data point.
+\item a day of the week (only takes effect if \code{every} contains \code{"w"}): \code{"monday"}
+starts the window on the Monday before the first data point, etc.
+}}
+}
+\value{
+A \link[=LazyGroupBy_class]{LazyGroupBy} object
+}
+\description{
+Time windows are calculated and rows are assigned to windows. Different from
+a normal group by is that a row can be member of multiple groups. By
+default, the windows look like:
+\itemize{
+\item [start, start + period)
+\item [start + every, start + every + period)
+\item [start + 2\emph{every, start + 2}every + period)
+\item …
+}
+}
+\details{
+where \code{start} is determined by \code{start_by}, \code{offset}, \code{every}, and the
+earliest datapoint. See the \code{start_by} argument description for details.
+
+The \code{every}, \code{period}, and \code{offset} arguments are created with the following
+string language:
+\itemize{
+\item 1ns # 1 nanosecond
+\item 1us # 1 microsecond
+\item 1ms # 1 millisecond
+\item 1s  # 1 second
+\item 1m  # 1 minute
+\item 1h  # 1 hour
+\item 1d  # 1 day
+\item 1w  # 1 calendar week
+\item 1mo # 1 calendar month
+\item 1y  # 1 calendar year
+These strings can be combined:
+\itemize{
+\item 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds
+}
+}
+
+In case of a \code{group_by_dynamic} on an integer column, the windows are
+defined by:
+\itemize{
+\item 1i # length 1
+\item 10i # length 10
+}
+}
+\examples{
+lf <- pl$select(
+  time = pl$datetime_range(
+    start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"),
+    end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"),
+    interval = "30m"
+  ),
+  n = 0:6
+)$lazy()
+lf$collect()
+
+# Group by windows of 1 hour.
+lf$group_by_dynamic("time", every = "1h", closed = "right")$agg(
+  vals = pl$col("n")
+)$collect()
+
+# The window boundaries can also be added to the aggregation result
+lf$group_by_dynamic(
+  "time",
+  every = "1h", include_boundaries = TRUE, closed = "right"
+)$agg(
+  pl$col("n")$mean()
+)$collect()
+
+# When closed = "left", the window excludes the right end of interval:
+# [lower_bound, upper_bound)
+lf$group_by_dynamic("time", every = "1h", closed = "left")$agg(
+  pl$col("n")
+)$collect()
+
+# When closed = "both" the time values at the window boundaries belong to 2
+# groups.
+lf$group_by_dynamic("time", every = "1h", closed = "both")$agg(
+  pl$col("n")
+)$collect()
+
+# Dynamic group bys can also be combined with grouping on normal keys
+lf <- lf$with_columns(
+  groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a"))
+)
+lf$collect()
+
+lf$group_by_dynamic(
+  "time",
+  every = "1h",
+  closed = "both",
+  group_by = "groups",
+  include_boundaries = TRUE
+)$agg(pl$col("n"))$collect()
+
+# We can also create a dynamic group by based on an index column
+lf <- pl$LazyFrame(
+  idx = 0:5,
+  A = c("A", "A", "B", "B", "B", "C")
+)$with_columns(pl$col("idx")$set_sorted())
+lf$collect()
+
+lf$group_by_dynamic(
+  "idx",
+  every = "2i",
+  period = "3i",
+  include_boundaries = TRUE,
+  closed = "right"
+)$agg(A_agg_list = pl$col("A"))$collect()
+}
+\seealso{
+\itemize{
+\item \code{\link[=lazyframe__rolling]{<LazyFrame>$rolling()}}
+}
+}
diff --git a/man/lazyframe__head.Rd b/man/lazyframe__head.Rd
new file mode 100644
index 00000000..3b202274
--- /dev/null
+++ b/man/lazyframe__head.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__head}
+\alias{lazyframe__head}
+\title{Get the first \code{n} rows}
+\usage{
+lazyframe__head(n = 5)
+}
+\arguments{
+\item{n}{Number of rows to return.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Get the first \code{n} rows
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+lf$head()$collect()
+lf$head(2)$collect()
+}
diff --git a/man/lazyframe__interpolate.Rd b/man/lazyframe__interpolate.Rd
new file mode 100644
index 00000000..b6cafaf6
--- /dev/null
+++ b/man/lazyframe__interpolate.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__interpolate}
+\alias{lazyframe__interpolate}
+\title{Interpolate intermediate values}
+\usage{
+lazyframe__interpolate()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+The interpolation method is linear.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = c(1, NA, 9, 10),
+  bar = c(6, 7, 9, NA),
+  ham = c(1, NA, NA, 9)
+)
+
+lf$interpolate()$collect()
+}
diff --git a/man/lazyframe__join.Rd b/man/lazyframe__join.Rd
new file mode 100644
index 00000000..653d5d65
--- /dev/null
+++ b/man/lazyframe__join.Rd
@@ -0,0 +1,108 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__join}
+\alias{lazyframe__join}
+\title{Join LazyFrames}
+\usage{
+lazyframe__join(
+  other,
+  on = NULL,
+  how = "inner",
+  ...,
+  left_on = NULL,
+  right_on = NULL,
+  suffix = "_right",
+  validate = "m:m",
+  join_nulls = FALSE,
+  allow_parallel = TRUE,
+  force_parallel = FALSE,
+  coalesce = NULL
+)
+}
+\arguments{
+\item{other}{LazyFrame to join with.}
+
+\item{on}{Either a vector of column names or a list of expressions and/or
+strings. Use \code{left_on} and \code{right_on} if the column names to match on are
+different between the two DataFrames.}
+
+\item{how}{One of the following methods:
+\itemize{
+\item "inner": returns rows that have matching values in both tables
+\item "left": returns all rows from the left table, and the matched rows from
+the right table
+\item "right": returns all rows from the right table, and the matched rows from
+the left table
+\item "full": returns all rows when there is a match in either left or right
+table
+\item "cross": returns the Cartesian product of rows from both tables
+\item "semi": returns rows from the left table that have a match in the right
+table.
+\item "anti": returns rows from the left table that have no match in the right
+table.
+}}
+
+\item{...}{Dots which should be empty.}
+
+\item{left_on, right_on}{Same as \code{on} but only for the left or the right
+DataFrame. They must have the same length.}
+
+\item{suffix}{Suffix to add to duplicated column names.}
+
+\item{validate}{Checks if join is of specified type:
+\itemize{
+\item \code{"m:m"} (default): many-to-many, doesn't perform any checks;
+\item \code{"1:1"}: one-to-one, check if join keys are unique in both left and right
+datasets;
+\item \code{"1:m"}: one-to-many, check if join keys are unique in left dataset
+\item \code{"m:1"}: many-to-one, check if join keys are unique in right dataset
+}
+
+Note that this is currently not supported by the streaming engine.}
+
+\item{join_nulls}{Join on null values. By default null values will never
+produce matches.}
+
+\item{allow_parallel}{Allow the physical plan to optionally evaluate the
+computation of both DataFrames up to the join in parallel.}
+
+\item{force_parallel}{Force the physical plan to evaluate the computation of
+both DataFrames up to the join in parallel.}
+
+\item{coalesce}{Coalescing behavior (merging of join columns).
+\itemize{
+\item \code{NULL}: join specific.
+\item \code{TRUE}: Always coalesce join columns.
+\item \code{FALSE}: Never coalesce join columns.
+Note that joining on any other expressions than \code{col} will turn off
+coalescing.
+}}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This function can do both mutating joins (adding columns based on matching
+observations, for example with \code{how = "left"}) and filtering joins (keeping
+observations based on matching observations, for example with \code{how = "inner"}).
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = c(6, 7, 8),
+  ham = c("a", "b", "c")
+)
+other_lf <- pl$LazyFrame(
+  apple = c("x", "y", "z"),
+  ham = c("a", "b", "d")
+)
+lf$join(other_lf, on = "ham")$collect()
+
+lf$join(other_lf, on = "ham", how = "full")$collect()
+
+lf$join(other_lf, on = "ham", how = "left", coalesce = TRUE)$collect()
+
+lf$join(other_lf, on = "ham", how = "semi")$collect()
+
+lf$join(other_lf, on = "ham", how = "anti")$collect()
+}
diff --git a/man/lazyframe__join_asof.Rd b/man/lazyframe__join_asof.Rd
new file mode 100644
index 00000000..fbc72693
--- /dev/null
+++ b/man/lazyframe__join_asof.Rd
@@ -0,0 +1,134 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__join_asof}
+\alias{lazyframe__join_asof}
+\title{Perform joins on nearest keys}
+\usage{
+lazyframe__join_asof(
+  other,
+  ...,
+  left_on = NULL,
+  right_on = NULL,
+  on = NULL,
+  by_left = NULL,
+  by_right = NULL,
+  by = NULL,
+  strategy = c("backward", "forward", "nearest"),
+  suffix = "_right",
+  tolerance = NULL,
+  allow_parallel = TRUE,
+  force_parallel = FALSE,
+  coalesce = TRUE
+)
+}
+\arguments{
+\item{other}{LazyFrame}
+
+\item{...}{Not used, blocks use of further positional arguments}
+
+\item{by_left, by_right}{Same as \code{by} but only for the left or the right
+table. They must have the same length.}
+
+\item{by}{Join on these columns before performing asof join. Either a vector
+of column names or a list of expressions and/or strings. Use \code{left_by} and
+\code{right_by} if the column names to match on are different between the two
+tables.}
+
+\item{strategy}{Strategy for where to find match:
+\itemize{
+\item "backward" (default): search for the last row in the right table whose \code{on}
+key is less than or equal to the left key.
+\item "forward": search for the first row in the right table whose \code{on} key is
+greater than or equal to the left key.
+\item "nearest": search for the last row in the right table whose value is nearest
+to the left key. String keys are not currently supported for a nearest
+search.
+}}
+
+\item{tolerance}{Numeric tolerance. By setting this the join will only be done if the near
+keys are within this distance. If an asof join is done on columns of dtype
+"Date", "Datetime", "Duration" or "Time", use the Polars duration string language.
+About the language, see the \verb{Polars duration string language} section for details.
+
+There may be a circumstance where R types are not sufficient to express a
+numeric tolerance. In that case, you can use the expression syntax like
+\code{tolerance = pl$lit(42)$cast(pl$Uint64)}}
+
+\item{coalesce}{Coalescing behavior (merging of \code{on} / \code{left_on} / \code{right_on}
+columns):
+\itemize{
+\item \code{TRUE}: Always coalesce join columns;
+\item \code{FALSE}: Never coalesce join columns.
+Note that joining on any other expressions than \code{col} will turn off coalescing.
+}}
+}
+\description{
+This is similar to a left-join except that we match on nearest key rather
+than equal keys.
+}
+\details{
+Both tables (DataFrames or LazyFrames) must be sorted by the asof_join key.
+}
+\section{Polars duration string language}{
+
+Polars duration string language is a simple representation of
+durations. It is used in many Polars functions that accept durations.
+
+It has the following format:
+\itemize{
+\item 1ns (1 nanosecond)
+\item 1us (1 microsecond)
+\item 1ms (1 millisecond)
+\item 1s (1 second)
+\item 1m (1 minute)
+\item 1h (1 hour)
+\item 1d (1 calendar day)
+\item 1w (1 calendar week)
+\item 1mo (1 calendar month)
+\item 1q (1 calendar quarter)
+\item 1y (1 calendar year)
+}
+
+Or combine them: \code{"3d12h4m25s"} # 3 days, 12 hours, 4 minutes, and 25 seconds
+
+By "calendar day", we mean the corresponding time on the next day
+(which may not be 24 hours, due to daylight savings).
+Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
+}
+
+\examples{
+#
+# create two LazyFrame to join asof
+gdp <- pl$LazyFrame(
+  date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")),
+  gdp = c(4321, 4164, 4411, 4566, 4696),
+  group = c("b", "a", "a", "b", "b")
+)
+
+pop <- pl$LazyFrame(
+  date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")),
+  population = c(82.19, 82.66, 83.12, 83.52),
+  group = c("b", "b", "a", "a")
+)
+
+# optional make sure tables are already sorted with "on" join-key
+gdp <- gdp$sort("date")
+pop <- pop$sort("date")
+
+
+# Left-join_asof LazyFrame pop with gdp on "date"
+# Look backward in gdp to find closest matching date
+pop$join_asof(gdp, on = "date", strategy = "backward")$collect()
+
+# .... and forward
+pop$join_asof(gdp, on = "date", strategy = "forward")$collect()
+
+# join by a group: "only look within groups"
+pop$join_asof(gdp, on = "date", by = "group", strategy = "backward")$collect()
+
+# only look 2 weeks and 2 days back
+pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d")$collect()
+
+# only look 11 days back (numeric tolerance depends on polars type, <date> is in days)
+pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)$collect()
+}
diff --git a/man/lazyframe__join_where.Rd b/man/lazyframe__join_where.Rd
new file mode 100644
index 00000000..28a6a450
--- /dev/null
+++ b/man/lazyframe__join_where.Rd
@@ -0,0 +1,52 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__join_where}
+\alias{lazyframe__join_where}
+\title{Perform a join based on one or multiple (in)equality predicates}
+\usage{
+lazyframe__join_where(other, ..., suffix = "_right")
+}
+\arguments{
+\item{other}{LazyFrame to join with.}
+
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> (In)Equality condition to
+join the two tables on. When a column name occurs in both tables, the proper
+suffix must be applied in the predicate. For example, if both tables have a
+column \code{"x"} that you want to use in the conditions, you must refer to the
+column of the right table as \code{"x<suffix>"}.}
+
+\item{suffix}{Suffix to append to columns with a duplicate name.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+This performs an inner join, so only rows where all predicates are true are
+included in the result, and a row from either LazyFrame may be included
+multiple times in the result.
+
+Note that the row order of the input LazyFrames is not preserved.
+}
+\examples{
+east <- pl$LazyFrame(
+  id = c(100, 101, 102),
+  dur = c(120, 140, 160),
+  rev = c(12, 14, 16),
+  cores = c(2, 8, 4)
+)
+
+west <- pl$LazyFrame(
+  t_id = c(404, 498, 676, 742),
+  time = c(90, 130, 150, 170),
+  cost = c(9, 13, 15, 16),
+  cores = c(4, 2, 1, 4)
+)
+
+east$join_where(
+  west,
+  pl$col("dur") < pl$col("time"),
+  pl$col("rev") < pl$col("cost")
+)$collect()
+}
diff --git a/man/lazyframe__last.Rd b/man/lazyframe__last.Rd
new file mode 100644
index 00000000..a1ab582c
--- /dev/null
+++ b/man/lazyframe__last.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__last}
+\alias{lazyframe__last}
+\title{Get the last row of the LazyFrame}
+\usage{
+lazyframe__last()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Get the last row of the LazyFrame
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$last()$collect()
+}
diff --git a/man/lazyframe__limit.Rd b/man/lazyframe__limit.Rd
new file mode 100644
index 00000000..8f323354
--- /dev/null
+++ b/man/lazyframe__limit.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__limit}
+\alias{lazyframe__limit}
+\title{Get the first \code{n} rows}
+\usage{
+lazyframe__limit(n = 5)
+}
+\arguments{
+\item{n}{Number of rows to return.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Alias for \code{\link[=lazyframe__head]{<LazyFrame>$head()}}.
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+lf$limit()$collect()
+lf$limit(2)$collect()
+}
diff --git a/man/lazyframe__max.Rd b/man/lazyframe__max.Rd
new file mode 100644
index 00000000..f798a561
--- /dev/null
+++ b/man/lazyframe__max.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__max}
+\alias{lazyframe__max}
+\title{Aggregate the columns in the LazyFrame to their maximum value}
+\usage{
+lazyframe__max()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns in the LazyFrame to their maximum value
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$max()$collect()
+}
diff --git a/man/lazyframe__mean.Rd b/man/lazyframe__mean.Rd
new file mode 100644
index 00000000..f19405d0
--- /dev/null
+++ b/man/lazyframe__mean.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__mean}
+\alias{lazyframe__mean}
+\title{Aggregate the columns in the LazyFrame to their mean value}
+\usage{
+lazyframe__mean()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns in the LazyFrame to their mean value
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$mean()$collect()
+}
diff --git a/man/lazyframe__median.Rd b/man/lazyframe__median.Rd
new file mode 100644
index 00000000..7bcf7a69
--- /dev/null
+++ b/man/lazyframe__median.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__median}
+\alias{lazyframe__median}
+\title{Aggregate the columns in the LazyFrame to their median value}
+\usage{
+lazyframe__median()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns in the LazyFrame to their median value
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$median()$collect()
+}
diff --git a/man/lazyframe__merge_sorted.Rd b/man/lazyframe__merge_sorted.Rd
new file mode 100644
index 00000000..1b7eea03
--- /dev/null
+++ b/man/lazyframe__merge_sorted.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__merge_sorted}
+\alias{lazyframe__merge_sorted}
+\title{Take two sorted DataFrames and merge them by the sorted key}
+\usage{
+lazyframe__merge_sorted(other, key)
+}
+\arguments{
+\item{other}{Other DataFrame that must be merged.}
+
+\item{key}{Key that is sorted.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+The output of this operation will also be sorted. It is the callers
+responsibility that the frames are sorted by that key, otherwise the output
+will not make sense. The schemas of both LazyFrames must be equal.
+}
+\examples{
+lf1 <- pl$LazyFrame(
+  name = c("steve", "elise", "bob"),
+  age = c(42, 44, 18)
+)$sort("age")
+
+lf2 <- pl$LazyFrame(
+  name = c("anna", "megan", "steve", "thomas"),
+  age = c(21, 33, 42, 20)
+)$sort("age")
+
+lf1$merge_sorted(lf2, key = "age")$collect()
+}
diff --git a/man/lazyframe__min.Rd b/man/lazyframe__min.Rd
new file mode 100644
index 00000000..a2946a86
--- /dev/null
+++ b/man/lazyframe__min.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__min}
+\alias{lazyframe__min}
+\title{Aggregate the columns in the LazyFrame to their minimum value}
+\usage{
+lazyframe__min()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns in the LazyFrame to their minimum value
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$min()$collect()
+}
diff --git a/man/lazyframe__null_count.Rd b/man/lazyframe__null_count.Rd
new file mode 100644
index 00000000..ec9955fe
--- /dev/null
+++ b/man/lazyframe__null_count.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__null_count}
+\alias{lazyframe__null_count}
+\title{Return the number of null elements for each column}
+\usage{
+lazyframe__null_count()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Return the number of null elements for each column
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, NA), c = rep(NA, 4))
+lf$null_count()$collect()
+}
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
new file mode 100644
index 00000000..11210994
--- /dev/null
+++ b/man/lazyframe__profile.Rd
@@ -0,0 +1,107 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__profile}
+\alias{lazyframe__profile}
+\title{Collect and profile a lazy query.}
+\usage{
+lazyframe__profile(
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  comm_subplan_elim = TRUE,
+  comm_subexpr_elim = TRUE,
+  cluster_with_columns = TRUE,
+  streaming = FALSE,
+  no_optimization = FALSE,
+  collect_in_background = FALSE,
+  show_plot = FALSE,
+  truncate_nodes = 0
+)
+}
+\arguments{
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
+occur on self-joins or unions.}
+
+\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
+reused.}
+
+\item{cluster_with_columns}{Combine sequential independent calls to
+\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+
+\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
+query in batches to handle larger-than-memory data. If \code{FALSE} (default),
+the entire query is processed in a single batch.}
+
+\item{show_plot}{Show a Gantt chart of the profiling result}
+
+\item{truncate_nodes}{Truncate the label lengths in the Gantt chart to this
+number of characters. If \code{0} (default), do not truncate.}
+}
+\value{
+List of two \code{DataFrame}s: one with the collected result, the other
+with the timings of each step. If \code{show_graph = TRUE}, then the plot is
+also stored in the list.
+}
+\description{
+This will run the query and return a list containing the
+materialized DataFrame and a DataFrame that contains profiling information
+of each node that is executed.
+}
+\details{
+The units of the timings are microseconds.
+}
+\examples{
+## Simplest use case
+pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
+
+## Use $profile() to compare two queries
+
+# -1-  map each Species-group with native polars, takes ~120us only
+as_polars_lf(iris)$
+  sort("Sepal.Length")$
+  group_by("Species", maintain_order = TRUE)$
+  agg(pl$col(pl$Float64)$first() + 5)$
+  profile()
+
+# -2-  map each Species-group of each numeric column with an R function, takes ~7000us (slow!)
+
+# some R function, prints `.` for each time called by polars
+r_func <- \(s) {
+  cat(".")
+  s$to_r()[1] + 5
+}
+
+as_polars_lf(iris)$
+  sort("Sepal.Length")$
+  group_by("Species", maintain_order = TRUE)$
+  agg(pl$col(pl$Float64)$map_elements(r_func))$
+  profile()
+}
+\seealso{
+\itemize{
+\item \code{\link[=lazyframe__collect]{$collect()}} - regular collect.
+\item \code{\link[=lazyframe__fetch]{$fetch()}} - fast limited query check
+\item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking
+collect returns a future handle. Can also just be used via
+\verb{$collect(collect_in_background = TRUE)}.
+\item \code{\link[=lazyframe__sink_parquet]{$sink_parquet()}} streams query to a parquet file.
+\item \code{\link[=lazyframe__sink_ipc]{$sink_ipc()}} streams query to a arrow file.
+}
+}
diff --git a/man/lazyframe__quantile.Rd b/man/lazyframe__quantile.Rd
new file mode 100644
index 00000000..7621d742
--- /dev/null
+++ b/man/lazyframe__quantile.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__quantile}
+\alias{lazyframe__quantile}
+\title{Aggregate the columns in the DataFrame to a unique quantile value}
+\usage{
+lazyframe__quantile(
+  quantile,
+  interpolation = c("nearest", "higher", "lower", "midpoint", "linear")
+)
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns in the DataFrame to a unique quantile value
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$quantile(0.7)$collect()
+}
diff --git a/man/lazyframe__rename.Rd b/man/lazyframe__rename.Rd
new file mode 100644
index 00000000..b34bef06
--- /dev/null
+++ b/man/lazyframe__rename.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__rename}
+\alias{lazyframe__rename}
+\title{Rename column names}
+\usage{
+lazyframe__rename(mapping, ..., strict = TRUE)
+}
+\arguments{
+\item{mapping}{Either a function that takes a character vector as input and
+returns one as input, or a named list where names are old column names and
+values are the new ones.}
+
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> If \code{mapping} is missing,
+those values are used.}
+
+\item{strict}{Validate that all column names exist in the current schema,
+and throw an error if any do not. (Note that this parameter is a no-op when
+passing a function to \code{mapping}).}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Rename column names
+}
+\details{
+If existing names are swapped (e.g. 'A' points to 'B' and 'B' points to
+'A'), polars will block projection and predicate pushdowns at this node.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = 6:8,
+  ham = letters[1:3]
+)
+
+lf$rename(foo = "apple")$collect()
+
+lf$rename(
+  \(column_name) paste0("c", substr(column_name, 2, 100))
+)$collect()
+}
diff --git a/man/lazyframe__reverse.Rd b/man/lazyframe__reverse.Rd
new file mode 100644
index 00000000..d9675b14
--- /dev/null
+++ b/man/lazyframe__reverse.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__reverse}
+\alias{lazyframe__reverse}
+\title{Reverse the LazyFrame}
+\usage{
+lazyframe__reverse()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Reverse the LazyFrame
+}
+\examples{
+lf <- pl$LazyFrame(key = c("a", "b", "c"), val = 1:3)
+lf$reverse()$collect()
+}
diff --git a/man/lazyframe__rolling.Rd b/man/lazyframe__rolling.Rd
new file mode 100644
index 00000000..dec217e3
--- /dev/null
+++ b/man/lazyframe__rolling.Rd
@@ -0,0 +1,83 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__rolling}
+\alias{lazyframe__rolling}
+\title{Create rolling groups based on a date/time or integer column}
+\usage{
+lazyframe__rolling(
+  index_column,
+  ...,
+  period,
+  offset = NULL,
+  closed = "right",
+  group_by = NULL
+)
+}
+\arguments{
+\item{index_column}{Column used to group based on the time window. Often of
+type Date/Datetime. This column must be sorted in ascending order (or, if
+\code{group_by} is specified, then it must be sorted in ascending order within
+each group).
+In case of a dynamic group by on indices, the data type needs to be either
+Int32 or In64. Note that Int32 gets temporarily cast to Int64, so if
+performance matters, use an Int64 column.}
+
+\item{...}{Dots which should be empty.}
+
+\item{period}{Length of the window - must be non-negative.}
+
+\item{offset}{Offset of the window. Default is \code{-period}.}
+
+\item{closed}{Define which sides of the interval are closed (inclusive).
+Default is \code{"left"}.}
+}
+\value{
+A \link[=LazyGroupBy_class]{LazyGroupBy} object
+}
+\description{
+Different from \code{group_by_dynamic}, the windows are now determined by the
+individual values and are not of constant intervals. For constant intervals
+use \code{\link[=lazyframe__group_by_dynamic]{<LazyFrame>$group_by_dynamic()}}.
+
+If you have a time series \verb{<t_0, t_1, ..., t_n>}, then by default the
+windows created will be:
+\itemize{
+\item \verb{(t_0 - period, t_0]}
+\item \verb{(t_1 - period, t_1]}
+\item …
+\item \verb{(t_n - period, t_n]}
+}
+
+whereas if you pass a non-default \code{offset}, then the windows will be:
+\itemize{
+\item \verb{(t_0 + offset, t_0 + offset + period]}
+\item \verb{(t_1 + offset, t_1 + offset + period]}
+\item …
+\item \verb{(t_n + offset, t_n + offset + period]}
+}
+}
+\examples{
+dates <- c(
+  "2020-01-01 13:45:48",
+  "2020-01-01 16:42:13",
+  "2020-01-01 16:45:09",
+  "2020-01-02 18:12:48",
+  "2020-01-03 19:45:32",
+  "2020-01-08 23:16:43"
+)
+
+df <- pl$LazyFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$with_columns(
+  pl$col("dt")$str$strptime(pl$Datetime())
+)
+
+df$rolling(index_column = "dt", period = "2d")$agg(
+  sum_a = pl$col("a")$sum(),
+  min_a = pl$col("a")$min(),
+  max_a = pl$col("a")$max()
+)$collect()
+}
+\seealso{
+\itemize{
+\item \code{\link[=lazyframe__group_by_dynamic]{<LazyFrame>$group_by_dynamic()}}
+}
+}
diff --git a/man/lazyframe__select.Rd b/man/lazyframe__select.Rd
index 3ef41b08..e6586188 100644
--- a/man/lazyframe__select.Rd
+++ b/man/lazyframe__select.Rd
@@ -7,11 +7,11 @@
 lazyframe__select(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
-Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
-by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
-Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
-Each name will be used as the expression name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
 }
 \value{
 A polars \link{LazyFrame}
diff --git a/man/lazyframe__select_seq.Rd b/man/lazyframe__select_seq.Rd
new file mode 100644
index 00000000..7fd3a7fc
--- /dev/null
+++ b/man/lazyframe__select_seq.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__select_seq}
+\alias{lazyframe__select_seq}
+\title{Select columns from this LazyFrame}
+\usage{
+lazyframe__select_seq(...)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This will run all expression sequentially instead of in parallel. Use this
+when the work per expression is cheap.
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = 1:3,
+  bar = 6:8,
+  ham = letters[1:3]
+)
+lf$select_seq("foo")$collect()
+}
diff --git a/man/lazyframe__set_sorted.Rd b/man/lazyframe__set_sorted.Rd
new file mode 100644
index 00000000..bf53468a
--- /dev/null
+++ b/man/lazyframe__set_sorted.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__set_sorted}
+\alias{lazyframe__set_sorted}
+\title{Indicate that one or multiple columns are sorted}
+\usage{
+lazyframe__set_sorted(column, ..., descending = FALSE)
+}
+\arguments{
+\item{column}{Columns that are sorted.}
+
+\item{...}{Dots which should be empty.}
+
+\item{descending}{Whether the columns are sorted in descending order.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This can speed up future operations, but it can lead to incorrect results if
+the data is \strong{not} sorted! Use with care!
+}
diff --git a/man/lazyframe__shift.Rd b/man/lazyframe__shift.Rd
new file mode 100644
index 00000000..e4fafbb2
--- /dev/null
+++ b/man/lazyframe__shift.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__shift}
+\alias{lazyframe__shift}
+\title{Shift values by the given number of indices}
+\usage{
+lazyframe__shift(n = 1, ..., fill_value = NULL)
+}
+\arguments{
+\item{n}{Number of indices to shift forward. If a negative value is passed,
+values are shifted in the opposite direction instead.}
+
+\item{...}{Dots which should be empty.}
+
+\item{fill_value}{Fill the resulting null values with this value. Accepts
+expression input. Non-expression inputs are parsed as literals.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Shift values by the given number of indices
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = 5:8)
+
+# By default, values are shifted forward by one index.
+lf$shift()$collect()
+
+# Pass a negative value to shift in the opposite direction instead.
+lf$shift(-2)$collect()
+
+# Specify fill_value to fill the resulting null values.
+lf$shift(-2, fill_value = 100)$collect()
+}
diff --git a/man/lazyframe__slice.Rd b/man/lazyframe__slice.Rd
new file mode 100644
index 00000000..c268a768
--- /dev/null
+++ b/man/lazyframe__slice.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__slice}
+\alias{lazyframe__slice}
+\title{Get a slice of the LazyFrame.}
+\usage{
+lazyframe__slice(offset, length = NULL)
+}
+\arguments{
+\item{offset}{Start index. Negative indexing is supported.}
+
+\item{length}{Length of the slice. If \code{NULL} (default), all rows starting at
+the offset will be selected.}
+}
+\value{
+A \link[=lazyframe__class]{LazyFrame}
+}
+\description{
+Get a slice of the LazyFrame.
+}
+\examples{
+lf <- pl$LazyFrame(x = c("a", "b", "c"), y = 1:3, z = 4:6)
+lf$slice(1, 2)$collect()
+}
diff --git a/man/lazyframe__sort.Rd b/man/lazyframe__sort.Rd
new file mode 100644
index 00000000..cd537530
--- /dev/null
+++ b/man/lazyframe__sort.Rd
@@ -0,0 +1,55 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sort}
+\alias{lazyframe__sort}
+\title{Sort the LazyFrame by the given columns}
+\usage{
+lazyframe__sort(
+  ...,
+  descending = FALSE,
+  nulls_last = FALSE,
+  multithreaded = TRUE,
+  maintain_order = FALSE
+)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Column(s) to sort by. Can be
+character values indicating column names or Expr(s).}
+
+\item{descending}{Sort in descending order. When sorting by multiple
+columns, this can be specified per column by passing a logical vector.}
+
+\item{nulls_last}{Place null values last. When sorting by multiple
+columns, this can be specified per column by passing a logical vector.}
+
+\item{multithreaded}{Sort using multiple threads.}
+
+\item{maintain_order}{Whether the order should be maintained if elements are
+equal. If \code{TRUE}, streaming is not possible and performance might be worse
+since this requires a stable search.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Sort the LazyFrame by the given columns
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = c(1, 2, NA, 4),
+  b = c(6, 5, 4, 3),
+  c = c("a", "c", "b", "a")
+)
+
+# Pass a single column name to sort by that column.
+lf$sort("a")$collect()
+
+# Sorting by expressions is also supported
+lf$sort(pl$col("a") + pl$col("b") * 2, nulls_last = TRUE)$collect()
+
+# Sort by multiple columns by passing a vector of columns
+lf$sort(c("c", "a"), descending = TRUE)$collect()
+
+# Or use positional arguments to sort by multiple columns in the same way
+lf$sort("c", "a", descending = c(FALSE, TRUE))$collect()
+}
diff --git a/man/lazyframe__sql.Rd b/man/lazyframe__sql.Rd
new file mode 100644
index 00000000..8daf4d75
--- /dev/null
+++ b/man/lazyframe__sql.Rd
@@ -0,0 +1,64 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sql}
+\alias{lazyframe__sql}
+\title{Execute a SQL query against the LazyFrame}
+\usage{
+lazyframe__sql(query, ..., table_name = NULL, envir = parent.frame())
+}
+\arguments{
+\item{table_name}{\code{NULL} (default) or a character of an explicit name for the table
+that represents the calling frame (the alias \code{"self"} will always be registered/available).}
+}
+\description{
+The calling frame is automatically registered as a table in the SQL context
+under the name \code{"self"}. All \link[=DataFrame_class]{DataFrames} and
+\link[=lazyframe__class]{LazyFrames} found in the \code{envir} are also registered,
+using their variable name.
+More control over registration and execution behaviour is available by
+the \link[=SQLContext_class]{SQLContext} object.
+}
+\details{
+This functionality is considered \strong{unstable}, although it is close to
+being considered stable. It may be changed at any point without it being
+considered a breaking change.
+}
+\examples{
+\dontshow{if (polars_info()$features$sql) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+lf1 <- pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x"))
+lf2 <- pl$LazyFrame(a = 3:1, d = c(125, -654, 888))
+
+# Query the LazyFrame using SQL:
+lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect()
+
+# Join two LazyFrames:
+lf1$sql(
+  "
+SELECT self.*, d
+FROM self
+INNER JOIN lf2 USING (a)
+WHERE a > 1 AND b < 8
+"
+)$collect()
+
+# Apply SQL transforms (aliasing "self" to "frame") and subsequently
+# filter natively (you can freely mix SQL and native operations):
+lf1$sql(
+  query = r"(
+SELECT
+ a,
+MOD(a, 2) == 0 AS a_is_even,
+(b::float / 2) AS 'b/2',
+CONCAT_WS(':', c, c, c) AS c_c_c
+FROM frame
+ORDER BY a
+)",
+  table_name = "frame"
+)$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect()
+\dontshow{\}) # examplesIf}
+}
+\seealso{
+\itemize{
+\item \link[=SQLContext_class]{SQLContext}
+}
+}
diff --git a/man/lazyframe__std.Rd b/man/lazyframe__std.Rd
new file mode 100644
index 00000000..e69e9d76
--- /dev/null
+++ b/man/lazyframe__std.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__std}
+\alias{lazyframe__std}
+\title{Aggregate the columns of this LazyFrame to their standard deviation values}
+\usage{
+lazyframe__std(ddof = 1)
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns of this LazyFrame to their standard deviation values
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$std()$collect()
+lf$std(ddof = 0)$collect()
+}
diff --git a/man/lazyframe__sum.Rd b/man/lazyframe__sum.Rd
new file mode 100644
index 00000000..b1391c71
--- /dev/null
+++ b/man/lazyframe__sum.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sum}
+\alias{lazyframe__sum}
+\title{Aggregate the columns of this LazyFrame to their sum values}
+\usage{
+lazyframe__sum()
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns of this LazyFrame to their sum values
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$sum()$collect()
+}
diff --git a/man/lazyframe__tail.Rd b/man/lazyframe__tail.Rd
new file mode 100644
index 00000000..aebd3e0f
--- /dev/null
+++ b/man/lazyframe__tail.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__tail}
+\alias{lazyframe__tail}
+\title{Get the last \code{n} rows}
+\usage{
+lazyframe__tail(n = 5L)
+
+lazyframe__tail(n = 5L)
+}
+\arguments{
+\item{n}{Number of rows to return.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Get the last \code{n} rows
+
+Get the last \code{n} rows.
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+lf$tail()$collect()
+lf$tail(2)$collect()
+lf <- pl$LazyFrame(a = 1:6, b = 7:12)
+
+lf$tail()$collect()
+
+lf$tail(2)$collect()
+}
+\seealso{
+\code{\link[=lazyframe__head]{<LazyFrame>$head()}}
+}
diff --git a/man/lazyframe__to_dot.Rd b/man/lazyframe__to_dot.Rd
new file mode 100644
index 00000000..3b650103
--- /dev/null
+++ b/man/lazyframe__to_dot.Rd
@@ -0,0 +1,78 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__to_dot}
+\alias{lazyframe__to_dot}
+\title{Plot the query plan}
+\usage{
+lazyframe__to_dot(
+  ...,
+  optimized = TRUE,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  comm_subplan_elim = TRUE,
+  comm_subexpr_elim = TRUE,
+  cluster_with_columns = TRUE,
+  streaming = FALSE
+)
+}
+\arguments{
+\item{...}{Not used..}
+
+\item{optimized}{Optimize the query plan.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
+occur on self-joins or unions.}
+
+\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
+reused.}
+
+\item{cluster_with_columns}{Combine sequential independent calls to
+\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+
+\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
+query in batches to handle larger-than-memory data. If \code{FALSE} (default),
+the entire query is processed in a single batch.}
+}
+\value{
+A character vector
+}
+\description{
+This only returns the "dot" output that can be passed to other packages, such
+as \code{DiagrammeR::grViz()}.
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = c("a", "b", "a", "b", "b", "c"),
+  b = 1:6,
+  c = 6:1
+)
+
+query <- lf$group_by("a", maintain_order = TRUE)$agg(
+  pl$all()$sum()
+)$sort(
+  "a"
+)
+
+query$to_dot() |> cat()
+
+# You could print the graph by using DiagrammeR for example, with
+# query$to_dot() |> DiagrammeR::grViz().
+}
diff --git a/man/lazyframe__top_k.Rd b/man/lazyframe__top_k.Rd
new file mode 100644
index 00000000..60e13e1f
--- /dev/null
+++ b/man/lazyframe__top_k.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__top_k}
+\alias{lazyframe__top_k}
+\title{Return the \code{k} largest rows}
+\usage{
+lazyframe__top_k(k, ..., by, reverse = FALSE)
+}
+\arguments{
+\item{k}{Number of rows to return.}
+
+\item{...}{Dots which should be empty.}
+
+\item{by}{Column(s) used to determine the bottom rows. Accepts expression
+input. Strings are parsed as column names.}
+
+\item{reverse}{Consider the \code{k} smallest elements of the \code{by} column(s)
+(instead of the \code{k} largest). This can be specified per column by passing a
+sequence of booleans.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Non-null elements are always preferred over null elements, regardless of the
+value of \code{reverse}. The output is not guaranteed to be in any particular
+order, call \code{sort()} after this function if you wish the output to be sorted.
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = c("a", "b", "a", "b", "b", "c"),
+  b = c(2, 1, 1, 3, 2, 1)
+)
+
+# Get the rows which contain the 4 largest values in column b.
+lf$top_k(4, by = "b")$collect()
+
+# Get the rows which contain the 4 largest values when sorting on column a
+# and b$
+lf$top_k(4, by = c("a", "b"))$collect()
+}
diff --git a/man/lazyframe__unique.Rd b/man/lazyframe__unique.Rd
new file mode 100644
index 00000000..22d160d3
--- /dev/null
+++ b/man/lazyframe__unique.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__unique}
+\alias{lazyframe__unique}
+\title{Drop duplicate rows from this DataFrame}
+\usage{
+lazyframe__unique(
+  subset = NULL,
+  ...,
+  keep = c("any", "none", "first", "last"),
+  maintain_order = FALSE
+)
+}
+\arguments{
+\item{subset}{Column name(s) or selector(s), to consider when identifying
+duplicate rows. If \code{NULL} (default), use all columns.}
+
+\item{...}{Dots which should be empty.}
+
+\item{keep}{Which of the duplicate rows to keep. Must be one of:
+\itemize{
+\item \code{"any"}: does not give any guarantee of which row is kept. This allows
+more optimizations.
+\item \code{"none"}: don’t keep duplicate rows.
+\item \code{"first"}: keep first unique row.
+\item \code{"last"}: keep last unique row.
+}}
+
+\item{maintain_order}{Keep the same order as the original LazyFrame. This is
+more expensive to compute. Setting this to \code{TRUE} blocks the possibility to
+run on the streaming engine.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Drop duplicate rows from this DataFrame
+}
+\examples{
+lf <- pl$LazyFrame(
+  foo = c(1, 2, 3, 1),
+  bar = c("a", "a", "a", "a"),
+  ham = c("b", "b", "b", "b"),
+)
+lf$unique(maintain_order = TRUE)$collect()
+
+lf$unique(subset = c("bar", "ham"), maintain_order = TRUE)$collect()
+
+lf$unique(keep = "last", maintain_order = TRUE)$collect()
+}
diff --git a/man/lazyframe__unnest.Rd b/man/lazyframe__unnest.Rd
new file mode 100644
index 00000000..a316732d
--- /dev/null
+++ b/man/lazyframe__unnest.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__unnest}
+\alias{lazyframe__unnest}
+\title{Decompose struct columns into separate columns for each of their fields}
+\usage{
+lazyframe__unnest(...)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name of the struct column(s)
+that should be unnested.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+The new columns will be inserted into the LazyFrame at the location of the
+struct column.
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = 1:5,
+  b = c("one", "two", "three", "four", "five"),
+  c = 6:10
+)$
+  select(
+  pl$struct("b"),
+  pl$struct(c("a", "c"))$alias("a_and_c")
+)
+lf$collect()
+
+lf$unnest("a_and_c")$collect()
+}
diff --git a/man/lazyframe__unpivot.Rd b/man/lazyframe__unpivot.Rd
new file mode 100644
index 00000000..5c37f657
--- /dev/null
+++ b/man/lazyframe__unpivot.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__unpivot}
+\alias{lazyframe__unpivot}
+\title{Unpivot a LazyFrame from wide to long format}
+\usage{
+lazyframe__unpivot(
+  on = NULL,
+  ...,
+  index = NULL,
+  variable_name = NULL,
+  value_name = NULL
+)
+}
+\arguments{
+\item{on}{Values to use as identifier variables. If \code{value_vars} is
+empty all columns that are not in \code{id_vars} will be used.}
+
+\item{...}{Dots which should be empty.}
+
+\item{index}{Columns to use as identifier variables.}
+
+\item{variable_name}{Name to give to the new column containing the names of
+the melted columns. Defaults to "variable".}
+
+\item{value_name}{Name to give to the new column containing the values of
+the melted columns. Defaults to \code{"value"}.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This function is useful to massage a LazyFrame into a format where one or
+more columns are identifier variables (\code{index}) while all other columns,
+considered measured variables (\code{on}), are “unpivoted” to the row axis
+leaving just two non-identifier columns, "variable" and "value".
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = c("x", "y", "z"),
+  b = c(1, 3, 5),
+  c = c(2, 4, 6)
+)
+lf$unpivot(index = "a", on = c("b", "c"))$collect()
+}
diff --git a/man/lazyframe__var.Rd b/man/lazyframe__var.Rd
new file mode 100644
index 00000000..4e8c3ab5
--- /dev/null
+++ b/man/lazyframe__var.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__var}
+\alias{lazyframe__var}
+\title{Aggregate the columns in the LazyFrame to their variance value}
+\usage{
+lazyframe__var(ddof = 1)
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Aggregate the columns in the LazyFrame to their variance value
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:4, b = c(1, 2, 1, 1))
+lf$var()$collect()
+lf$var(ddof = 0)$collect()
+}
diff --git a/man/lazyframe__with_columns.Rd b/man/lazyframe__with_columns.Rd
index 60262f97..77d29306 100644
--- a/man/lazyframe__with_columns.Rd
+++ b/man/lazyframe__with_columns.Rd
@@ -7,11 +7,11 @@
 lazyframe__with_columns(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
-Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
-by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
-Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
-Each name will be used as the expression name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
 }
 \value{
 A polars \link{LazyFrame}
diff --git a/man/lazyframe__with_columns_seq.Rd b/man/lazyframe__with_columns_seq.Rd
new file mode 100644
index 00000000..f2928535
--- /dev/null
+++ b/man/lazyframe__with_columns_seq.Rd
@@ -0,0 +1,66 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__with_columns_seq}
+\alias{lazyframe__with_columns_seq}
+\title{Modify/append column(s) of a LazyFrame}
+\usage{
+lazyframe__with_columns_seq(...)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This will run all expression sequentially instead of in parallel. Use this
+only when the work per expression is cheap.
+
+Add columns or modify existing ones with expressions. This is similar to
+\code{dplyr::mutate()} as it keeps unmentioned columns (unlike \verb{$select()}).
+
+However, unlike \code{dplyr::mutate()}, one cannot use new variables in subsequent
+expressions in the same \verb{$with_columns_seq()}call. For instance, if you create a
+variable \code{x}, you will only be able to use it in another \verb{$with_columns_seq()}
+or \verb{$select()} call.
+}
+\examples{
+# Pass an expression to add it as a new column.
+lf <- pl$LazyFrame(
+  a = 1:4,
+  b = c(0.5, 4, 10, 13),
+  c = c(TRUE, TRUE, FALSE, TRUE),
+)
+lf$with_columns_seq((pl$col("a")^2)$alias("a^2"))$collect()
+
+# Added columns will replace existing columns with the same name.
+lf$with_columns_seq(a = pl$col("a")$cast(pl$Float64))$collect()
+
+# Multiple columns can be added
+lf$with_columns_seq(
+  (pl$col("a")^2)$alias("a^2"),
+  (pl$col("b") / 2)$alias("b/2"),
+  (pl$col("c")$not())$alias("not c"),
+)$collect()
+
+# Name expression instead of `$alias()`
+lf$with_columns_seq(
+  `a^2` = pl$col("a")^2,
+  `b/2` = pl$col("b") / 2,
+  `not c` = pl$col("c")$not(),
+)$collect()
+
+# Expressions with multiple outputs can automatically be instantiated
+# as Structs by enabling the experimental setting `POLARS_AUTO_STRUCTIFY`:
+if (requireNamespace("withr", quietly = TRUE)) {
+  withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
+    lf$drop("c")$with_columns_seq(
+      diffs = pl$col("a", "b")$diff()$name$suffix("_diff"),
+    )$collect()
+  })
+}
+}
diff --git a/man/lazyframe__with_context.Rd b/man/lazyframe__with_context.Rd
new file mode 100644
index 00000000..4a47d8b4
--- /dev/null
+++ b/man/lazyframe__with_context.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__with_context}
+\alias{lazyframe__with_context}
+\title{Add an external context to the computation graph}
+\usage{
+lazyframe__with_context(other)
+}
+\arguments{
+\item{other}{Data/LazyFrame to have access to. This can be a list of DataFrames
+and LazyFrames.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+This allows expressions to also access columns from DataFrames or LazyFrames
+that are not part of this one.
+}
+\examples{
+lf <- pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA))
+lf_other <- pl$LazyFrame(c = c("foo", "ham"))
+
+lf$with_context(lf_other)$select(
+  pl$col("b") + pl$col("c")$first()
+)$collect()
+
+# Fill nulls with the median from another lazyframe:
+train_lf <- pl$LazyFrame(
+  feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1)
+)
+test_lf <- pl$LazyFrame(
+  feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1)
+)
+
+test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select(
+  pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median())
+)$collect()
+}
diff --git a/man/lazyframe__with_row_index.Rd b/man/lazyframe__with_row_index.Rd
new file mode 100644
index 00000000..2a6fc206
--- /dev/null
+++ b/man/lazyframe__with_row_index.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__with_row_index}
+\alias{lazyframe__with_row_index}
+\title{Add a row index as the first column in the LazyFrame}
+\usage{
+lazyframe__with_row_index(name = "index", offset = 0)
+}
+\arguments{
+\item{name}{Name of the index column.}
+
+\item{offset}{Start the index at this offset. Cannot be negative.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Using this function can have a negative effect on query performance. This
+may, for instance, block predicate pushdown optimization.
+}
+\examples{
+lf <- pl$LazyFrame(x = c(1, 3, 5), y = c(2, 4, 6))
+lf$with_row_index()$collect()
+
+lf$with_row_index("id", offset = 1000)$collect()
+
+# An index column can also be created using the expressions int_range()
+# and len()$
+lf$with_columns(
+  index = pl$int_range(pl$len(), dtype = pl$UInt32)
+)$collect()
+}
diff --git a/man/pl__struct.Rd b/man/pl__struct.Rd
index 1f2daa36..c6424779 100644
--- a/man/pl__struct.Rd
+++ b/man/pl__struct.Rd
@@ -7,11 +7,11 @@
 pl__struct(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
-Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
-by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
-Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
-Each name will be used as the expression name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
+to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
+function. Characters are parsed as column names, other non-expression inputs
+are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
+name.}
 }
 \value{
 A polars \link{expression}
diff --git a/src/init.c b/src/init.c
index 38be6b57..0e955ec6 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1704,6 +1704,11 @@ SEXP savvy_PlRLazyFrame_clone__impl(SEXP self__) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_collect_schema__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_collect_schema__ffi(self__);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_unnest__impl(SEXP self__, SEXP c_arg__columns) {
     SEXP res = savvy_PlRLazyFrame_unnest__ffi(self__, c_arg__columns);
     return handle_result(res);
@@ -2250,6 +2255,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_unpivot__impl", (DL_FUNC) &savvy_PlRLazyFrame_unpivot__impl, 5},
     {"savvy_PlRLazyFrame_with_row_index__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_row_index__impl, 3},
     {"savvy_PlRLazyFrame_clone__impl", (DL_FUNC) &savvy_PlRLazyFrame_clone__impl, 1},
+    {"savvy_PlRLazyFrame_collect_schema__impl", (DL_FUNC) &savvy_PlRLazyFrame_collect_schema__impl, 1},
     {"savvy_PlRLazyFrame_unnest__impl", (DL_FUNC) &savvy_PlRLazyFrame_unnest__impl, 2},
     {"savvy_PlRLazyFrame_count__impl", (DL_FUNC) &savvy_PlRLazyFrame_count__impl, 1},
     {"savvy_PlRLazyFrame_merge_sorted__impl", (DL_FUNC) &savvy_PlRLazyFrame_merge_sorted__impl, 3},
diff --git a/src/rust/api.h b/src/rust/api.h
index 6be30108..200a27dc 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -344,6 +344,7 @@ SEXP savvy_PlRLazyFrame_drop_nulls__ffi(SEXP self__, SEXP c_arg__subset);
 SEXP savvy_PlRLazyFrame_unpivot__ffi(SEXP self__, SEXP c_arg__on, SEXP c_arg__index, SEXP c_arg__value_name, SEXP c_arg__variable_name);
 SEXP savvy_PlRLazyFrame_with_row_index__ffi(SEXP self__, SEXP c_arg__name, SEXP c_arg__offset);
 SEXP savvy_PlRLazyFrame_clone__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_collect_schema__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_unnest__ffi(SEXP self__, SEXP c_arg__columns);
 SEXP savvy_PlRLazyFrame_count__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_merge_sorted__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__key);
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 095c822b..bf7eb800 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -1,7 +1,8 @@
 use super::*;
 use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame, PlRLazyGroupBy, RPolarsErr};
 use savvy::{
-    savvy, ListSexp, LogicalSexp, NumericScalar, OwnedStringSexp, Result, Sexp, StringSexp,
+    savvy, ListSexp, LogicalSexp, NumericScalar, OwnedListSexp, OwnedStringSexp, Result, Sexp,
+    StringSexp,
 };
 
 #[savvy]
@@ -768,19 +769,55 @@ impl PlRLazyFrame {
         Ok(self.ldf.clone().into())
     }
 
-    // fn collect_schema(&mut self, py: Python) -> Result<ListSexp> {
-    //     let schema = py
-    //         .allow_threads(|| self.ldf.collect_schema())
-    //         .map_err(RPolarsErr::from)?;
+    fn collect_schema(&mut self) -> Result<Sexp> {
+        use crate::{
+            r_threads::{concurrent_handler, ThreadCom},
+            r_udf::{RUdfReturn, RUdfSignature, CONFIG},
+        };
+        fn serve_r(
+            udf_sig: RUdfSignature,
+        ) -> std::result::Result<RUdfReturn, Box<dyn std::error::Error>> {
+            udf_sig.eval()
+        }
 
-    //     let schema_dict = PyDict::new_bound(py);
-    //     schema.iter_fields().for_each(|fld| {
-    //         schema_dict
-    //             .set_item(fld.name().as_str(), Wrap(fld.dtype().clone()))
-    //             .unwrap()
-    //     });
-    //     Ok(schema_dict.to_object(py))
-    // }
+        let mut ldf = self.ldf.clone();
+        let schema = if ThreadCom::try_from_global(&CONFIG).is_ok() {
+            ldf.collect_schema().map_err(RPolarsErr::from)?
+        } else {
+            concurrent_handler(
+                // closure 1: spawned by main thread
+                // tc is a ThreadCom which any child thread can use to submit R jobs to main thread
+                move |tc| {
+                    // get return value
+                    let retval = ldf.collect_schema();
+
+                    // drop the last two ThreadCom clones, signals to main/R-serving thread to shut down.
+                    ThreadCom::kill_global(&CONFIG);
+                    drop(tc);
+
+                    retval
+                },
+                // closure 2: how to serve polars worker R job request in main thread
+                serve_r,
+                // CONFIG is "global variable" where any new thread can request a clone of ThreadCom to establish contact with main thread
+                &CONFIG,
+            )
+            .map_err(|e| e.to_string())?
+            .map_err(RPolarsErr::from)?
+        };
+
+        let mut out = OwnedListSexp::new(schema.len(), true)?;
+        for i in 0..schema.len() {
+            let (fld_name, fld_value) = schema.get_at_index(i).unwrap();
+            let fld_value = <PlRDataType>::from(fld_value);
+            let _ = out.set_name(i, fld_name.as_str());
+            unsafe {
+                let _ = out.set_value_unchecked(i, Sexp::try_from(fld_value)?.0);
+            }
+        }
+
+        Ok(out.into())
+    }
 
     fn unnest(&self, columns: ListSexp) -> Result<PlRLazyFrame> {
         let columns = <Wrap<Vec<Expr>>>::from(columns).0;

From 514f16b3384697312862ed1907f4885afbf54e70 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 10:15:24 +0100
Subject: [PATCH 11/71] $profile()

---
 R/000-wrappers.R                  |   7 ++
 R/lazyframe-frame.R               | 146 ++++++++++++++++++++++++++----
 R/utils-various.R                 |  60 ++++++++++++
 src/init.c                        |   6 ++
 src/rust/api.h                    |   1 +
 src/rust/src/lazyframe/general.rs |  76 ++++++++++------
 6 files changed, 247 insertions(+), 49 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 281d9633..e4e06470 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2330,6 +2330,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_profile` <- function(self) {
+  function() {
+    .Call(savvy_PlRLazyFrame_profile__impl, `self`)
+  }
+}
+
 `PlRLazyFrame_select_seq` <- function(self) {
   function(`exprs`) {
     .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_select_seq__impl, `self`, `exprs`))
@@ -2543,6 +2549,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`top_k` <- `PlRLazyFrame_top_k`(ptr)
   e$`bottom_k` <- `PlRLazyFrame_bottom_k`(ptr)
   e$`cache` <- `PlRLazyFrame_cache`(ptr)
+  e$`profile` <- `PlRLazyFrame_profile`(ptr)
   e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
   e$`group_by_dynamic` <- `PlRLazyFrame_group_by_dynamic`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 2f95cfba..bb501900 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -284,6 +284,115 @@ lazyframe__collect_schema <- function() {
   })
 }
 
+#' Collect and profile a lazy query.
+#'
+#' This will run the query and return a list containing the materialized
+#' DataFrame and a DataFrame that contains profiling information of each node
+#' that is executed.
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @inheritParams lazyframe__collect
+#' @param show_plot Show a Gantt chart of the profiling result
+#' @param truncate_nodes Truncate the label lengths in the Gantt chart to this
+#' number of characters. If `0` (default), do not truncate.
+#'
+#' @details
+#' The units of the timings are microseconds.
+#'
+#' @return List of two `DataFrame`s: one with the collected result, the other
+#' with the timings of each step. If `show_graph = TRUE`, then the plot is
+#' also stored in the list.
+#' @seealso
+#'  - [`$collect()`][LazyFrame_collect] - regular collect.
+#'  - [`$fetch()`][LazyFrame_fetch] - fast limited query check
+#'  - [`$collect_in_background()`][LazyFrame_collect_in_background] - non-blocking
+#'    collect returns a future handle. Can also just be used via
+#'    `$collect(collect_in_background = TRUE)`.
+#'  - [`$sink_parquet()`][LazyFrame_sink_parquet()] streams query to a parquet file.
+#'  - [`$sink_ipc()`][LazyFrame_sink_ipc()] streams query to a arrow file.
+#'
+#' @examples
+#' ## Simplest use case
+#' pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
+#'
+#' ## Use $profile() to compare two queries
+#'
+#' # -1-  map each Species-group with native polars
+#' pl$LazyFrame(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$first() + 5)$
+#'   profile()
+#'
+#' # -2-  map each Species-group of each numeric column with an R function
+#'
+#' # some R function, prints `.` for each time called by polars
+#' r_func <- \(s) {
+#'   cat(".")
+#'   s$to_r()[1] + 5
+#' }
+#'
+#' pl$LazyFrame(iris)$
+#'   sort("Sepal.Length")$
+#'   group_by("Species", maintain_order = TRUE)$
+#'   agg(pl$col(pl$Float64)$map_elements(r_func))$
+#'   profile()
+lazyframe__profile <- function(
+    ...,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    comm_subplan_elim = TRUE,
+    comm_subexpr_elim = TRUE,
+    cluster_with_columns = TRUE,
+    streaming = FALSE,
+    no_optimization = FALSE,
+    collect_in_background = FALSE,
+    show_plot = FALSE,
+    truncate_nodes = 0) {
+  wrap({
+    check_dots_empty0(...)
+
+    if (isTRUE(no_optimization)) {
+      predicate_pushdown <- FALSE
+      projection_pushdown <- FALSE
+      slice_pushdown <- FALSE
+      comm_subplan_elim <- FALSE
+      comm_subexpr_elim <- FALSE
+      cluster_with_columns <- FALSE
+    }
+
+    if (isTRUE(streaming)) {
+      comm_subplan_elim <- FALSE
+    }
+
+    lf <- self$`_rexpr`$optimization_toggle(
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = comm_subplan_elim,
+      comm_subexpr_elim = comm_subexpr_elim,
+      cluster_with_columns = cluster_with_columns,
+      streaming = streaming,
+      eager = FALSE
+    )
+
+    out <- lf |>
+      .pr$LazyFrame$profile()
+
+    if (isTRUE(show_plot)) {
+      out[["plot"]] <- make_profile_plot(out, truncate_nodes) |>
+        result() |>
+        unwrap("in $profile()")
+    }
+    out
+  })
+}
+
 #' Create a string representation of the query plan
 #'
 #' The query plan is read from bottom to top. When `optimized = FALSE`, the
@@ -1538,28 +1647,27 @@ lazyframe__profile <- function(
     comm_subplan_elim <- FALSE
   }
 
-  lf <- self |>
-    self$`_ldf`$optimization_toggle(
-      pe_coercion = type_coercion,
-      predicate_pushdown = predicate_pushdown,
-      projection_pushdown = projection_pushdown,
-      simplify_expression = simplify_expression,
-      slice_pushdown = slice_pushdown,
-      comm_subplan_elim = comm_subplan_elim,
-      comm_subexpr_elim = comm_subexpr_elim,
-      cluster_with_columns = cluster_with_columns,
-      streaming = streaming,
-      eager = FALSE
-    )
+  lf <- self$`_ldf`$optimization_toggle(
+    type_coercion = type_coercion,
+    predicate_pushdown = predicate_pushdown,
+    projection_pushdown = projection_pushdown,
+    simplify_expression = simplify_expression,
+    slice_pushdown = slice_pushdown,
+    comm_subplan_elim = comm_subplan_elim,
+    comm_subexpr_elim = comm_subexpr_elim,
+    cluster_with_columns = cluster_with_columns,
+    streaming = streaming,
+    `_eager` = FALSE
+  )
 
-  out <- lf |>
-    self$`_ldf`$profile() >
-    unwrap("in $profile()")
+  out <- lapply(self$`_ldf`$profile(), \(x) {
+    x |>
+      .savvy_wrap_PlRDataFrame() |>
+      wrap()
+  })
 
   if (isTRUE(show_plot)) {
-    out[["plot"]] <- make_profile_plot(out, truncate_nodes) |>
-      result() |>
-      unwrap("in $profile()")
+    out[["plot"]] <- make_profile_plot(out, truncate_nodes)
   }
 
   out
diff --git a/R/utils-various.R b/R/utils-various.R
index 5840ed62..2aab4e6f 100644
--- a/R/utils-various.R
+++ b/R/utils-various.R
@@ -16,3 +16,63 @@ extend_bool <- function(value, n_match, value_name, match_name) {
     value
   }
 }
+
+#' @noRd
+make_profile_plot <- function(data, truncate_nodes) {
+  check_installed("ggplot2")
+  timings <- as.data.frame(data[[2]])
+  timings$node <- factor(timings$node, levels = unique(timings$node))
+  total_timing <- max(timings$end)
+  if (total_timing > 10000000) {
+    unit <- "s"
+    total_timing <- paste0(total_timing / 1000000, "s")
+    timings$start <- timings$start / 1000000
+    timings$end <- timings$end / 1000000
+  } else if (total_timing > 10000) {
+    unit <- "ms"
+    total_timing <- paste0(total_timing / 1000, "ms")
+    timings$start <- timings$start / 1000
+    timings$end <- timings$end / 1000
+  } else {
+    unit <- "\U00B5s"
+    total_timing <- paste0(total_timing, "\U00B5s")
+  }
+
+  # for some reason, there's an error if I use rlang::.data directly in aes()
+  .data <- rlang::.data
+
+  plot <- ggplot2::ggplot(
+    timings,
+    ggplot2::aes(
+      x = .data[["start"]], xend = .data[["end"]],
+      y = .data[["node"]], yend = .data[["node"]]
+    )
+  ) +
+    ggplot2::geom_segment(linewidth = 6) +
+    ggplot2::xlab(
+      paste0("Node duration in ", unit, ". Total duration: ", total_timing)
+    ) +
+    ggplot2::ylab(NULL) +
+    ggplot2::theme(
+      axis.text = ggplot2::element_text(size = 12)
+    )
+
+  if (truncate_nodes > 0) {
+    plot <- plot +
+      ggplot2::scale_y_discrete(
+        labels = rev(paste0(strtrim(timings$node, truncate_nodes), "...")),
+        limits = rev
+      )
+  } else {
+    plot <- plot +
+      ggplot2::scale_y_discrete(
+        limits = rev
+      )
+  }
+
+  # do not show the plot if we're running testthat
+  if (!identical(Sys.getenv("TESTTHAT"), "true")) {
+    print(plot)
+  }
+  plot
+}
diff --git a/src/init.c b/src/init.c
index 0e955ec6..452cd69d 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1574,6 +1574,11 @@ SEXP savvy_PlRLazyFrame_cache__impl(SEXP self__) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_profile__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_profile__ffi(self__);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_select_seq__impl(SEXP self__, SEXP c_arg__exprs) {
     SEXP res = savvy_PlRLazyFrame_select_seq__ffi(self__, c_arg__exprs);
     return handle_result(res);
@@ -2229,6 +2234,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_top_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_top_k__impl, 4},
     {"savvy_PlRLazyFrame_bottom_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_bottom_k__impl, 4},
     {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1},
+    {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
     {"savvy_PlRLazyFrame_group_by_dynamic__impl", (DL_FUNC) &savvy_PlRLazyFrame_group_by_dynamic__impl, 10},
diff --git a/src/rust/api.h b/src/rust/api.h
index 200a27dc..76bf584f 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -318,6 +318,7 @@ SEXP savvy_PlRLazyFrame_sort__ffi(SEXP self__, SEXP c_arg__by_column, SEXP c_arg
 SEXP savvy_PlRLazyFrame_top_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
 SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
 SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
 SEXP savvy_PlRLazyFrame_group_by_dynamic__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by);
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index bf7eb800..5b07ed8e 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -261,38 +261,54 @@ impl PlRLazyFrame {
         Ok(ldf.cache().into())
     }
 
-    // fn profile(&self, py: Python) -> Result<(PlRDataFrame, PlRDataFrame)> {
-    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
-    //     // threads we deadlock.
-    //     let (df, time_df) = py.allow_threads(|| {
-    //         let ldf = self.ldf.clone();
-    //         ldf.profile().map_err(RPolarsErr::from)
-    //     })?;
-    //     Ok((df.into(), time_df.into()))
-    // }
+    fn profile(&self) -> Result<Sexp> {
+        use crate::{
+            r_threads::{concurrent_handler, ThreadCom},
+            r_udf::{RUdfReturn, RUdfSignature, CONFIG},
+        };
+        fn serve_r(
+            udf_sig: RUdfSignature,
+        ) -> std::result::Result<RUdfReturn, Box<dyn std::error::Error>> {
+            udf_sig.eval()
+        }
 
-    //     fn collect_with_callback(&self, lambda: PyObject) {
-    //         let ldf = self.ldf.clone();
+        let ldf = self.ldf.clone();
+        let (data, timings) = if ThreadCom::try_from_global(&CONFIG).is_ok() {
+            let ldf = self.ldf.clone();
+            ldf.profile().map_err(RPolarsErr::from)?
+        } else {
+            concurrent_handler(
+                // closure 1: spawned by main thread
+                // tc is a ThreadCom which any child thread can use to submit R jobs to main thread
+                move |tc| {
+                    // get return value
+                    let retval = ldf.profile();
 
-    //         polars_core::POOL.spawn(move || {
-    //             let result = ldf
-    //                 .collect()
-    //                 .map(PlRDataFrame::new)
-    //                 .map_err(RPolarsErr::from);
-
-    //             Python::with_gil(|py| match result {
-    //                 Ok(df) => {
-    //                     lambda.call1(py, (df,)).map_err(|err| err.restore(py)).ok();
-    //                 }
-    //                 Err(err) => {
-    //                     lambda
-    //                         .call1(py, (PyErr::from(err).to_object(py),))
-    //                         .map_err(|err| err.restore(py))
-    //                         .ok();
-    //                 }
-    //             });
-    //         });
-    //     }
+                    // drop the last two ThreadCom clones, signals to main/R-serving thread to shut down.
+                    ThreadCom::kill_global(&CONFIG);
+                    drop(tc);
+
+                    retval
+                },
+                // closure 2: how to serve polars worker R job request in main thread
+                serve_r,
+                // CONFIG is "global variable" where any new thread can request a clone of ThreadCom to establish contact with main thread
+                &CONFIG,
+            )
+            .map_err(|e| e.to_string())?
+            .map_err(RPolarsErr::from)?
+        };
+
+        let data = <PlRDataFrame>::from(data);
+        let timings = <PlRDataFrame>::from(timings);
+
+        let mut out = OwnedListSexp::new(2, true)?;
+        unsafe {
+            let _ = out.set_value_unchecked(0, Sexp::try_from(data)?.0);
+            let _ = out.set_value_unchecked(1, Sexp::try_from(timings)?.0);
+        };
+        Ok(out.into())
+    }
 
     // fn sink_parquet(
     //     &self,

From 84985264a7c5f1cc8ef8713bf50e7774fda48c05 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 11:37:46 +0100
Subject: [PATCH 12/71] serde

---
 R/000-wrappers.R                  | 12 +++++++++++
 R/lazyframe-frame.R               | 34 ++++++++++++++++++++++++++++---
 src/init.c                        | 12 +++++++++++
 src/rust/api.h                    |  2 ++
 src/rust/src/lazyframe/general.rs | 11 +++++-----
 src/rust/src/lazyframe/mod.rs     |  1 +
 src/rust/src/lazyframe/serde.rs   | 12 +++++++++++
 7 files changed, 75 insertions(+), 9 deletions(-)
 create mode 100644 src/rust/src/lazyframe/serde.rs

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index e4e06470..0c4c5e94 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -230,6 +230,11 @@ NULL
   .savvy_wrap_PlRWhen(.Call(savvy_when__impl, `condition`))
 }
 
+
+`deserialize_lf` <- function(`json`) {
+  .savvy_wrap_PlRLazyFrame(.Call(savvy_deserialize_lf__impl, `json`))
+}
+
 ### wrapper functions for PlRChainedThen
 
 `PlRChainedThen_when` <- function(self) {
@@ -2336,6 +2341,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_serialize` <- function(self) {
+  function() {
+    .Call(savvy_PlRLazyFrame_serialize__impl, `self`)
+  }
+}
+
 `PlRLazyFrame_select_seq` <- function(self) {
   function(`exprs`) {
     .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_select_seq__impl, `self`, `exprs`))
@@ -2550,6 +2561,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`bottom_k` <- `PlRLazyFrame_bottom_k`(ptr)
   e$`cache` <- `PlRLazyFrame_cache`(ptr)
   e$`profile` <- `PlRLazyFrame_profile`(ptr)
+  e$`serialize` <- `PlRLazyFrame_serialize`(ptr)
   e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
   e$`group_by_dynamic` <- `PlRLazyFrame_group_by_dynamic`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index bb501900..4b7780e4 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1569,8 +1569,10 @@ lazyframe__fetch <- function(
   self$`_ldf`$fetch(n_rows)
 }
 
-#' Collect and profile a lazy query.
-#' @description This will run the query and return a list containing the
+#' Collect and profile a lazy query
+#'
+#' @description
+#' This will run the query and return a list containing the
 #' materialized DataFrame and a DataFrame that contains profiling information
 #' of each node that is executed.
 #'
@@ -1581,7 +1583,6 @@ lazyframe__fetch <- function(
 #'
 #' @details The units of the timings are microseconds.
 #'
-#'
 #' @return List of two `DataFrame`s: one with the collected result, the other
 #' with the timings of each step. If `show_graph = TRUE`, then the plot is
 #' also stored in the list.
@@ -1673,6 +1674,33 @@ lazyframe__profile <- function(
   out
 }
 
+#' Serialize the logical plan of this LazyFrame to a string in JSON format
+#'
+#' @return A character value
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:3)$sum()
+#' lf$serialize()
+lazyframe__serialize <- function() {
+  wrap({
+    self$`_ldf`$serialize()
+  })
+}
+
+#' Read a logical plan from a file to construct a LazyFrame
+#'
+#' @param source String containing the LazyFrame logical plan in JSON format.
+#'
+#' @return A character value
+#' @examples
+#' lf <- pl$LazyFrame(a = 1:3)$sum()
+#' ser <- lf$serialize()
+#' pl$deserialize_lf(ser)
+pl__deserialize_lf <- function(source) {
+  wrap({
+    deserialize_lf(source)
+  })
+}
+
 #' Explode the DataFrame to long format by exploding the given columns
 #'
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Column names, expressions, or
diff --git a/src/init.c b/src/init.c
index 452cd69d..02f3e266 100644
--- a/src/init.c
+++ b/src/init.c
@@ -189,6 +189,11 @@ SEXP savvy_when__impl(SEXP c_arg__condition) {
     return handle_result(res);
 }
 
+SEXP savvy_deserialize_lf__impl(SEXP c_arg__json) {
+    SEXP res = savvy_deserialize_lf__ffi(c_arg__json);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRChainedThen_when__impl(SEXP self__, SEXP c_arg__condition) {
     SEXP res = savvy_PlRChainedThen_when__ffi(self__, c_arg__condition);
     return handle_result(res);
@@ -1579,6 +1584,11 @@ SEXP savvy_PlRLazyFrame_profile__impl(SEXP self__) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_serialize__impl(SEXP self__) {
+    SEXP res = savvy_PlRLazyFrame_serialize__ffi(self__);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_select_seq__impl(SEXP self__, SEXP c_arg__exprs) {
     SEXP res = savvy_PlRLazyFrame_select_seq__ffi(self__, c_arg__exprs);
     return handle_result(res);
@@ -1957,6 +1967,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_time_range__impl", (DL_FUNC) &savvy_time_range__impl, 4},
     {"savvy_time_ranges__impl", (DL_FUNC) &savvy_time_ranges__impl, 4},
     {"savvy_when__impl", (DL_FUNC) &savvy_when__impl, 1},
+    {"savvy_deserialize_lf__impl", (DL_FUNC) &savvy_deserialize_lf__impl, 1},
     {"savvy_PlRChainedThen_when__impl", (DL_FUNC) &savvy_PlRChainedThen_when__impl, 2},
     {"savvy_PlRChainedThen_otherwise__impl", (DL_FUNC) &savvy_PlRChainedThen_otherwise__impl, 2},
     {"savvy_PlRChainedWhen_then__impl", (DL_FUNC) &savvy_PlRChainedWhen_then__impl, 2},
@@ -2235,6 +2246,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_bottom_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_bottom_k__impl, 4},
     {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1},
     {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1},
+    {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
     {"savvy_PlRLazyFrame_group_by_dynamic__impl", (DL_FUNC) &savvy_PlRLazyFrame_group_by_dynamic__impl, 10},
diff --git a/src/rust/api.h b/src/rust/api.h
index 76bf584f..0f2b5d4d 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -29,6 +29,7 @@ SEXP savvy_datetime_ranges__ffi(SEXP c_arg__start, SEXP c_arg__end, SEXP c_arg__
 SEXP savvy_time_range__ffi(SEXP c_arg__start, SEXP c_arg__end, SEXP c_arg__every, SEXP c_arg__closed);
 SEXP savvy_time_ranges__ffi(SEXP c_arg__start, SEXP c_arg__end, SEXP c_arg__every, SEXP c_arg__closed);
 SEXP savvy_when__ffi(SEXP c_arg__condition);
+SEXP savvy_deserialize_lf__ffi(SEXP c_arg__json);
 
 // methods and associated functions for PlRChainedThen
 SEXP savvy_PlRChainedThen_when__ffi(SEXP self__, SEXP c_arg__condition);
@@ -319,6 +320,7 @@ SEXP savvy_PlRLazyFrame_top_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, S
 SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
 SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
 SEXP savvy_PlRLazyFrame_group_by_dynamic__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by);
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 5b07ed8e..009d4a02 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -425,12 +425,11 @@ impl PlRLazyFrame {
     //         Ok(())
     //     }
 
-    // fn fetch(&self, py: Python, n_rows: NumericScalar) -> Result<PlRDataFrame> {
-    //     let ldf = self.ldf.clone();
-    //     let n_rows = <Wrap<usize>>::try_from(n_rows)?.0;
-    //     let df = py.allow_threads(|| ldf.fetch(n_rows).map_err(RPolarsErr::from))?;
-    //     Ok(df.into())
-    // }
+    fn serialize(&self) -> Result<Sexp> {
+        let dump = serde_json::to_string(&self.ldf.logical_plan)
+            .map_err(|err| RPolarsErr::Other(err.to_string()))?;
+        dump.try_into()
+    }
 
     fn select_seq(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
diff --git a/src/rust/src/lazyframe/mod.rs b/src/rust/src/lazyframe/mod.rs
index 20725f28..779aa05b 100644
--- a/src/rust/src/lazyframe/mod.rs
+++ b/src/rust/src/lazyframe/mod.rs
@@ -1,4 +1,5 @@
 mod general;
+mod serde;
 
 use crate::prelude::*;
 use savvy::{savvy, EnvironmentSexp};
diff --git a/src/rust/src/lazyframe/serde.rs b/src/rust/src/lazyframe/serde.rs
new file mode 100644
index 00000000..feefb4f8
--- /dev/null
+++ b/src/rust/src/lazyframe/serde.rs
@@ -0,0 +1,12 @@
+use crate::{prelude::*, PlRLazyFrame, RPolarsErr};
+use savvy::{savvy, Result};
+
+#[savvy]
+fn deserialize_lf(json: &str) -> Result<PlRLazyFrame> {
+    let lp = serde_json::from_str::<DslPlan>(json).map_err(|_| {
+        let msg = "could not deserialize input into a LazyFrame";
+        RPolarsErr::Other(msg.to_string())
+    })?;
+    let out = LazyFrame::from(lp);
+    Ok(<PlRLazyFrame>::from(out))
+}

From de3e5e05f18f0fd887b0b0195878e2d888859221 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 12:11:57 +0100
Subject: [PATCH 13/71] sink_parquet

---
 R/000-wrappers.R                  |   7 ++
 R/lazyframe-frame.R               | 108 ++++++++++++++++++++++++++++++
 R/utils-various.R                 |  53 +++++++++++++++
 src/init.c                        |   6 ++
 src/rust/api.h                    |   1 +
 src/rust/src/conversion/mod.rs    |  66 ++++++++++++++++++
 src/rust/src/lazyframe/general.rs |  67 ++++++++++--------
 7 files changed, 281 insertions(+), 27 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 0c4c5e94..68807e02 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2341,6 +2341,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_sink_parquet` <- function(self) {
+  function(`path`, `compression`, `maintain_order`, `statistics`, `compression_level` = NULL, `row_group_size` = NULL, `data_page_size` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_parquet__impl, `self`, `path`, `compression`, `maintain_order`, `statistics`, `compression_level`, `row_group_size`, `data_page_size`))
+  }
+}
+
 `PlRLazyFrame_serialize` <- function(self) {
   function() {
     .Call(savvy_PlRLazyFrame_serialize__impl, `self`)
@@ -2561,6 +2567,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`bottom_k` <- `PlRLazyFrame_bottom_k`(ptr)
   e$`cache` <- `PlRLazyFrame_cache`(ptr)
   e$`profile` <- `PlRLazyFrame_profile`(ptr)
+  e$`sink_parquet` <- `PlRLazyFrame_sink_parquet`(ptr)
   e$`serialize` <- `PlRLazyFrame_serialize`(ptr)
   e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 4b7780e4..48cc0ac5 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -2408,3 +2408,111 @@ lazyframe__with_row_index <- function(name = "index", offset = 0) {
     self$`_ldf`$with_row_index(name, offset)
   })
 }
+
+#' Evaluate the query in streaming mode and write to a Parquet file
+#'
+#' @description
+#' `r lifecycle::badge("experimental")`
+#'
+#' This allows streaming results that are larger than RAM to be written to disk.
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param path A character. File path to which the file should be written.
+#' @param compression The compression method. Must be one of:
+#' * `"lz4"`: fast compression/decompression.
+#' * `"uncompressed"`
+#' * `"snappy"`: this guarantees that the parquet file will be compatible with
+#'   older parquet readers.
+#' * `"gzip"`
+#' * `"lzo"`
+#' * `"brotli"`
+#' * `"zstd"`: good compression performance.
+#' @param compression_level `NULL` or integer. The level of compression to use.
+#'  Only used if method is one of `"gzip"`, `"brotli"`, or `"zstd"`. Higher
+#' compression means smaller files on disk:
+#'  * `"gzip"`: min-level: 0, max-level: 10.
+#'  * `"brotli"`: min-level: 0, max-level: 11.
+#'  * `"zstd"`: min-level: 1, max-level: 22.
+#' @param statistics Whether statistics should be written to the Parquet
+#' headers. Possible values:
+#' * `TRUE`: enable default set of statistics (default)
+#' * `FALSE`: disable all statistics
+#' * `"full"`: calculate and write all available statistics.
+#' * A named list where all values must be `TRUE` or `FALSE`, e.g.
+#'   `list(min = TRUE, max = FALSE)`. Statistics available are `"min"`, `"max"`,
+#'   `"distinct_count"`, `"null_count"`.
+#' @param row_group_size Size of the row groups in number of rows. If `NULL`
+#' (default), the chunks of the DataFrame are used. Writing in smaller chunks
+#' may reduce memory pressure and improve writing speeds.
+#' @param data_page_size Size of the data page in bytes. If `NULL` (default), it
+#' is set to 1024^2 bytes.
+#' @param maintain_order Maintain the order in which data is processed. Setting
+#' this to `FALSE` will be slightly faster.
+#' @inheritParams lazyframe__collect
+#'
+#' @rdname IO_sink_parquet
+#' @return Invisibly returns the input LazyFrame
+#'
+#' @examples
+#' # sink table 'mtcars' from mem to parquet
+#' tmpf <- tempfile()
+#' pl$LazyFrame(mtcars)$sink_parquet(tmpf)
+#'
+#' # stream a query end-to-end
+#' tmpf2 <- tempfile()
+#' pl$scan_parquet(tmpf)$select(pl$col("cyl") * 2)$sink_parquet(tmpf2)
+#'
+#' # load parquet directly into a DataFrame / memory
+#' pl$scan_parquet(tmpf2)$collect()
+lazyframe__sink_parquet <- function(
+    path,
+    ...,
+    compression = "zstd",
+    compression_level = 3,
+    statistics = TRUE,
+    row_group_size = NULL,
+    data_page_size = NULL,
+    maintain_order = TRUE,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    no_optimization = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+
+    if (isTRUE(no_optimization)) {
+      predicate_pushdown <- FALSE
+      projection_pushdown <- FALSE
+      slice_pushdown <- FALSE
+    }
+
+    lf <- self$`_ldf`$optimization_toggle(
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = FALSE,
+      comm_subexpr_elim = FALSE,
+      cluster_with_columns = FALSE,
+      streaming = FALSE,
+      `_eager` = FALSE
+    )
+
+    statistics <- translate_statistics(statistics)
+
+    self$`_ldf`$sink_parquet(
+      path = path,
+      compression = compression,
+      compression_level = compression_level,
+      statistics = statistics,
+      row_group_size = row_group_size,
+      data_page_size = data_page_size,
+      maintain_order = maintain_order
+    )
+
+    invisible(self)
+  })
+}
diff --git a/R/utils-various.R b/R/utils-various.R
index 2aab4e6f..c62c2bfe 100644
--- a/R/utils-various.R
+++ b/R/utils-various.R
@@ -76,3 +76,56 @@ make_profile_plot <- function(data, truncate_nodes) {
   }
   plot
 }
+
+#' @noRd
+translate_statistics <- function(statistics, call = caller_env()) {
+  if (length(statistics) != 1 && !is.list(statistics)) {
+    abort("`statistics` must be of length 1.", call = call)
+  }
+  if (is.logical(statistics)) {
+    if (isTRUE(statistics)) {
+      statistics <- list(
+        min = TRUE,
+        max = TRUE,
+        distinct_count = FALSE,
+        null_count = TRUE
+      )
+    } else {
+      statistics <- list(
+        min = FALSE,
+        max = FALSE,
+        distinct_count = FALSE,
+        null_count = FALSE
+      )
+    }
+  } else if (is.character(statistics)) {
+    if (statistics == "full") {
+      statistics <- list(
+        min = TRUE,
+        max = TRUE,
+        distinct_count = TRUE,
+        null_count = TRUE
+      )
+    } else {
+      abort("`statistics` must be TRUE/FALSE, \"full\", or a named list.", call = call)
+    }
+  } else if (is.list(statistics)) {
+    default <- list(
+      min = TRUE,
+      max = TRUE,
+      distinct_count = FALSE,
+      null_count = TRUE
+    )
+    statistics <- utils::modifyList(default, statistics)
+    nms <- names(statistics)
+    invalid <- nms[!nms %in% c("min", "max", "distinct_count", "null_count")]
+    if (length(invalid) > 0) {
+      msg <- paste0("`", invalid, "`", collapse = ", ")
+      abort(
+        paste0("In `statistics`,", msg, "are not valid keys."),
+        call = call
+      )
+    }
+  }
+  statistics
+}
diff --git a/src/init.c b/src/init.c
index 02f3e266..17ddbcd7 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1584,6 +1584,11 @@ SEXP savvy_PlRLazyFrame_profile__impl(SEXP self__) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_sink_parquet__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size) {
+    SEXP res = savvy_PlRLazyFrame_sink_parquet__ffi(self__, c_arg__path, c_arg__compression, c_arg__maintain_order, c_arg__statistics, c_arg__compression_level, c_arg__row_group_size, c_arg__data_page_size);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_serialize__impl(SEXP self__) {
     SEXP res = savvy_PlRLazyFrame_serialize__ffi(self__);
     return handle_result(res);
@@ -2246,6 +2251,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_bottom_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_bottom_k__impl, 4},
     {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1},
     {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1},
+    {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 8},
     {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
diff --git a/src/rust/api.h b/src/rust/api.h
index 0f2b5d4d..c2fd38c2 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -320,6 +320,7 @@ SEXP savvy_PlRLazyFrame_top_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, S
 SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
 SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
+SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size);
 SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index e8b74754..7097c9d9 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -560,3 +560,69 @@ impl TryFrom<&str> for Wrap<StartBy> {
         Ok(Wrap(parsed))
     }
 }
+
+pub(crate) fn parse_parquet_compression(
+    compression: &str,
+    compression_level: Option<i32>,
+) -> savvy::Result<ParquetCompression> {
+    let parsed = match compression {
+        "uncompressed" => ParquetCompression::Uncompressed,
+        "snappy" => ParquetCompression::Snappy,
+        "gzip" => ParquetCompression::Gzip(
+            compression_level
+                .map(|lvl| {
+                    GzipLevel::try_new(lvl as u8)
+                        .map_err(|e| savvy::Error::new(format!("{e:?}").as_str()))
+                })
+                .transpose()?,
+        ),
+        "lzo" => ParquetCompression::Lzo,
+        "brotli" => ParquetCompression::Brotli(
+            compression_level
+                .map(|lvl| {
+                    BrotliLevel::try_new(lvl as u32)
+                        .map_err(|e| savvy::Error::new(format!("{e:?}").as_str()))
+                })
+                .transpose()?,
+        ),
+        "lz4" => ParquetCompression::Lz4Raw,
+        "zstd" => ParquetCompression::Zstd(
+            compression_level
+                .map(|lvl| {
+                    ZstdLevel::try_new(lvl)
+                        .map_err(|e| savvy::Error::new(format!("{e:?}").as_str()))
+                })
+                .transpose()?,
+        ),
+        _ => unreachable!(),
+    };
+    Ok(parsed)
+}
+
+impl TryFrom<ListSexp> for Wrap<StatisticsOptions> {
+    type Error = String;
+
+    fn try_from(statistics: ListSexp) -> Result<Self, String> {
+        let hm = statistics
+            .iter()
+            .map(|xi| {
+                let name = xi.0;
+                let value = xi.1.into_typed();
+                let value = match value {
+                    TypedSexp::Logical(val) => {
+                        let tmp = val.to_vec();
+                        *tmp.get(0).unwrap()
+                    }
+                    _ => unreachable!(),
+                };
+                (name, value)
+            })
+            .collect::<std::collections::HashMap<&str, bool>>();
+        let mut out = StatisticsOptions::default();
+        out.min_value = *hm.get(&"min").unwrap();
+        out.max_value = *hm.get(&"max").unwrap();
+        out.distinct_count = *hm.get(&"distinct_count").unwrap();
+        out.null_count = *hm.get(&"null_count").unwrap();
+        Ok(Wrap(out))
+    }
+}
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 009d4a02..4211007e 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -4,6 +4,7 @@ use savvy::{
     savvy, ListSexp, LogicalSexp, NumericScalar, OwnedListSexp, OwnedStringSexp, Result, Sexp,
     StringSexp,
 };
+use std::path::PathBuf;
 
 #[savvy]
 impl PlRLazyFrame {
@@ -310,35 +311,47 @@ impl PlRLazyFrame {
         Ok(out.into())
     }
 
-    // fn sink_parquet(
-    //     &self,
-    //     py: Python,
-    //     path: PathBuf,
-    //     compression: &str,
-    //     compression_level: Option<i32>,
-    //     statistics: Wrap<StatisticsOptions>,
-    //     row_group_size: Option<usize>,
-    //     data_page_size: Option<usize>,
-    //     maintain_order: bool,
-    // ) -> Result<()> {
-    //     let compression = parse_parquet_compression(compression, compression_level)?;
+    fn sink_parquet(
+        &self,
+        path: &str,
+        compression: &str,
+        maintain_order: bool,
+        statistics: ListSexp,
+        compression_level: Option<NumericScalar>,
+        row_group_size: Option<NumericScalar>,
+        data_page_size: Option<NumericScalar>,
+    ) -> Result<()> {
+        let path: PathBuf = path.into();
+        let statistics = <Wrap<StatisticsOptions>>::try_from(statistics)?.0;
+        let compression_level: Option<i32> = match compression_level {
+            Some(x) => Some(x.as_i32()?),
+            None => None,
+        };
+        let compression = parse_parquet_compression(compression, compression_level)?;
+        let row_group_size: Option<usize> = match row_group_size {
+            Some(x) => Some(<Wrap<usize>>::try_from(x)?.0),
+            None => None,
+        };
+        let data_page_size: Option<usize> = match data_page_size {
+            Some(x) => Some(<Wrap<usize>>::try_from(x)?.0),
+            None => None,
+        };
 
-    //     let options = ParquetWriteOptions {
-    //         compression,
-    //         statistics: statistics.0,
-    //         row_group_size,
-    //         data_page_size,
-    //         maintain_order,
-    //     };
+        let options = ParquetWriteOptions {
+            compression,
+            statistics,
+            row_group_size,
+            data_page_size,
+            maintain_order,
+        };
 
-    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
-    //     // threads we deadlock.
-    //     py.allow_threads(|| {
-    //         let ldf = self.ldf.clone();
-    //         ldf.sink_parquet(path, options).map_err(RPolarsErr::from)
-    //     })?;
-    //     Ok(())
-    // }
+        let _ = self
+            .ldf
+            .clone()
+            .sink_parquet(path, options)
+            .map_err(RPolarsErr::from);
+        Ok(())
+    }
 
     // fn sink_ipc(
     //     &self,

From 216d0c2b7d968a63ce36089f1900fe31b5d231e3 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 12:28:40 +0100
Subject: [PATCH 14/71] sink_ipc

---
 R/000-wrappers.R                  |  7 +++
 R/lazyframe-frame.R               | 78 ++++++++++++++++++++++++++++++-
 src/init.c                        |  6 +++
 src/rust/api.h                    |  1 +
 src/rust/src/conversion/mod.rs    | 13 ++++++
 src/rust/src/lazyframe/general.rs | 44 +++++++++--------
 6 files changed, 128 insertions(+), 21 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 68807e02..3a8e7d10 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2347,6 +2347,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_sink_ipc` <- function(self) {
+  function(`path`, `maintain_order`, `compression` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_ipc__impl, `self`, `path`, `maintain_order`, `compression`))
+  }
+}
+
 `PlRLazyFrame_serialize` <- function(self) {
   function() {
     .Call(savvy_PlRLazyFrame_serialize__impl, `self`)
@@ -2568,6 +2574,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`cache` <- `PlRLazyFrame_cache`(ptr)
   e$`profile` <- `PlRLazyFrame_profile`(ptr)
   e$`sink_parquet` <- `PlRLazyFrame_sink_parquet`(ptr)
+  e$`sink_ipc` <- `PlRLazyFrame_sink_ipc`(ptr)
   e$`serialize` <- `PlRLazyFrame_serialize`(ptr)
   e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 48cc0ac5..081a8e8b 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -2456,7 +2456,7 @@ lazyframe__with_row_index <- function(name = "index", offset = 0) {
 #' @examples
 #' # sink table 'mtcars' from mem to parquet
 #' tmpf <- tempfile()
-#' pl$LazyFrame(mtcars)$sink_parquet(tmpf)
+#' as_polars_lf(mtcars)$sink_parquet(tmpf)
 #'
 #' # stream a query end-to-end
 #' tmpf2 <- tempfile()
@@ -2481,6 +2481,10 @@ lazyframe__sink_parquet <- function(
     no_optimization = FALSE) {
   wrap({
     check_dots_empty0(...)
+    compression <- arg_match0(
+      compression,
+      values = c("lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd")
+    )
 
     if (isTRUE(no_optimization)) {
       predicate_pushdown <- FALSE
@@ -2503,7 +2507,7 @@ lazyframe__sink_parquet <- function(
 
     statistics <- translate_statistics(statistics)
 
-    self$`_ldf`$sink_parquet(
+    lf$sink_parquet(
       path = path,
       compression = compression,
       compression_level = compression_level,
@@ -2516,3 +2520,73 @@ lazyframe__sink_parquet <- function(
     invisible(self)
   })
 }
+
+#' Evaluate the query in streaming mode and write to an IPC file
+#'
+#' @inherit lazyframe__sink_parquet description params return
+#' @inheritParams rlang::check_dots_empty0
+#' @param compression `NULL` or one of:
+#' * `"uncompressed"`: same as `NULL`.
+#' * `"lz4"`: fast compression/decompression.
+#' * `"zstd"`: good compression performance.
+#'
+#' @rdname IO_sink_ipc
+#'
+#' @examples
+#' # sink table 'mtcars' from mem to ipc
+#' tmpf <- tempfile()
+#' as_polars_lf(mtcars)$sink_ipc(tmpf)
+#'
+#' # stream a query end-to-end (not supported yet, https://github.com/pola-rs/polars/issues/1040)
+#' # tmpf2 = tempfile()
+#' # pl$scan_ipc(tmpf)$select(pl$col("cyl") * 2)$sink_ipc(tmpf2)
+#'
+#' # load ipc directly into a DataFrame / memory
+#' # pl$scan_ipc(tmpf2)$collect()
+lazyframe__sink_ipc <- function(
+    path,
+    ...,
+    compression = c("zstd", "lz4", "uncompressed"),
+    maintain_order = TRUE,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    no_optimization = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+    compression <- compression %||% "uncompressed"
+    compression <- arg_match0(
+      compression,
+      values = c("lz4", "uncompressed", "zstd")
+    )
+
+    if (isTRUE(no_optimization)) {
+      predicate_pushdown <- FALSE
+      projection_pushdown <- FALSE
+      slice_pushdown <- FALSE
+    }
+
+    lf <- self$`_ldf`$optimization_toggle(
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = FALSE,
+      comm_subexpr_elim = FALSE,
+      cluster_with_columns = FALSE,
+      streaming = FALSE,
+      `_eager` = FALSE
+    )
+
+    lf$sink_ipc(
+      path = path,
+      compression = compression,
+      maintain_order = maintain_order
+    )
+
+    invisible(self)
+  })
+}
diff --git a/src/init.c b/src/init.c
index 17ddbcd7..0becca62 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1589,6 +1589,11 @@ SEXP savvy_PlRLazyFrame_sink_parquet__impl(SEXP self__, SEXP c_arg__path, SEXP c
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_sink_ipc__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__compression) {
+    SEXP res = savvy_PlRLazyFrame_sink_ipc__ffi(self__, c_arg__path, c_arg__maintain_order, c_arg__compression);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_serialize__impl(SEXP self__) {
     SEXP res = savvy_PlRLazyFrame_serialize__ffi(self__);
     return handle_result(res);
@@ -2252,6 +2257,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1},
     {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1},
     {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 8},
+    {"savvy_PlRLazyFrame_sink_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_ipc__impl, 4},
     {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
diff --git a/src/rust/api.h b/src/rust/api.h
index c2fd38c2..0d9a5d89 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -321,6 +321,7 @@ SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by
 SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size);
+SEXP savvy_PlRLazyFrame_sink_ipc__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__compression);
 SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index 7097c9d9..4130a05c 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -626,3 +626,16 @@ impl TryFrom<ListSexp> for Wrap<StatisticsOptions> {
         Ok(Wrap(out))
     }
 }
+
+impl TryFrom<&str> for Wrap<IpcCompression> {
+    type Error = String;
+
+    fn try_from(compression: &str) -> Result<Self, String> {
+        let parsed = match compression {
+            "lz4" => IpcCompression::LZ4,
+            "zstd" => IpcCompression::ZSTD,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 4211007e..b3ef3e5a 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -353,26 +353,32 @@ impl PlRLazyFrame {
         Ok(())
     }
 
-    // fn sink_ipc(
-    //     &self,
-    //     py: Python,
-    //     path: PathBuf,
-    //     compression: Option<Wrap<IpcCompression>>,
-    //     maintain_order: bool,
-    // ) -> Result<()> {
-    //     let options = IpcWriterOptions {
-    //         compression: compression.map(|c| c.0),
-    //         maintain_order,
-    //     };
+    fn sink_ipc(&self, path: &str, maintain_order: bool, compression: Option<&str>) -> Result<()> {
+        let path: PathBuf = path.into();
 
-    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
-    //     // threads we deadlock.
-    //     py.allow_threads(|| {
-    //         let ldf = self.ldf.clone();
-    //         ldf.sink_ipc(path, options).map_err(RPolarsErr::from)
-    //     })?;
-    //     Ok(())
-    // }
+        let compression: Option<IpcCompression> = match compression {
+            Some(x) => {
+                if x == "uncompressed" {
+                    None
+                } else {
+                    Some(<Wrap<IpcCompression>>::try_from(x)?.0)
+                }
+            }
+
+            None => None,
+        };
+        let options = IpcWriterOptions {
+            compression,
+            maintain_order,
+        };
+
+        let _ = self
+            .ldf
+            .clone()
+            .sink_ipc(path, options)
+            .map_err(RPolarsErr::from);
+        Ok(())
+    }
 
     // fn sink_csv(
     //     &self,

From 17de1fcafdd8436042407c0b7cb71c04ae905baf Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 14:54:59 +0100
Subject: [PATCH 15/71] sink_csv

---
 R/000-wrappers.R                  |   7 ++
 R/lazyframe-frame.R               | 121 ++++++++++++++++++++++++++++++
 src/init.c                        |   6 ++
 src/rust/api.h                    |   1 +
 src/rust/src/conversion/mod.rs    |  44 +++++++++++
 src/rust/src/lazyframe/general.rs | 120 ++++++++++++++++-------------
 6 files changed, 245 insertions(+), 54 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 3a8e7d10..826fd528 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2353,6 +2353,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_sink_csv` <- function(self) {
+  function(`path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `datetime_format` = NULL, `date_format` = NULL, `time_format` = NULL, `float_scientific` = NULL, `float_precision` = NULL, `null_value` = NULL, `quote_style` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_csv__impl, `self`, `path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `datetime_format`, `date_format`, `time_format`, `float_scientific`, `float_precision`, `null_value`, `quote_style`))
+  }
+}
+
 `PlRLazyFrame_serialize` <- function(self) {
   function() {
     .Call(savvy_PlRLazyFrame_serialize__impl, `self`)
@@ -2575,6 +2581,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`profile` <- `PlRLazyFrame_profile`(ptr)
   e$`sink_parquet` <- `PlRLazyFrame_sink_parquet`(ptr)
   e$`sink_ipc` <- `PlRLazyFrame_sink_ipc`(ptr)
+  e$`sink_csv` <- `PlRLazyFrame_sink_csv`(ptr)
   e$`serialize` <- `PlRLazyFrame_serialize`(ptr)
   e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 081a8e8b..5cc1d687 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -2590,3 +2590,124 @@ lazyframe__sink_ipc <- function(
     invisible(self)
   })
 }
+
+#' Evaluate the query in streaming mode and write to a CSV file
+#'
+#' @inherit lazyframe__sink_parquet description params return
+#' @inheritParams rlang::check_dots_empty0
+#' @param include_bom Logical, whether to include UTF-8 BOM in the CSV output.
+#' @param include_header Logical, hether to include header in the CSV output.
+#' @param separator Separate CSV fields with this symbol.
+#' @param line_terminator String used to end each row.
+#' @param quote_char Byte to use as quoting character.
+#' @param batch_size Number of rows that will be processed per thread.
+#' @param datetime_format A format string, with the specifiers defined by the
+#' [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+#' Rust crate. If no format specified, the default fractional-second precision
+#' is inferred from the maximum timeunit found in the frame’s Datetime cols (if
+#' any).
+#' @param date_format A format string, with the specifiers defined by the
+#' [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+#' Rust crate.
+#' @param time_format A format string, with the specifiers defined by the
+#' [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
+#' Rust crate.
+#' @param float_precision Whether to use scientific form always (`TRUE`), never
+#' (`FALSE`), or automatically (`NULL`) for Float32 and Float64 datatypes.
+#' @param null_value A string representing null values (defaulting to the empty
+#' string).
+#' @param quote_style Determines the quoting strategy used. Must be one of:
+#' * `"necessary"` (default): This puts quotes around fields only when
+#'   necessary. They are necessary when fields contain a quote, delimiter or
+#'   record terminator. Quotes are also necessary when writing an empty record
+#'   (which is indistinguishable from a record with one empty field). This is
+#'   the default.
+#' * `"always"`: This puts quotes around every field. Always.
+#' * `"never"`: This never puts quotes around fields, even if that results in
+#'   invalid CSV data (e.g.: by not quoting strings containing the separator).
+#' * `"non_numeric"`: This puts quotes around all fields that are non-numeric.
+#'   Namely, when writing a field that does not parse as a valid float or
+#'   integer, then quotes will be used even if they aren`t strictly necessary.
+#'
+#' @inherit LazyFrame_sink_parquet return
+#' @rdname IO_sink_csv
+#'
+#' @examples
+#' # sink table 'mtcars' from mem to CSV
+#' tmpf <- tempfile()
+#' pl$LazyFrame(mtcars)$sink_csv(tmpf)
+#'
+#' # stream a query end-to-end
+#' tmpf2 <- tempfile()
+#' pl$scan_csv(tmpf)$select(pl$col("cyl") * 2)$sink_csv(tmpf2)
+#'
+#' # load parquet directly into a DataFrame / memory
+#' pl$scan_csv(tmpf2)$collect()
+lazyframe__sink_csv <- function(
+    path,
+    ...,
+    include_bom = FALSE,
+    include_header = TRUE,
+    separator = ",",
+    line_terminator = "\n",
+    quote_char = '"',
+    batch_size = 1024,
+    datetime_format = NULL,
+    date_format = NULL,
+    time_format = NULL,
+    float_precision = NULL,
+    null_value = "",
+    quote_style = "necessary",
+    maintain_order = TRUE,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    no_optimization = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+    quote_style <- arg_match0(
+      quote_style,
+      values = c("necessary", "always", "never", "non_numeric")
+    )
+
+    if (isTRUE(no_optimization)) {
+      predicate_pushdown <- FALSE
+      projection_pushdown <- FALSE
+      slice_pushdown <- FALSE
+    }
+
+    lf <- self$`_ldf`$optimization_toggle(
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = FALSE,
+      comm_subexpr_elim = FALSE,
+      cluster_with_columns = FALSE,
+      streaming = FALSE,
+      `_eager` = FALSE
+    )
+
+    lf$sink_csv(
+      path = path,
+      include_bom = include_bom,
+      include_header = include_header,
+      separator = separator,
+      line_terminator = line_terminator,
+      quote_char = quote_char,
+      batch_size = batch_size,
+      datetime_format = datetime_format,
+      date_format = date_format,
+      time_format = time_format,
+      float_precision = float_precision,
+      null_value = null_value,
+      quote_style = quote_style,
+      maintain_order = maintain_order
+    )
+
+    invisible(self)
+  })
+}
diff --git a/src/init.c b/src/init.c
index 0becca62..6e837e0c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1594,6 +1594,11 @@ SEXP savvy_PlRLazyFrame_sink_ipc__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_sink_csv__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style) {
+    SEXP res = savvy_PlRLazyFrame_sink_csv__ffi(self__, c_arg__path, c_arg__include_bom, c_arg__include_header, c_arg__separator, c_arg__line_terminator, c_arg__quote_char, c_arg__maintain_order, c_arg__batch_size, c_arg__datetime_format, c_arg__date_format, c_arg__time_format, c_arg__float_scientific, c_arg__float_precision, c_arg__null_value, c_arg__quote_style);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_serialize__impl(SEXP self__) {
     SEXP res = savvy_PlRLazyFrame_serialize__ffi(self__);
     return handle_result(res);
@@ -2258,6 +2263,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1},
     {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 8},
     {"savvy_PlRLazyFrame_sink_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_ipc__impl, 4},
+    {"savvy_PlRLazyFrame_sink_csv__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_csv__impl, 16},
     {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
diff --git a/src/rust/api.h b/src/rust/api.h
index 0d9a5d89..b1c01163 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -322,6 +322,7 @@ SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size);
 SEXP savvy_PlRLazyFrame_sink_ipc__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__compression);
+SEXP savvy_PlRLazyFrame_sink_csv__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style);
 SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index 4130a05c..43bd6745 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -1,3 +1,5 @@
+use std::num::NonZeroUsize;
+
 use crate::prelude::*;
 use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame};
 use polars::series::ops::NullBehavior;
@@ -224,6 +226,19 @@ impl TryFrom<NumericScalar> for Wrap<usize> {
     }
 }
 
+impl TryFrom<NumericScalar> for Wrap<NonZeroUsize> {
+    type Error = savvy::Error;
+
+    fn try_from(n: NumericScalar) -> Result<Self, savvy::Error> {
+        let n = n.as_usize()?;
+        if n == 0 {
+            Err("Cannot convert to non-zero usize.".into())
+        } else {
+            Ok(Wrap(NonZeroUsize::new(n).unwrap()))
+        }
+    }
+}
+
 impl TryFrom<NumericSexp> for Wrap<Vec<usize>> {
     type Error = savvy::Error;
 
@@ -639,3 +654,32 @@ impl TryFrom<&str> for Wrap<IpcCompression> {
         Ok(Wrap(parsed))
     }
 }
+
+impl TryFrom<&str> for Wrap<QuoteStyle> {
+    type Error = String;
+
+    fn try_from(compression: &str) -> Result<Self, String> {
+        let parsed = match compression {
+            "always" => QuoteStyle::Always,
+            "necessary" => QuoteStyle::Necessary,
+            "non_numeric" => QuoteStyle::NonNumeric,
+            "never" => QuoteStyle::Never,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
+    }
+}
+
+impl TryFrom<&str> for Wrap<u8> {
+    type Error = String;
+
+    fn try_from(string: &str) -> Result<Self, String> {
+        let mut utf8_byte_iter = string.as_bytes().iter();
+        match (utf8_byte_iter.next(), utf8_byte_iter.next()) {
+            (Some(s), None) => Ok(Wrap(*s)),
+            (None, None) => Err(format!("cannot extract single byte from empty string")),
+            (Some(_), Some(_)) => Err(format!("multi byte-string not allowed")),
+            (None, Some(_)) => unreachable!("the iter() cannot yield Some after None(depleted)"),
+        }
+    }
+}
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index b3ef3e5a..7abc88d0 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -4,6 +4,7 @@ use savvy::{
     savvy, ListSexp, LogicalSexp, NumericScalar, OwnedListSexp, OwnedStringSexp, Result, Sexp,
     StringSexp,
 };
+use std::num::NonZeroUsize;
 use std::path::PathBuf;
 
 #[savvy]
@@ -380,70 +381,81 @@ impl PlRLazyFrame {
         Ok(())
     }
 
-    // fn sink_csv(
-    //     &self,
-    //     py: Python,
-    //     path: PathBuf,
-    //     include_bom: bool,
-    //     include_header: bool,
-    //     separator: u8,
-    //     line_terminator: String,
-    //     quote_char: u8,
-    //     batch_size: NonZeroUsize,
-    //     datetime_format: Option<String>,
-    //     date_format: Option<String>,
-    //     time_format: Option<String>,
-    //     float_scientific: Option<bool>,
-    //     float_precision: Option<usize>,
-    //     null_value: Option<String>,
-    //     quote_style: Option<Wrap<QuoteStyle>>,
-    //     maintain_order: bool,
-    // ) -> Result<()> {
-    //     let quote_style = quote_style.map_or(QuoteStyle::default(), |wrap| wrap.0);
-    //     let null_value = null_value.unwrap_or(SerializeOptions::default().null);
-
-    //     let serialize_options = SerializeOptions {
-    //         date_format,
-    //         time_format,
-    //         datetime_format,
-    //         float_scientific,
-    //         float_precision,
-    //         separator,
-    //         quote_char,
-    //         null: null_value,
-    //         line_terminator,
-    //         quote_style,
-    //     };
+    fn sink_csv(
+        &self,
+        path: &str,
+        include_bom: bool,
+        include_header: bool,
+        separator: &str,
+        line_terminator: &str,
+        quote_char: &str,
+        maintain_order: bool,
+        batch_size: NumericScalar,
+        datetime_format: Option<&str>,
+        date_format: Option<&str>,
+        time_format: Option<&str>,
+        float_scientific: Option<bool>,
+        float_precision: Option<NumericScalar>,
+        null_value: Option<&str>,
+        quote_style: Option<&str>,
+    ) -> Result<()> {
+        let path: PathBuf = path.into();
+        let quote_style = match quote_style {
+            Some(x) => <Wrap<QuoteStyle>>::try_from(x)?.0,
+            None => QuoteStyle::default(),
+        };
+        let null_value = null_value
+            .map(|x| x.to_string())
+            .unwrap_or(SerializeOptions::default().null);
+        let batch_size = <Wrap<NonZeroUsize>>::try_from(batch_size)?.0;
+        let float_precision = match float_precision {
+            Some(x) => Some(<Wrap<usize>>::try_from(x)?.0),
+            None => None,
+        };
+        let separator = <Wrap<u8>>::try_from(separator)?.0;
+        let quote_char = <Wrap<u8>>::try_from(quote_char)?.0;
+
+        let serialize_options = SerializeOptions {
+            date_format: date_format.map(|x| x.to_string()),
+            time_format: time_format.map(|x| x.to_string()),
+            datetime_format: datetime_format.map(|x| x.to_string()),
+            float_scientific,
+            float_precision,
+            separator,
+            quote_char,
+            null: null_value.to_string(),
+            line_terminator: line_terminator.to_string(),
+            quote_style,
+        };
 
-    //     let options = CsvWriterOptions {
-    //         include_bom,
-    //         include_header,
-    //         maintain_order,
-    //         batch_size,
-    //         serialize_options,
-    //     };
+        let options = CsvWriterOptions {
+            include_bom,
+            include_header,
+            maintain_order,
+            batch_size,
+            serialize_options,
+        };
+
+        let _ = self
+            .ldf
+            .clone()
+            .sink_csv(path, options)
+            .map_err(RPolarsErr::from);
+        Ok(())
+    }
+
+    // fn sink_json(&self, py: Python, path: PathBuf, maintain_order: bool) -> Result<()> {
+    //     let options = JsonWriterOptions { maintain_order };
 
     //     // if we don't allow threads and we have udfs trying to acquire the gil from different
     //     // threads we deadlock.
     //     py.allow_threads(|| {
     //         let ldf = self.ldf.clone();
-    //         ldf.sink_csv(path, options).map_err(RPolarsErr::from)
+    //         ldf.sink_json(path, options).map_err(RPolarsErr::from)
     //     })?;
     //     Ok(())
     // }
 
-    //     fn sink_json(&self, py: Python, path: PathBuf, maintain_order: bool) -> Result<()> {
-    //         let options = JsonWriterOptions { maintain_order };
-
-    //         // if we don't allow threads and we have udfs trying to acquire the gil from different
-    //         // threads we deadlock.
-    //         py.allow_threads(|| {
-    //             let ldf = self.ldf.clone();
-    //             ldf.sink_json(path, options).map_err(RPolarsErr::from)
-    //         })?;
-    //         Ok(())
-    //     }
-
     fn serialize(&self) -> Result<Sexp> {
         let dump = serde_json::to_string(&self.ldf.logical_plan)
             .map_err(|err| RPolarsErr::Other(err.to_string()))?;

From 8a53cb9e37f989cfaf6a79597aca9cac97ee96bf Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 15:01:06 +0100
Subject: [PATCH 16/71] sink_ndjson

---
 R/000-wrappers.R                  |  7 ++++
 R/lazyframe-frame.R               | 55 ++++++++++++++++++++++++++++++-
 src/init.c                        |  6 ++++
 src/rust/api.h                    |  1 +
 src/rust/src/lazyframe/general.rs | 22 ++++++-------
 5 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 826fd528..d7285703 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2359,6 +2359,12 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_sink_json` <- function(self) {
+  function(`path`, `maintain_order`) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_json__impl, `self`, `path`, `maintain_order`))
+  }
+}
+
 `PlRLazyFrame_serialize` <- function(self) {
   function() {
     .Call(savvy_PlRLazyFrame_serialize__impl, `self`)
@@ -2582,6 +2588,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`sink_parquet` <- `PlRLazyFrame_sink_parquet`(ptr)
   e$`sink_ipc` <- `PlRLazyFrame_sink_ipc`(ptr)
   e$`sink_csv` <- `PlRLazyFrame_sink_csv`(ptr)
+  e$`sink_json` <- `PlRLazyFrame_sink_json`(ptr)
   e$`serialize` <- `PlRLazyFrame_serialize`(ptr)
   e$`select_seq` <- `PlRLazyFrame_select_seq`(ptr)
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 5cc1d687..5d682d09 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -2629,7 +2629,6 @@ lazyframe__sink_ipc <- function(
 #'   Namely, when writing a field that does not parse as a valid float or
 #'   integer, then quotes will be used even if they aren`t strictly necessary.
 #'
-#' @inherit LazyFrame_sink_parquet return
 #' @rdname IO_sink_csv
 #'
 #' @examples
@@ -2711,3 +2710,57 @@ lazyframe__sink_csv <- function(
     invisible(self)
   })
 }
+
+#' Evaluate the query in streaming mode and write to an NDJSON file
+#'
+#' @inherit lazyframe__sink_parquet description params return
+#' @inheritParams rlang::check_dots_empty0
+#'
+#' @rdname IO_sink_ndjson
+#'
+#' @examples
+#' # sink table 'mtcars' from mem to NDJSON
+#' tmpf <- tempfile(fileext = ".ndjson")
+#' pl$LazyFrame(mtcars)$sink_ndjson(tmpf)
+#'
+#' # load parquet directly into a DataFrame / memory
+#' pl$scan_ndjson(tmpf)$collect()
+lazyframe__sink_ndjson <- function(
+    path,
+    ...,
+    maintain_order = TRUE,
+    type_coercion = TRUE,
+    predicate_pushdown = TRUE,
+    projection_pushdown = TRUE,
+    simplify_expression = TRUE,
+    slice_pushdown = TRUE,
+    no_optimization = FALSE) {
+  wrap({
+    check_dots_empty0(...)
+    if (isTRUE(no_optimization)) {
+      predicate_pushdown <- FALSE
+      projection_pushdown <- FALSE
+      slice_pushdown <- FALSE
+    }
+
+    lf <- self$`_ldf`$optimization_toggle(
+      type_coercion = type_coercion,
+      predicate_pushdown = predicate_pushdown,
+      projection_pushdown = projection_pushdown,
+      simplify_expression = simplify_expression,
+      slice_pushdown = slice_pushdown,
+      comm_subplan_elim = FALSE,
+      comm_subexpr_elim = FALSE,
+      cluster_with_columns = FALSE,
+      streaming = FALSE,
+      `_eager` = FALSE
+    )
+
+    lf$sink_json(
+      path = path,
+      maintain_order = maintain_order
+    )
+
+    invisible(self)
+  })
+}
diff --git a/src/init.c b/src/init.c
index 6e837e0c..1e915a13 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1599,6 +1599,11 @@ SEXP savvy_PlRLazyFrame_sink_csv__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_sink_json__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order) {
+    SEXP res = savvy_PlRLazyFrame_sink_json__ffi(self__, c_arg__path, c_arg__maintain_order);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_serialize__impl(SEXP self__) {
     SEXP res = savvy_PlRLazyFrame_serialize__ffi(self__);
     return handle_result(res);
@@ -2264,6 +2269,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 8},
     {"savvy_PlRLazyFrame_sink_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_ipc__impl, 4},
     {"savvy_PlRLazyFrame_sink_csv__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_csv__impl, 16},
+    {"savvy_PlRLazyFrame_sink_json__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_json__impl, 3},
     {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
diff --git a/src/rust/api.h b/src/rust/api.h
index b1c01163..3362a6c9 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -323,6 +323,7 @@ SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size);
 SEXP savvy_PlRLazyFrame_sink_ipc__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__compression);
 SEXP savvy_PlRLazyFrame_sink_csv__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style);
+SEXP savvy_PlRLazyFrame_sink_json__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order);
 SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 7abc88d0..32df80db 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -444,17 +444,17 @@ impl PlRLazyFrame {
         Ok(())
     }
 
-    // fn sink_json(&self, py: Python, path: PathBuf, maintain_order: bool) -> Result<()> {
-    //     let options = JsonWriterOptions { maintain_order };
-
-    //     // if we don't allow threads and we have udfs trying to acquire the gil from different
-    //     // threads we deadlock.
-    //     py.allow_threads(|| {
-    //         let ldf = self.ldf.clone();
-    //         ldf.sink_json(path, options).map_err(RPolarsErr::from)
-    //     })?;
-    //     Ok(())
-    // }
+    fn sink_json(&self, path: &str, maintain_order: bool) -> Result<()> {
+        let path: PathBuf = path.into();
+        let options = JsonWriterOptions { maintain_order };
+
+        let _ = self
+            .ldf
+            .clone()
+            .sink_json(path, options)
+            .map_err(RPolarsErr::from);
+        Ok(())
+    }
 
     fn serialize(&self) -> Result<Sexp> {
         let dump = serde_json::to_string(&self.ldf.logical_plan)

From 9fff00f24c1036c6dc0fed52fe3c4f601abc009c Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 15:02:56 +0100
Subject: [PATCH 17/71] redoc

---
 man/IO_sink_csv.Rd               | 120 +++++++++++++++++++++++++++++++
 man/IO_sink_ipc.Rd               |  69 ++++++++++++++++++
 man/IO_sink_ndjson.Rd            |  57 +++++++++++++++
 man/IO_sink_parquet.Rd           | 105 +++++++++++++++++++++++++++
 man/lazyframe__collect_schema.Rd |   3 +-
 man/lazyframe__profile.Rd        |  61 ++++++++++++++++
 man/lazyframe__serialize.Rd      |  18 +++++
 man/pl.Rd                        |   2 +-
 man/pl__deserialize_lf.Rd        |  22 ++++++
 9 files changed, 455 insertions(+), 2 deletions(-)
 create mode 100644 man/IO_sink_csv.Rd
 create mode 100644 man/IO_sink_ipc.Rd
 create mode 100644 man/IO_sink_ndjson.Rd
 create mode 100644 man/IO_sink_parquet.Rd
 create mode 100644 man/lazyframe__serialize.Rd
 create mode 100644 man/pl__deserialize_lf.Rd

diff --git a/man/IO_sink_csv.Rd b/man/IO_sink_csv.Rd
new file mode 100644
index 00000000..0ca3a8f5
--- /dev/null
+++ b/man/IO_sink_csv.Rd
@@ -0,0 +1,120 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sink_csv}
+\alias{lazyframe__sink_csv}
+\title{Evaluate the query in streaming mode and write to a CSV file}
+\usage{
+lazyframe__sink_csv(
+  path,
+  ...,
+  include_bom = FALSE,
+  include_header = TRUE,
+  separator = ",",
+  line_terminator = "\\n",
+  quote_char = "\\"",
+  batch_size = 1024,
+  datetime_format = NULL,
+  date_format = NULL,
+  time_format = NULL,
+  float_precision = NULL,
+  null_value = "",
+  quote_style = "necessary",
+  maintain_order = TRUE,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  no_optimization = FALSE
+)
+}
+\arguments{
+\item{path}{A character. File path to which the file should be written.}
+
+\item{...}{Dots which should be empty.}
+
+\item{include_bom}{Logical, whether to include UTF-8 BOM in the CSV output.}
+
+\item{include_header}{Logical, hether to include header in the CSV output.}
+
+\item{separator}{Separate CSV fields with this symbol.}
+
+\item{line_terminator}{String used to end each row.}
+
+\item{quote_char}{Byte to use as quoting character.}
+
+\item{batch_size}{Number of rows that will be processed per thread.}
+
+\item{datetime_format}{A format string, with the specifiers defined by the
+\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{chrono}
+Rust crate. If no format specified, the default fractional-second precision
+is inferred from the maximum timeunit found in the frame’s Datetime cols (if
+any).}
+
+\item{date_format}{A format string, with the specifiers defined by the
+\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{chrono}
+Rust crate.}
+
+\item{time_format}{A format string, with the specifiers defined by the
+\href{https://docs.rs/chrono/latest/chrono/format/strftime/index.html}{chrono}
+Rust crate.}
+
+\item{float_precision}{Whether to use scientific form always (\code{TRUE}), never
+(\code{FALSE}), or automatically (\code{NULL}) for Float32 and Float64 datatypes.}
+
+\item{null_value}{A string representing null values (defaulting to the empty
+string).}
+
+\item{quote_style}{Determines the quoting strategy used. Must be one of:
+\itemize{
+\item \code{"necessary"} (default): This puts quotes around fields only when
+necessary. They are necessary when fields contain a quote, delimiter or
+record terminator. Quotes are also necessary when writing an empty record
+(which is indistinguishable from a record with one empty field). This is
+the default.
+\item \code{"always"}: This puts quotes around every field. Always.
+\item \code{"never"}: This never puts quotes around fields, even if that results in
+invalid CSV data (e.g.: by not quoting strings containing the separator).
+\item \code{"non_numeric"}: This puts quotes around all fields that are non-numeric.
+Namely, when writing a field that does not parse as a valid float or
+integer, then quotes will be used even if they aren`t strictly necessary.
+}}
+
+\item{maintain_order}{Maintain the order in which data is processed. Setting
+this to \code{FALSE} will be slightly faster.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+}
+\value{
+Invisibly returns the input LazyFrame
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+This allows streaming results that are larger than RAM to be written to disk.
+}
+\examples{
+# sink table 'mtcars' from mem to CSV
+tmpf <- tempfile()
+pl$LazyFrame(mtcars)$sink_csv(tmpf)
+
+# stream a query end-to-end
+tmpf2 <- tempfile()
+pl$scan_csv(tmpf)$select(pl$col("cyl") * 2)$sink_csv(tmpf2)
+
+# load parquet directly into a DataFrame / memory
+pl$scan_csv(tmpf2)$collect()
+}
diff --git a/man/IO_sink_ipc.Rd b/man/IO_sink_ipc.Rd
new file mode 100644
index 00000000..3d8dfb1d
--- /dev/null
+++ b/man/IO_sink_ipc.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sink_ipc}
+\alias{lazyframe__sink_ipc}
+\title{Evaluate the query in streaming mode and write to an IPC file}
+\usage{
+lazyframe__sink_ipc(
+  path,
+  ...,
+  compression = c("zstd", "lz4", "uncompressed"),
+  maintain_order = TRUE,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  no_optimization = FALSE
+)
+}
+\arguments{
+\item{path}{A character. File path to which the file should be written.}
+
+\item{...}{Dots which should be empty.}
+
+\item{compression}{\code{NULL} or one of:
+\itemize{
+\item \code{"uncompressed"}: same as \code{NULL}.
+\item \code{"lz4"}: fast compression/decompression.
+\item \code{"zstd"}: good compression performance.
+}}
+
+\item{maintain_order}{Maintain the order in which data is processed. Setting
+this to \code{FALSE} will be slightly faster.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+}
+\value{
+Invisibly returns the input LazyFrame
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+This allows streaming results that are larger than RAM to be written to disk.
+}
+\examples{
+# sink table 'mtcars' from mem to ipc
+tmpf <- tempfile()
+as_polars_lf(mtcars)$sink_ipc(tmpf)
+
+# stream a query end-to-end (not supported yet, https://github.com/pola-rs/polars/issues/1040)
+# tmpf2 = tempfile()
+# pl$scan_ipc(tmpf)$select(pl$col("cyl") * 2)$sink_ipc(tmpf2)
+
+# load ipc directly into a DataFrame / memory
+# pl$scan_ipc(tmpf2)$collect()
+}
diff --git a/man/IO_sink_ndjson.Rd b/man/IO_sink_ndjson.Rd
new file mode 100644
index 00000000..8885e03c
--- /dev/null
+++ b/man/IO_sink_ndjson.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sink_ndjson}
+\alias{lazyframe__sink_ndjson}
+\title{Evaluate the query in streaming mode and write to an NDJSON file}
+\usage{
+lazyframe__sink_ndjson(
+  path,
+  ...,
+  maintain_order = TRUE,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  no_optimization = FALSE
+)
+}
+\arguments{
+\item{path}{A character. File path to which the file should be written.}
+
+\item{...}{Dots which should be empty.}
+
+\item{maintain_order}{Maintain the order in which data is processed. Setting
+this to \code{FALSE} will be slightly faster.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+}
+\value{
+Invisibly returns the input LazyFrame
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+This allows streaming results that are larger than RAM to be written to disk.
+}
+\examples{
+# sink table 'mtcars' from mem to NDJSON
+tmpf <- tempfile(fileext = ".ndjson")
+pl$LazyFrame(mtcars)$sink_ndjson(tmpf)
+
+# load parquet directly into a DataFrame / memory
+pl$scan_ndjson(tmpf)$collect()
+}
diff --git a/man/IO_sink_parquet.Rd b/man/IO_sink_parquet.Rd
new file mode 100644
index 00000000..e74481ff
--- /dev/null
+++ b/man/IO_sink_parquet.Rd
@@ -0,0 +1,105 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__sink_parquet}
+\alias{lazyframe__sink_parquet}
+\title{Evaluate the query in streaming mode and write to a Parquet file}
+\usage{
+lazyframe__sink_parquet(
+  path,
+  ...,
+  compression = "zstd",
+  compression_level = 3,
+  statistics = TRUE,
+  row_group_size = NULL,
+  data_page_size = NULL,
+  maintain_order = TRUE,
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  no_optimization = FALSE
+)
+}
+\arguments{
+\item{path}{A character. File path to which the file should be written.}
+
+\item{...}{Dots which should be empty.}
+
+\item{compression}{The compression method. Must be one of:
+\itemize{
+\item \code{"lz4"}: fast compression/decompression.
+\item \code{"uncompressed"}
+\item \code{"snappy"}: this guarantees that the parquet file will be compatible with
+older parquet readers.
+\item \code{"gzip"}
+\item \code{"lzo"}
+\item \code{"brotli"}
+\item \code{"zstd"}: good compression performance.
+}}
+
+\item{compression_level}{\code{NULL} or integer. The level of compression to use.
+Only used if method is one of \code{"gzip"}, \code{"brotli"}, or \code{"zstd"}. Higher
+compression means smaller files on disk:
+\itemize{
+\item \code{"gzip"}: min-level: 0, max-level: 10.
+\item \code{"brotli"}: min-level: 0, max-level: 11.
+\item \code{"zstd"}: min-level: 1, max-level: 22.
+}}
+
+\item{statistics}{Whether statistics should be written to the Parquet
+headers. Possible values:
+\itemize{
+\item \code{TRUE}: enable default set of statistics (default)
+\item \code{FALSE}: disable all statistics
+\item \code{"full"}: calculate and write all available statistics.
+\item A named list where all values must be \code{TRUE} or \code{FALSE}, e.g.
+\code{list(min = TRUE, max = FALSE)}. Statistics available are \code{"min"}, \code{"max"},
+\code{"distinct_count"}, \code{"null_count"}.
+}}
+
+\item{row_group_size}{Size of the row groups in number of rows. If \code{NULL}
+(default), the chunks of the DataFrame are used. Writing in smaller chunks
+may reduce memory pressure and improve writing speeds.}
+
+\item{data_page_size}{Size of the data page in bytes. If \code{NULL} (default), it
+is set to 1024^2 bytes.}
+
+\item{maintain_order}{Maintain the order in which data is processed. Setting
+this to \code{FALSE} will be slightly faster.}
+
+\item{type_coercion}{Logical. Coerce types such that operations succeed and
+run on minimal required memory.}
+
+\item{predicate_pushdown}{Logical. Applies filters as early as possible at
+scan level.}
+
+\item{projection_pushdown}{Logical. Select only the columns that are needed
+at the scan level.}
+
+\item{simplify_expression}{Logical. Various optimizations, such as constant
+folding and replacing expensive operations with faster alternatives.}
+
+\item{slice_pushdown}{Logical. Only load the required slice from the scan
+level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+}
+\value{
+Invisibly returns the input LazyFrame
+}
+\description{
+\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+This allows streaming results that are larger than RAM to be written to disk.
+}
+\examples{
+# sink table 'mtcars' from mem to parquet
+tmpf <- tempfile()
+as_polars_lf(mtcars)$sink_parquet(tmpf)
+
+# stream a query end-to-end
+tmpf2 <- tempfile()
+pl$scan_parquet(tmpf)$select(pl$col("cyl") * 2)$sink_parquet(tmpf2)
+
+# load parquet directly into a DataFrame / memory
+pl$scan_parquet(tmpf2)$collect()
+}
diff --git a/man/lazyframe__collect_schema.Rd b/man/lazyframe__collect_schema.Rd
index 4d46c94c..6a59a51f 100644
--- a/man/lazyframe__collect_schema.Rd
+++ b/man/lazyframe__collect_schema.Rd
@@ -23,6 +23,7 @@ lf <- pl$LazyFrame(
 lf$collect_schema()
 
 lf$with_columns(
-  baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String)
+  baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String),
+  pl$col("bar")$cast(pl$Int64)
 )$collect_schema()
 }
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
index 11210994..8223fe3a 100644
--- a/man/lazyframe__profile.Rd
+++ b/man/lazyframe__profile.Rd
@@ -4,6 +4,22 @@
 \alias{lazyframe__profile}
 \title{Collect and profile a lazy query.}
 \usage{
+lazyframe__profile(
+  type_coercion = TRUE,
+  predicate_pushdown = TRUE,
+  projection_pushdown = TRUE,
+  simplify_expression = TRUE,
+  slice_pushdown = TRUE,
+  comm_subplan_elim = TRUE,
+  comm_subexpr_elim = TRUE,
+  cluster_with_columns = TRUE,
+  streaming = FALSE,
+  no_optimization = FALSE,
+  collect_in_background = FALSE,
+  show_plot = FALSE,
+  truncate_nodes = 0
+)
+
 lazyframe__profile(
   type_coercion = TRUE,
   predicate_pushdown = TRUE,
@@ -55,16 +71,26 @@ the entire query is processed in a single batch.}
 number of characters. If \code{0} (default), do not truncate.}
 }
 \value{
+List of two \code{DataFrame}s: one with the collected result, the other
+with the timings of each step. If \code{show_graph = TRUE}, then the plot is
+also stored in the list.
+
 List of two \code{DataFrame}s: one with the collected result, the other
 with the timings of each step. If \code{show_graph = TRUE}, then the plot is
 also stored in the list.
 }
 \description{
+This will run the query and return a list containing the materialized
+DataFrame and a DataFrame that contains profiling information of each node
+that is executed.
+
 This will run the query and return a list containing the
 materialized DataFrame and a DataFrame that contains profiling information
 of each node that is executed.
 }
 \details{
+The units of the timings are microseconds.
+
 The units of the timings are microseconds.
 }
 \examples{
@@ -73,6 +99,31 @@ pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
 
 ## Use $profile() to compare two queries
 
+# -1-  map each Species-group with native polars
+pl$LazyFrame(iris)$
+  sort("Sepal.Length")$
+  group_by("Species", maintain_order = TRUE)$
+  agg(pl$col(pl$Float64)$first() + 5)$
+  profile()
+
+# -2-  map each Species-group of each numeric column with an R function
+
+# some R function, prints `.` for each time called by polars
+r_func <- \(s) {
+  cat(".")
+  s$to_r()[1] + 5
+}
+
+pl$LazyFrame(iris)$
+  sort("Sepal.Length")$
+  group_by("Species", maintain_order = TRUE)$
+  agg(pl$col(pl$Float64)$map_elements(r_func))$
+  profile()
+## Simplest use case
+pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
+
+## Use $profile() to compare two queries
+
 # -1-  map each Species-group with native polars, takes ~120us only
 as_polars_lf(iris)$
   sort("Sepal.Length")$
@@ -95,6 +146,16 @@ as_polars_lf(iris)$
   profile()
 }
 \seealso{
+\itemize{
+\item \code{\link[=LazyFrame_collect]{$collect()}} - regular collect.
+\item \code{\link[=LazyFrame_fetch]{$fetch()}} - fast limited query check
+\item \code{\link[=LazyFrame_collect_in_background]{$collect_in_background()}} - non-blocking
+collect returns a future handle. Can also just be used via
+\verb{$collect(collect_in_background = TRUE)}.
+\item \code{\link[=LazyFrame_sink_parquet]{$sink_parquet()}} streams query to a parquet file.
+\item \code{\link[=LazyFrame_sink_ipc]{$sink_ipc()}} streams query to a arrow file.
+}
+
 \itemize{
 \item \code{\link[=lazyframe__collect]{$collect()}} - regular collect.
 \item \code{\link[=lazyframe__fetch]{$fetch()}} - fast limited query check
diff --git a/man/lazyframe__serialize.Rd b/man/lazyframe__serialize.Rd
new file mode 100644
index 00000000..f532da6c
--- /dev/null
+++ b/man/lazyframe__serialize.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__serialize}
+\alias{lazyframe__serialize}
+\title{Serialize the logical plan of this LazyFrame to a string in JSON format}
+\usage{
+lazyframe__serialize()
+}
+\value{
+A character value
+}
+\description{
+Serialize the logical plan of this LazyFrame to a string in JSON format
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:3)$sum()
+lf$serialize()
+}
diff --git a/man/pl.Rd b/man/pl.Rd
index 0efcc18f..9a22c135 100644
--- a/man/pl.Rd
+++ b/man/pl.Rd
@@ -5,7 +5,7 @@
 \alias{pl}
 \title{Polars top-level function namespace}
 \format{
-An object of class \code{polars_object} of length 51.
+An object of class \code{polars_object} of length 52.
 }
 \usage{
 pl
diff --git a/man/pl__deserialize_lf.Rd b/man/pl__deserialize_lf.Rd
new file mode 100644
index 00000000..0d71fb6f
--- /dev/null
+++ b/man/pl__deserialize_lf.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{pl__deserialize_lf}
+\alias{pl__deserialize_lf}
+\title{Read a logical plan from a file to construct a LazyFrame}
+\usage{
+pl__deserialize_lf(source)
+}
+\arguments{
+\item{source}{String containing the LazyFrame logical plan in JSON format.}
+}
+\value{
+A character value
+}
+\description{
+Read a logical plan from a file to construct a LazyFrame
+}
+\examples{
+lf <- pl$LazyFrame(a = 1:3)$sum()
+ser <- lf$serialize()
+pl$deserialize_lf(ser)
+}

From 2f6334da97b9ca910aa2a13f29f1e047516f2174 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 6 Nov 2024 17:07:44 +0100
Subject: [PATCH 18/71] join_asof

---
 R/000-wrappers.R                  |  10 ++
 R/lazyframe-frame.R               | 260 +++++++++++++++++-------------
 man/lazyframe__join_asof.Rd       | 116 ++++++++-----
 src/init.c                        |   6 +
 src/rust/api.h                    |   1 +
 src/rust/src/conversion/mod.rs    |  55 +++++--
 src/rust/src/lazyframe/general.rs |  93 ++++++-----
 7 files changed, 334 insertions(+), 207 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index d7285703..ee44f3cc 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2397,6 +2397,15 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_join_asof` <- function(self) {
+  function(`other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `suffix`, `coalesce`, `strategy`, `left_by` = NULL, `right_by` = NULL, `tolerance` = NULL, `tolerance_str` = NULL) {
+    `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame")
+    `left_on` <- .savvy_extract_ptr(`left_on`, "PlRExpr")
+    `right_on` <- .savvy_extract_ptr(`right_on`, "PlRExpr")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_join_asof__impl, `self`, `other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `suffix`, `coalesce`, `strategy`, `left_by`, `right_by`, `tolerance`, `tolerance_str`))
+  }
+}
+
 `PlRLazyFrame_join` <- function(self) {
   function(`other`, `left_on`, `right_on`, `allow_parallel`, `force_parallel`, `join_nulls`, `how`, `suffix`, `validate`, `coalesce` = NULL) {
     `other` <- .savvy_extract_ptr(`other`, "PlRLazyFrame")
@@ -2594,6 +2603,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`rolling` <- `PlRLazyFrame_rolling`(ptr)
   e$`group_by_dynamic` <- `PlRLazyFrame_group_by_dynamic`(ptr)
   e$`with_context` <- `PlRLazyFrame_with_context`(ptr)
+  e$`join_asof` <- `PlRLazyFrame_join_asof`(ptr)
   e$`join` <- `PlRLazyFrame_join`(ptr)
   e$`join_where` <- `PlRLazyFrame_join_where`(ptr)
   e$`with_columns_seq` <- `PlRLazyFrame_with_columns_seq`(ptr)
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 5d682d09..07a1a10c 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1294,117 +1294,6 @@ lazyframe__join_where <- function(
   })
 }
 
-#' Perform joins on nearest keys
-#'
-#' This is similar to a left-join except that we match on nearest key rather
-#' than equal keys.
-#'
-#' Both tables (DataFrames or LazyFrames) must be sorted by the asof_join key.
-#' @param other LazyFrame
-#' @param ...  Not used, blocks use of further positional arguments
-#' @inheritParams DataFrame_join
-#' @param by Join on these columns before performing asof join. Either a vector
-#' of column names or a list of expressions and/or strings. Use `left_by` and
-#' `right_by` if the column names to match on are different between the two
-#' tables.
-#' @param by_left,by_right Same as `by` but only for the left or the right
-#' table. They must have the same length.
-#' @param strategy Strategy for where to find match:
-#' * "backward" (default): search for the last row in the right table whose `on`
-#'   key is less than or equal to the left key.
-#' * "forward": search for the first row in the right table whose `on` key is
-#'   greater than or equal to the left key.
-#' * "nearest": search for the last row in the right table whose value is nearest
-#'   to the left key. String keys are not currently supported for a nearest
-#'   search.
-#' @param tolerance
-#' Numeric tolerance. By setting this the join will only be done if the near
-#' keys are within this distance. If an asof join is done on columns of dtype
-#' "Date", "Datetime", "Duration" or "Time", use the Polars duration string language.
-#' About the language, see the `Polars duration string language` section for details.
-#'
-#' There may be a circumstance where R types are not sufficient to express a
-#' numeric tolerance. In that case, you can use the expression syntax like
-#' `tolerance = pl$lit(42)$cast(pl$Uint64)`
-#' @param coalesce Coalescing behavior (merging of `on` / `left_on` / `right_on`
-#' columns):
-#' * `TRUE`: Always coalesce join columns;
-#' * `FALSE`: Never coalesce join columns.
-#' Note that joining on any other expressions than `col` will turn off coalescing.
-#'
-#' @inheritSection polars_duration_string  Polars duration string language
-#' @examples #
-#' # create two LazyFrame to join asof
-#' gdp <- pl$LazyFrame(
-#'   date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")),
-#'   gdp = c(4321, 4164, 4411, 4566, 4696),
-#'   group = c("b", "a", "a", "b", "b")
-#' )
-#'
-#' pop <- pl$LazyFrame(
-#'   date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")),
-#'   population = c(82.19, 82.66, 83.12, 83.52),
-#'   group = c("b", "b", "a", "a")
-#' )
-#'
-#' # optional make sure tables are already sorted with "on" join-key
-#' gdp <- gdp$sort("date")
-#' pop <- pop$sort("date")
-#'
-#'
-#' # Left-join_asof LazyFrame pop with gdp on "date"
-#' # Look backward in gdp to find closest matching date
-#' pop$join_asof(gdp, on = "date", strategy = "backward")$collect()
-#'
-#' # .... and forward
-#' pop$join_asof(gdp, on = "date", strategy = "forward")$collect()
-#'
-#' # join by a group: "only look within groups"
-#' pop$join_asof(gdp, on = "date", by = "group", strategy = "backward")$collect()
-#'
-#' # only look 2 weeks and 2 days back
-#' pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d")$collect()
-#'
-#' # only look 11 days back (numeric tolerance depends on polars type, <date> is in days)
-#' pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)$collect()
-lazyframe__join_asof <- function(
-    other,
-    ...,
-    left_on = NULL,
-    right_on = NULL,
-    on = NULL,
-    by_left = NULL,
-    by_right = NULL,
-    by = NULL,
-    strategy = c("backward", "forward", "nearest"),
-    suffix = "_right",
-    tolerance = NULL,
-    allow_parallel = TRUE,
-    force_parallel = FALSE,
-    coalesce = TRUE) {
-  if (!is.null(by)) by_left <- by_right <- by
-  if (!is.null(on)) left_on <- right_on <- on
-  tolerance_str <- if (is.character(tolerance)) tolerance else NULL
-  tolerance_num <- if (!is.character(tolerance)) tolerance else NULL
-
-  self$`_ldf`$join_asof(
-    lf = self,
-    other = other,
-    left_on = left_on,
-    right_on = right_on,
-    left_by = by_left,
-    right_by = by_right,
-    allow_parallel = allow_parallel,
-    force_parallel = force_parallel,
-    suffix = suffix,
-    strategy = strategy,
-    tolerance = tolerance_num,
-    tolerance_str = tolerance_str,
-    coalesce = coalesce
-  )
-}
-
-
 #' Unpivot a LazyFrame from wide to long format
 #'
 #' This function is useful to massage a LazyFrame into a format where one or
@@ -2764,3 +2653,152 @@ lazyframe__sink_ndjson <- function(
     invisible(self)
   })
 }
+
+#' Perform joins on nearest keys
+#'
+#' @description
+#' This is similar to a left-join except that we match on nearest key rather
+#' than equal keys. Both frames must be sorted by the `asof_join` key.
+#'
+#' @inheritParams rlang::check_dots_empty0
+#' @param other LazyFrame to join with.
+#' @inheritParams dataframe__join
+#' @param by Join on these columns before performing asof join. Either a vector
+#' of column names or a list of expressions and/or strings. Use `left_by` and
+#' `right_by` if the column names to match on are different between the two
+#' tables.
+#' @param by_left,by_right Same as `by` but only for the left or the right
+#' table. They must have the same length.
+#' @param strategy Strategy for where to find match:
+#' * `"backward"` (default): search for the last row in the right table whose
+#'   `on` key is less than or equal to the left key.
+#' * `"forward"`: search for the first row in the right table whose `on` key is
+#'   greater than or equal to the left key.
+#' * `"nearest"`: search for the last row in the right table whose value is
+#'   nearest to the left key. String keys are not currently supported for a
+#'   nearest search.
+#' @param tolerance Numeric tolerance. By setting this the join will only be
+#' done if the near keys are within this distance. If an asof join is done on
+#' columns of dtype "Date", "Datetime", "Duration" or "Time", use the Polars
+#' duration string language (see details).
+#'
+#' @param coalesce Coalescing behavior (merging of `on` / `left_on` /
+#' `right_on` columns):
+#' * `TRUE`: Always coalesce join columns;
+#' * `FALSE`: Never coalesce join columns.
+#' Note that joining on any other expressions than `col` will turn off
+#' coalescing.
+#'
+#' @inheritSection polars_duration_string Polars duration string language
+#' @examples
+#' gdp <- pl$LazyFrame(
+#'   date = as.Date(c("2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1", "2020-1-1")),
+#'   gdp = c(4164, 4411, 4566, 4696, 4827)
+#' )
+#'
+#' pop <- pl$LazyFrame(
+#'   date = as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")),
+#'   population = c(82.19, 82.66, 83.12)
+#' )
+#'
+#' # optional make sure tables are already sorted with "on" join-key
+#' gdp <- gdp$sort("date")
+#' pop <- pop$sort("date")
+#'
+#'
+#' # Note how the dates don’t quite match. If we join them using join_asof and
+#' # strategy = 'backward', then each date from population which doesn’t have
+#' # an exact match is matched with the closest earlier date from gdp:
+#' pop$join_asof(gdp, on = "date", strategy = "backward")$collect()
+#'
+#' # Note how:
+#' # - date 2016-03-01 from population is matched with 2016-01-01 from gdp;
+#' # - date 2018-08-01 from population is matched with 2018-01-01 from gdp.
+#' # You can verify this by passing coalesce = FALSE:
+#' pop$join_asof(
+#'   gdp,
+#'   on = "date", strategy = "backward", coalesce = FALSE
+#' )$collect()
+#'
+#' # If we instead use strategy = 'forward', then each date from population
+#' # which doesn’t have an exact match is matched with the closest later date
+#' # from gdp:
+#' pop$join_asof(gdp, on = "date", strategy = "forward")$collect()
+#'
+#' # Note how:
+#' # - date 2016-03-01 from population is matched with 2017-01-01 from gdp;
+#' # - date 2018-08-01 from population is matched with 2019-01-01 from gdp.
+#'
+#' # Finally, strategy = 'nearest' gives us a mix of the two results above, as
+#' # each date from population which doesn’t have an exact match is matched
+#' # with the closest date from gdp, regardless of whether it’s earlier or
+#' # later:
+#' pop$join_asof(gdp, on = "date", strategy = "nearest")$collect()
+#'
+#' # Note how:
+#' # - date 2016-03-01 from population is matched with 2016-01-01 from gdp;
+#' # - date 2018-08-01 from population is matched with 2019-01-01 from gdp.
+#'
+#' # The `by` argument allows joining on another column first, before the asof
+#' # join. In this example we join by country first, then asof join by date, as
+#' # above.
+#' gdp2 <- pl$LazyFrame(
+#'   country = rep(c("Germany", "Netherlands"), each = 5),
+#'   date = rep(
+#'     as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1", "2020-1-1")),
+#'     2
+#'   ),
+#'   gdp = c(4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909)
+#' )$sort("country", "date")
+#' gdp2$collect()
+#'
+#' pop2 <- pl$LazyFrame(
+#'   country = rep(c("Germany", "Netherlands"), each = 3),
+#'   date = rep(as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")), 2),
+#'   population = c(82.19, 82.66, 83.12, 17.11, 17.32, 17.40)
+#' )$sort("country", "date")
+#' pop2$collect()
+#'
+#' pop2$join_asof(
+#'   gdp2,
+#'   by = "country", on = "date", strategy = "nearest"
+#' )$collect()
+lazyframe__join_asof <- function(
+    other,
+    ...,
+    left_on = NULL,
+    right_on = NULL,
+    on = NULL,
+    by_left = NULL,
+    by_right = NULL,
+    by = NULL,
+    strategy = c("backward", "forward", "nearest"),
+    suffix = "_right",
+    tolerance = NULL,
+    allow_parallel = TRUE,
+    force_parallel = FALSE,
+    coalesce = TRUE) {
+  wrap({
+    check_dots_empty0(...)
+    strategy <- arg_match0(strategy, values = c("backward", "forward", "nearest"))
+    if (!is.null(by)) by_left <- by_right <- by
+    if (!is.null(on)) left_on <- right_on <- on
+    tolerance_str <- if (is.character(tolerance)) tolerance else NULL
+    tolerance_num <- if (!is.character(tolerance)) tolerance else NULL
+
+    self$`_ldf`$join_asof(
+      other = other$`_ldf`,
+      left_on = as_polars_expr(left_on)$`_rexpr`,
+      right_on = as_polars_expr(right_on)$`_rexpr`,
+      left_by = by_left,
+      right_by = by_right,
+      allow_parallel = allow_parallel,
+      force_parallel = force_parallel,
+      suffix = suffix,
+      strategy = strategy,
+      tolerance = tolerance_num,
+      tolerance_str = tolerance_str,
+      coalesce = coalesce
+    )
+  })
+}
diff --git a/man/lazyframe__join_asof.Rd b/man/lazyframe__join_asof.Rd
index fbc72693..fe3cacd3 100644
--- a/man/lazyframe__join_asof.Rd
+++ b/man/lazyframe__join_asof.Rd
@@ -22,9 +22,9 @@ lazyframe__join_asof(
 )
 }
 \arguments{
-\item{other}{LazyFrame}
+\item{other}{LazyFrame to join with.}
 
-\item{...}{Not used, blocks use of further positional arguments}
+\item{...}{Dots which should be empty.}
 
 \item{by_left, by_right}{Same as \code{by} but only for the left or the right
 table. They must have the same length.}
@@ -36,38 +36,32 @@ tables.}
 
 \item{strategy}{Strategy for where to find match:
 \itemize{
-\item "backward" (default): search for the last row in the right table whose \code{on}
-key is less than or equal to the left key.
-\item "forward": search for the first row in the right table whose \code{on} key is
+\item \code{"backward"} (default): search for the last row in the right table whose
+\code{on} key is less than or equal to the left key.
+\item \code{"forward"}: search for the first row in the right table whose \code{on} key is
 greater than or equal to the left key.
-\item "nearest": search for the last row in the right table whose value is nearest
-to the left key. String keys are not currently supported for a nearest
-search.
+\item \code{"nearest"}: search for the last row in the right table whose value is
+nearest to the left key. String keys are not currently supported for a
+nearest search.
 }}
 
-\item{tolerance}{Numeric tolerance. By setting this the join will only be done if the near
-keys are within this distance. If an asof join is done on columns of dtype
-"Date", "Datetime", "Duration" or "Time", use the Polars duration string language.
-About the language, see the \verb{Polars duration string language} section for details.
+\item{tolerance}{Numeric tolerance. By setting this the join will only be
+done if the near keys are within this distance. If an asof join is done on
+columns of dtype "Date", "Datetime", "Duration" or "Time", use the Polars
+duration string language (see details).}
 
-There may be a circumstance where R types are not sufficient to express a
-numeric tolerance. In that case, you can use the expression syntax like
-\code{tolerance = pl$lit(42)$cast(pl$Uint64)}}
-
-\item{coalesce}{Coalescing behavior (merging of \code{on} / \code{left_on} / \code{right_on}
-columns):
+\item{coalesce}{Coalescing behavior (merging of \code{on} / \code{left_on} /
+\code{right_on} columns):
 \itemize{
 \item \code{TRUE}: Always coalesce join columns;
 \item \code{FALSE}: Never coalesce join columns.
-Note that joining on any other expressions than \code{col} will turn off coalescing.
+Note that joining on any other expressions than \code{col} will turn off
+coalescing.
 }}
 }
 \description{
 This is similar to a left-join except that we match on nearest key rather
-than equal keys.
-}
-\details{
-Both tables (DataFrames or LazyFrames) must be sorted by the asof_join key.
+than equal keys. Both frames must be sorted by the \code{asof_join} key.
 }
 \section{Polars duration string language}{
 
@@ -97,18 +91,14 @@ Similarly for "calendar week", "calendar month", "calendar quarter", and "calend
 }
 
 \examples{
-#
-# create two LazyFrame to join asof
 gdp <- pl$LazyFrame(
-  date = as.Date(c("2015-1-1", "2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1")),
-  gdp = c(4321, 4164, 4411, 4566, 4696),
-  group = c("b", "a", "a", "b", "b")
+  date = as.Date(c("2016-1-1", "2017-5-1", "2018-1-1", "2019-1-1", "2020-1-1")),
+  gdp = c(4164, 4411, 4566, 4696, 4827)
 )
 
 pop <- pl$LazyFrame(
-  date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")),
-  population = c(82.19, 82.66, 83.12, 83.52),
-  group = c("b", "b", "a", "a")
+  date = as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")),
+  population = c(82.19, 82.66, 83.12)
 )
 
 # optional make sure tables are already sorted with "on" join-key
@@ -116,19 +106,61 @@ gdp <- gdp$sort("date")
 pop <- pop$sort("date")
 
 
-# Left-join_asof LazyFrame pop with gdp on "date"
-# Look backward in gdp to find closest matching date
+# Note how the dates don’t quite match. If we join them using join_asof and
+# strategy = 'backward', then each date from population which doesn’t have
+# an exact match is matched with the closest earlier date from gdp:
 pop$join_asof(gdp, on = "date", strategy = "backward")$collect()
 
-# .... and forward
+# Note how:
+# - date 2016-03-01 from population is matched with 2016-01-01 from gdp;
+# - date 2018-08-01 from population is matched with 2018-01-01 from gdp.
+# You can verify this by passing coalesce = FALSE:
+pop$join_asof(
+  gdp,
+  on = "date", strategy = "backward", coalesce = FALSE
+)$collect()
+
+# If we instead use strategy = 'forward', then each date from population
+# which doesn’t have an exact match is matched with the closest later date
+# from gdp:
 pop$join_asof(gdp, on = "date", strategy = "forward")$collect()
 
-# join by a group: "only look within groups"
-pop$join_asof(gdp, on = "date", by = "group", strategy = "backward")$collect()
-
-# only look 2 weeks and 2 days back
-pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = "2w2d")$collect()
-
-# only look 11 days back (numeric tolerance depends on polars type, <date> is in days)
-pop$join_asof(gdp, on = "date", strategy = "backward", tolerance = 11)$collect()
+# Note how:
+# - date 2016-03-01 from population is matched with 2017-01-01 from gdp;
+# - date 2018-08-01 from population is matched with 2019-01-01 from gdp.
+
+# Finally, strategy = 'nearest' gives us a mix of the two results above, as
+# each date from population which doesn’t have an exact match is matched
+# with the closest date from gdp, regardless of whether it’s earlier or
+# later:
+pop$join_asof(gdp, on = "date", strategy = "nearest")$collect()
+
+# Note how:
+# - date 2016-03-01 from population is matched with 2016-01-01 from gdp;
+# - date 2018-08-01 from population is matched with 2019-01-01 from gdp.
+
+# The `by` argument allows joining on another column first, before the asof
+# join. In this example we join by country first, then asof join by date, as
+# above.
+gdp2 <- pl$LazyFrame(
+  country = rep(c("Germany", "Netherlands"), each = 5),
+  date = rep(
+    as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1", "2020-1-1")),
+    2
+  ),
+  gdp = c(4164, 4411, 4566, 4696, 4827, 784, 833, 914, 910, 909)
+)$sort("country", "date")
+gdp2$collect()
+
+pop2 <- pl$LazyFrame(
+  country = rep(c("Germany", "Netherlands"), each = 3),
+  date = rep(as.Date(c("2016-3-1", "2018-8-1", "2019-1-1")), 2),
+  population = c(82.19, 82.66, 83.12, 17.11, 17.32, 17.40)
+)$sort("country", "date")
+pop2$collect()
+
+pop2$join_asof(
+  gdp2,
+  by = "country", on = "date", strategy = "nearest"
+)$collect()
 }
diff --git a/src/init.c b/src/init.c
index 1e915a13..08bc3f21 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1629,6 +1629,11 @@ SEXP savvy_PlRLazyFrame_with_context__impl(SEXP self__, SEXP c_arg__contexts) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_join_asof__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__suffix, SEXP c_arg__coalesce, SEXP c_arg__strategy, SEXP c_arg__left_by, SEXP c_arg__right_by, SEXP c_arg__tolerance, SEXP c_arg__tolerance_str) {
+    SEXP res = savvy_PlRLazyFrame_join_asof__ffi(self__, c_arg__other, c_arg__left_on, c_arg__right_on, c_arg__allow_parallel, c_arg__force_parallel, c_arg__suffix, c_arg__coalesce, c_arg__strategy, c_arg__left_by, c_arg__right_by, c_arg__tolerance, c_arg__tolerance_str);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_join__impl(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__join_nulls, SEXP c_arg__how, SEXP c_arg__suffix, SEXP c_arg__validate, SEXP c_arg__coalesce) {
     SEXP res = savvy_PlRLazyFrame_join__ffi(self__, c_arg__other, c_arg__left_on, c_arg__right_on, c_arg__allow_parallel, c_arg__force_parallel, c_arg__join_nulls, c_arg__how, c_arg__suffix, c_arg__validate, c_arg__coalesce);
     return handle_result(res);
@@ -2275,6 +2280,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
     {"savvy_PlRLazyFrame_group_by_dynamic__impl", (DL_FUNC) &savvy_PlRLazyFrame_group_by_dynamic__impl, 10},
     {"savvy_PlRLazyFrame_with_context__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_context__impl, 2},
+    {"savvy_PlRLazyFrame_join_asof__impl", (DL_FUNC) &savvy_PlRLazyFrame_join_asof__impl, 13},
     {"savvy_PlRLazyFrame_join__impl", (DL_FUNC) &savvy_PlRLazyFrame_join__impl, 11},
     {"savvy_PlRLazyFrame_join_where__impl", (DL_FUNC) &savvy_PlRLazyFrame_join_where__impl, 4},
     {"savvy_PlRLazyFrame_with_columns_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_with_columns_seq__impl, 2},
diff --git a/src/rust/api.h b/src/rust/api.h
index 3362a6c9..c1567172 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -329,6 +329,7 @@ SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
 SEXP savvy_PlRLazyFrame_group_by_dynamic__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__every, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__label, SEXP c_arg__include_boundaries, SEXP c_arg__closed, SEXP c_arg__group_by, SEXP c_arg__start_by);
 SEXP savvy_PlRLazyFrame_with_context__ffi(SEXP self__, SEXP c_arg__contexts);
+SEXP savvy_PlRLazyFrame_join_asof__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__suffix, SEXP c_arg__coalesce, SEXP c_arg__strategy, SEXP c_arg__left_by, SEXP c_arg__right_by, SEXP c_arg__tolerance, SEXP c_arg__tolerance_str);
 SEXP savvy_PlRLazyFrame_join__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__left_on, SEXP c_arg__right_on, SEXP c_arg__allow_parallel, SEXP c_arg__force_parallel, SEXP c_arg__join_nulls, SEXP c_arg__how, SEXP c_arg__suffix, SEXP c_arg__validate, SEXP c_arg__coalesce);
 SEXP savvy_PlRLazyFrame_join_where__ffi(SEXP self__, SEXP c_arg__other, SEXP c_arg__predicates, SEXP c_arg__suffix);
 SEXP savvy_PlRLazyFrame_with_columns_seq__ffi(SEXP self__, SEXP c_arg__exprs);
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index 43bd6745..e9ebabb6 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -3,9 +3,9 @@ use std::num::NonZeroUsize;
 use crate::prelude::*;
 use crate::{PlRDataFrame, PlRDataType, PlRExpr, PlRLazyFrame};
 use polars::series::ops::NullBehavior;
-use savvy::{ListSexp, NumericScalar, NumericSexp, NumericTypedSexp, TypedSexp};
+use savvy::{ListSexp, NumericScalar, NumericSexp, NumericTypedSexp, Sexp, TypedSexp};
 pub mod base_date;
-mod chunked_array;
+pub mod chunked_array;
 pub mod clock;
 pub mod data_table;
 
@@ -31,6 +31,25 @@ impl<T> From<T> for Wrap<T> {
     }
 }
 
+impl TryFrom<Sexp> for Wrap<AnyValue<'_>> {
+    type Error = String;
+    fn try_from(obj: Sexp) -> Result<Self, String> {
+        let typed = obj.into_typed();
+        let out = match typed {
+            TypedSexp::Integer(x) => AnyValue::Int64(*(x.to_vec().get(0).unwrap()) as i64),
+            TypedSexp::Real(x) => AnyValue::Float64(*(x.to_vec().get(0).unwrap())),
+            TypedSexp::Logical(x) => AnyValue::Boolean(*(x.to_vec().get(0).unwrap())),
+            TypedSexp::String(x) => {
+                let val = x.to_vec();
+                AnyValue::StringOwned((*val.get(0).unwrap()).into())
+            }
+            TypedSexp::Null(_) => AnyValue::Null,
+            _ => return Err(format!("Cannot cast to AnyValue")),
+        };
+        Ok(Wrap(out))
+    }
+}
+
 impl TryFrom<&str> for PlRDataType {
     type Error = String;
 
@@ -83,6 +102,20 @@ impl From<ListSexp> for Wrap<Vec<Option<Vec<u8>>>> {
     }
 }
 
+impl TryFrom<&str> for Wrap<u8> {
+    type Error = String;
+
+    fn try_from(string: &str) -> Result<Self, String> {
+        let mut utf8_byte_iter = string.as_bytes().iter();
+        match (utf8_byte_iter.next(), utf8_byte_iter.next()) {
+            (Some(s), None) => Ok(Wrap(*s)),
+            (None, None) => Err(format!("cannot extract single byte from empty string")),
+            (Some(_), Some(_)) => Err(format!("multi byte-string not allowed")),
+            (None, Some(_)) => unreachable!("the iter() cannot yield Some after None(depleted)"),
+        }
+    }
+}
+
 impl TryFrom<ListSexp> for Wrap<Vec<DataFrame>> {
     type Error = savvy::Error;
 
@@ -670,16 +703,16 @@ impl TryFrom<&str> for Wrap<QuoteStyle> {
     }
 }
 
-impl TryFrom<&str> for Wrap<u8> {
+impl TryFrom<&str> for Wrap<AsofStrategy> {
     type Error = String;
 
-    fn try_from(string: &str) -> Result<Self, String> {
-        let mut utf8_byte_iter = string.as_bytes().iter();
-        match (utf8_byte_iter.next(), utf8_byte_iter.next()) {
-            (Some(s), None) => Ok(Wrap(*s)),
-            (None, None) => Err(format!("cannot extract single byte from empty string")),
-            (Some(_), Some(_)) => Err(format!("multi byte-string not allowed")),
-            (None, Some(_)) => unreachable!("the iter() cannot yield Some after None(depleted)"),
-        }
+    fn try_from(strategy: &str) -> Result<Self, String> {
+        let parsed = match strategy {
+            "forward" => AsofStrategy::Forward,
+            "backward" => AsofStrategy::Backward,
+            "nearest" => AsofStrategy::Nearest,
+            _ => return Err(format!("unreachable")),
+        };
+        Ok(Wrap(parsed))
     }
 }
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 32df80db..bb0a0af3 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -533,49 +533,56 @@ impl PlRLazyFrame {
         Ok(self.ldf.clone().with_context(contexts).into())
     }
 
-    // fn join_asof(
-    //     &self,
-    //     other: &PlRLazyFrame,
-    //     left_on: &PlRExpr,
-    //     right_on: &PlRExpr,
-    //     left_by: Option<Vec<&str>>,
-    //     right_by: Option<Vec<&str>>,
-    //     allow_parallel: bool,
-    //     force_parallel: bool,
-    //     suffix: String,
-    //     strategy: Wrap<AsofStrategy>,
-    //     tolerance: Option<Wrap<AnyValue<'_>>>,
-    //     tolerance_str: Option<String>,
-    //     coalesce: bool,
-    // ) -> Result<PlRLazyFrame> {
-    //     let coalesce = if coalesce {
-    //         JoinCoalesce::CoalesceColumns
-    //     } else {
-    //         JoinCoalesce::KeepColumns
-    //     };
-    //     let ldf = self.ldf.clone();
-    //     let other = other.ldf;
-    //     let left_on = left_on.inner;
-    //     let right_on = right_on.inner;
-    //     Ok(ldf
-    //         .join_builder()
-    //         .with(other)
-    //         .left_on([left_on])
-    //         .right_on([right_on])
-    //         .allow_parallel(allow_parallel)
-    //         .force_parallel(force_parallel)
-    //         .coalesce(coalesce)
-    //         .how(JoinType::AsOf(AsOfOptions {
-    //             strategy: strategy.0,
-    //             left_by: left_by.map(strings_to_pl_smallstr),
-    //             right_by: right_by.map(strings_to_pl_smallstr),
-    //             tolerance: tolerance.map(|t| t.0.into_static()),
-    //             tolerance_str: tolerance_str.map(|s| s.into()),
-    //         }))
-    //         .suffix(suffix)
-    //         .finish()
-    //         .into())
-    // }
+    fn join_asof(
+        &self,
+        other: &PlRLazyFrame,
+        left_on: &PlRExpr,
+        right_on: &PlRExpr,
+        allow_parallel: bool,
+        force_parallel: bool,
+        suffix: &str,
+        coalesce: bool,
+        strategy: &str,
+        left_by: Option<StringSexp>,
+        right_by: Option<StringSexp>,
+        tolerance: Option<Sexp>,
+        tolerance_str: Option<&str>,
+    ) -> Result<PlRLazyFrame> {
+        let coalesce = if coalesce {
+            JoinCoalesce::CoalesceColumns
+        } else {
+            JoinCoalesce::KeepColumns
+        };
+        let strategy = <Wrap<AsofStrategy>>::try_from(strategy)?.0;
+        let ldf = self.ldf.clone();
+        let other = other.ldf.clone();
+        let left_on = left_on.inner.clone();
+        let right_on = right_on.inner.clone();
+        let left_by = left_by.map(|x| x.to_vec().into_iter().map(|y| y.into()).collect());
+        let right_by = right_by.map(|x| x.to_vec().into_iter().map(|y| y.into()).collect());
+        let tolerance = match tolerance {
+            Some(x) => Some(<Wrap<AnyValue<'_>>>::try_from(x)?.0),
+            None => None,
+        };
+        Ok(ldf
+            .join_builder()
+            .with(other)
+            .left_on([left_on])
+            .right_on([right_on])
+            .allow_parallel(allow_parallel)
+            .force_parallel(force_parallel)
+            .coalesce(coalesce)
+            .how(JoinType::AsOf(AsOfOptions {
+                strategy,
+                left_by,
+                right_by,
+                tolerance: tolerance.map(|t| t.into_static()),
+                tolerance_str: tolerance_str.map(|s| s.into()),
+            }))
+            .suffix(suffix)
+            .finish()
+            .into())
+    }
 
     fn join(
         &self,

From b98861fb301b0f0b455a021662cfc7ac9500e339 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 21 Nov 2024 15:29:02 +0100
Subject: [PATCH 19/71] remove duplicated fun

---
 R/check_polars.R | 29 -----------------------------
 1 file changed, 29 deletions(-)

diff --git a/R/check_polars.R b/R/check_polars.R
index 9cd32a8f..aade5ebb 100644
--- a/R/check_polars.R
+++ b/R/check_polars.R
@@ -288,32 +288,3 @@ check_list_of_polars_dtype <- function(
     call = call
   )
 }
-
-check_date_or_datetime <- function(
-    x,
-    ...,
-    allow_null = FALSE,
-    arg = caller_arg(x),
-    call = caller_env()) {
-  if (!missing(x)) {
-    if (inherits(x, c("Date", "POSIXct", "POSIXlt", "polars_expr"))) {
-      return(invisible(NULL))
-    }
-    if (allow_null && is_null(x)) {
-      return(invisible(NULL))
-    }
-    if (is_character(x)) {
-      return(invisible(NULL))
-    }
-  }
-
-  stop_input_type(
-    x,
-    "a Date, POSIXct, character, or Polars expression",
-    ...,
-    allow_na = FALSE,
-    allow_null = allow_null,
-    arg = arg,
-    call = call
-  )
-}

From 0ac744c279103a41c8a997ea0e3fe560e23af044 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 21 Nov 2024 15:32:39 +0100
Subject: [PATCH 20/71] lint

---
 src/rust/src/conversion/mod.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index dea2783c..5f86a2a4 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -37,12 +37,12 @@ impl TryFrom<Sexp> for Wrap<AnyValue<'_>> {
     fn try_from(obj: Sexp) -> Result<Self, String> {
         let typed = obj.into_typed();
         let out = match typed {
-            TypedSexp::Integer(x) => AnyValue::Int64(*(x.to_vec().get(0).unwrap()) as i64),
-            TypedSexp::Real(x) => AnyValue::Float64(*(x.to_vec().get(0).unwrap())),
-            TypedSexp::Logical(x) => AnyValue::Boolean(*(x.to_vec().get(0).unwrap())),
+            TypedSexp::Integer(x) => AnyValue::Int64(*(x.to_vec().first().unwrap()) as i64),
+            TypedSexp::Real(x) => AnyValue::Float64(*(x.to_vec().first().unwrap())),
+            TypedSexp::Logical(x) => AnyValue::Boolean(*(x.to_vec().first().unwrap())),
             TypedSexp::String(x) => {
                 let val = x.to_vec();
-                AnyValue::StringOwned((*val.get(0).unwrap()).into())
+                AnyValue::StringOwned((*val.first().unwrap()).into())
             }
             TypedSexp::Null(_) => AnyValue::Null,
             _ => return Err(format!("Cannot cast to AnyValue")),
@@ -691,7 +691,7 @@ impl TryFrom<ListSexp> for Wrap<StatisticsOptions> {
                 let value = match value {
                     TypedSexp::Logical(val) => {
                         let tmp = val.to_vec();
-                        *tmp.get(0).unwrap()
+                        *tmp.first().unwrap()
                     }
                     _ => unreachable!(),
                 };

From 7a87e3deebc4977ebefe77f933d57d00c324467f Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 21 Nov 2024 16:34:09 +0100
Subject: [PATCH 21/71] revamp tests tools

---
 tests/testthat/helper-expections.R    |  93 ++++--------
 tests/testthat/setup.R                |  22 +++
 tests/testthat/test-dataframe-frame.R |  59 --------
 tests/testthat/test-frame-lazy.R      | 206 ++++++++++++++++++++++++++
 tests/testthat/test-frame.R           | 198 +++++++++++++++++++++++++
 tests/testthat/test-lazyframe-frame.R | 134 -----------------
 6 files changed, 453 insertions(+), 259 deletions(-)
 create mode 100644 tests/testthat/setup.R
 delete mode 100644 tests/testthat/test-dataframe-frame.R
 create mode 100644 tests/testthat/test-frame-lazy.R
 create mode 100644 tests/testthat/test-frame.R
 delete mode 100644 tests/testthat/test-lazyframe-frame.R

diff --git a/tests/testthat/helper-expections.R b/tests/testthat/helper-expections.R
index 105d59bd..44b84bfd 100644
--- a/tests/testthat/helper-expections.R
+++ b/tests/testthat/helper-expections.R
@@ -1,72 +1,33 @@
-#' Compare the query result with LazyFrame and DataFrame
-#'
-#' Inspired by `compare_dplyr_binding` of the arrow package.
-#' @param object A polars query, must be started with `.input`.
-#' See the examples for details.
-#' @param input R object will be converted to a DataFrame or LazyFrame
-#' by `as_polars_df` or `as_polars_lf`.
-#' @param expected A polars DataFrame, the expected result of the query.
-#' @examples
-#' expect_query_equal(
-#'   .input$select("foo"),
-#'   pl$DataFrame(foo = NULL, bar = NULL),
-#'   pl$DataFrame(foo = NULL)
-#' )
-#' @noRd
-expect_query_equal <- function(object, input, expected) {
-  query <- rlang::enquo(object)
-  out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect()
-  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input))))
-
-  expect_equal(out_lazy, expected)
-  expect_equal(out_eager, expected)
-
-  invisible(NULL)
+### Helper to detect whether an expectation runs in a test where data is in
+### lazy mode.
+is_in_lazy_test <- function() {
+  nzchar(Sys.getenv("POLARS_IN_LAZY_TEST")) && Sys.getenv("POLARS_IN_LAZY_TEST") == "TRUE"
 }
 
-#' Compare the query error with LazyFrame and DataFrame
-#'
-#' Same as `expect_query_equal()`, but for `expect_error()`.
-#' @param object A polars query, must be started with `.input`.
-#' See `expect_query_equal()` for details.
-#' @param input R object will be converted to a DataFrame or LazyFrame
-#' by `as_polars_df()` or `as_polars_lf()`.
-#' @param regexp passed to `expect_error()`.
-#' @param class passed to `expect_error()`.
-#' @param ... passed to `expect_error()`.
-#' @noRd
-expect_query_error <- function(object, input, regexp = NULL, class = NULL, ...) {
-  query <- rlang::enquo(object)
-  expect_error(
-    rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect(),
-    regexp = regexp,
-    class = class,
-    ...
-  )
-  expect_error(
-    rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input)))),
-    regexp = regexp,
-    class = class,
-    ...
-  )
-
-  invisible(NULL)
+### Those helpers are equivalent to their counterparts without the "_lazy"
+### suffix but they run on lazyframes
+###
+### They shouldn't be used manually. Instead they are automatically inserted in
+### some test files by the code in setup.R.
+expect_equal_lazy <- function(x, y, ...) {
+  if (inherits(x, "polars_lazy_frame")) {
+    x <- x$collect()
+  }
+  if (inherits(y, "polars_lazy_frame")) {
+    y <- y$collect()
+  }
+  dots <- list2(...)
+  if (isTRUE(dots$skip_for_lazy)) {
+    skip("Test skipped for LazyFrame")
+    return(invisible())
+  }
+  expect_equal(x, y, ...)
 }
 
-#' Mix of `expect_query_equal()` and `expect_query_error()`
-#'
-#' The query only succeeds for DataFrame, but fails for LazyFrame.
-expect_eager_equal_lazy_error <- function(object, input, expected, regexp = NULL, class = NULL, ...) {
-  query <- rlang::enquo(object)
-  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input))))
-
-  expect_equal(out_eager, expected)
-  expect_error(
-    rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect(),
-    regexp = regexp,
-    class = class,
-    ...
-  )
+expect_error_lazy <- function(current, pattern = ".*", ...) {
+  expect_error(current$collect(), pattern, ...)
+}
 
-  invisible(NULL)
+expect_snapshot_lazy <- function(current, ...) {
+  expect_snapshot(current$collect(), ...)
 }
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
new file mode 100644
index 00000000..a3c62f44
--- /dev/null
+++ b/tests/testthat/setup.R
@@ -0,0 +1,22 @@
+Sys.setenv(POLARS_IN_LAZY_TEST = FALSE)
+
+to_duplicate <- test_path("test-frame.R")
+
+for (i in to_duplicate) {
+  print(i)
+  tmp <- readLines(i)
+  out <- gsub("pl\\$DataFrame", "pl\\$LazyFrame", tmp)
+  out <- gsub("as_polars_df\\(", "as_polars_lf(", out)
+  out <- gsub("expect_equal\\(", "expect_equal_lazy(", out)
+  out <- gsub("expect_error\\(", "expect_error_lazy(", out)
+  out <- gsub("expect_snapshot", "expect_snapshot_lazy", out)
+  out <- paste0(
+    "#############################################\n",
+    "### [GENERATED AUTOMATICALLY] Update ", i, " instead.\n",
+    "#############################################\n\n",
+    "Sys.setenv(POLARS_IN_LAZY_TEST = TRUE)\n\n",
+    paste(out, collapse = "\n"),
+    "\n\nSys.setenv(POLARS_IN_LAZY_TEST = FALSE)"
+  )
+  cat(out, file = gsub("\\.R$", "-lazy\\.R", i))
+}
diff --git a/tests/testthat/test-dataframe-frame.R b/tests/testthat/test-dataframe-frame.R
deleted file mode 100644
index feba824f..00000000
--- a/tests/testthat/test-dataframe-frame.R
+++ /dev/null
@@ -1,59 +0,0 @@
-patrick::with_parameters_test_that(
-  "use pl$DataFrame() to construct a DataFrame",
-  .cases = {
-    tibble::tribble(
-      ~.test_name, ~object, ~expected,
-      "simple", pl$DataFrame(a = 1, b = list("b"), ), as_polars_df(list(a = 1, b = list("b"))),
-      "!!! for list", pl$DataFrame(!!!list(a = 1, b = list("b")), c = 1), as_polars_df(list(a = 1, b = list("b"), c = 1)),
-      "!!! for data.frame", pl$DataFrame(!!!data.frame(a = 1, b = "b"), c = 1), as_polars_df(list(a = 1, b = "b", c = 1)),
-      "empty", pl$DataFrame(), as_polars_df(list()),
-    )
-  },
-  code = {
-    expect_equal(object, expected)
-  }
-)
-
-test_that("pl$DataFrame() requires series the same length", {
-  expect_error(pl$DataFrame(a = 1:2, b = "foo"), "has length 2")
-})
-
-test_that("pl$DataFrame() rejects expressions", {
-  expect_error(
-    pl$DataFrame(a = 1:2, b = pl$lit("foo")),
-    r"(Try evaluating the expression first using `pl\$select\(\)`)"
-  )
-})
-
-test_that("to_struct()", {
-  expect_equal(
-    as_polars_df(mtcars)$to_struct("foo"),
-    as_polars_series(mtcars, "foo")
-  )
-})
-
-test_that("get_columns()", {
-  expect_equal(
-    pl$DataFrame(a = 1:2, b = c("foo", "bar"))$get_columns(),
-    list(
-      a = as_polars_series(1:2, "a"),
-      b = as_polars_series(c("foo", "bar"), "b")
-    )
-  )
-})
-
-test_that("to_series()", {
-  data <- data.frame(
-    a = 1:2,
-    b = c("foo", "bar")
-  )
-
-  expect_equal(
-    as_polars_df(data)$to_series(),
-    as_polars_series(data$a, "a")
-  )
-  expect_equal(
-    as_polars_df(data)$to_series(1),
-    as_polars_series(data$b, "b")
-  )
-})
diff --git a/tests/testthat/test-frame-lazy.R b/tests/testthat/test-frame-lazy.R
new file mode 100644
index 00000000..013405de
--- /dev/null
+++ b/tests/testthat/test-frame-lazy.R
@@ -0,0 +1,206 @@
+#############################################
+### [GENERATED AUTOMATICALLY] Update test-frame.R instead.
+#############################################
+
+Sys.setenv(POLARS_IN_LAZY_TEST = TRUE)
+
+patrick::with_parameters_test_that(
+  "use pl$LazyFrame() to construct a DataFrame",
+  .cases = {
+    tibble::tribble(
+      ~.test_name, ~object, ~expected,
+      "simple", pl$LazyFrame(a = 1, b = list("b"), ), as_polars_lf(list(a = 1, b = list("b"))),
+      "!!! for list", pl$LazyFrame(!!!list(a = 1, b = list("b")), c = 1), as_polars_lf(list(a = 1, b = list("b"), c = 1)),
+      "!!! for data.frame", pl$LazyFrame(!!!data.frame(a = 1, b = "b"), c = 1), as_polars_lf(list(a = 1, b = "b", c = 1)),
+      "empty", pl$LazyFrame(), as_polars_lf(list()),
+    )
+  },
+  code = {
+    expect_equal_lazy(object, expected)
+  }
+)
+
+test_that("pl$LazyFrame() requires series the same length", {
+  expect_error_lazy(pl$LazyFrame(a = 1:2, b = "foo"), "has length 2")
+})
+
+test_that("pl$LazyFrame() rejects expressions", {
+  expect_error_lazy(
+    pl$LazyFrame(a = 1:2, b = pl$lit("foo")),
+    r"(Try evaluating the expression first using `pl\$select\(\)`)"
+  )
+})
+
+test_that("to_struct()", {
+  skip_if(is_in_lazy_test())
+  expect_equal_lazy(
+    as_polars_lf(mtcars)$to_struct("foo"),
+    as_polars_series(mtcars, "foo")
+  )
+})
+
+test_that("get_columns()", {
+  skip_if(is_in_lazy_test())
+  expect_equal_lazy(
+    pl$LazyFrame(a = 1:2, b = c("foo", "bar"))$get_columns(),
+    list(
+      a = as_polars_series(1:2, "a"),
+      b = as_polars_series(c("foo", "bar"), "b")
+    )
+  )
+})
+
+test_that("to_series()", {
+  skip_if(is_in_lazy_test())
+  data <- data.frame(
+    a = 1:2,
+    b = c("foo", "bar")
+  )
+
+  expect_equal_lazy(
+    as_polars_lf(data)$to_series(),
+    as_polars_series(data$a, "a")
+  )
+  expect_equal_lazy(
+    as_polars_lf(data)$to_series(1),
+    as_polars_series(data$b, "b")
+  )
+})
+
+test_that("select works lazy/eager", {
+  .data <- pl$LazyFrame(
+    int32 = 1:5,
+    int64 = as_polars_series(1:5)$cast(pl$Int64),
+    string = letters[1:5],
+  )
+
+  expect_equal_lazy(
+    .data$select("int32"),
+    pl$LazyFrame(int32 = 1:5)
+  )
+  expect_equal_lazy(
+    .data$select(pl$lit("int32")),
+    pl$LazyFrame(literal = "int32")
+  )
+  expect_equal_lazy(
+    .data$select(foo = "int32"),
+    pl$LazyFrame(foo = 1:5)
+  )
+})
+
+test_that("POLARS_AUTO_STRUCTIFY works for select", {
+  .data <- pl$LazyFrame(
+    foo = 1:3,
+    bar = 6:8,
+    ham = letters[1:3],
+  )
+
+  withr::with_envvar(
+    c(POLARS_AUTO_STRUCTIFY = "foo"),
+    {
+      expect_error_lazy(
+        .data$select(1),
+        r"(Environment variable `POLARS_AUTO_STRUCTIFY` must be one of \('0', '1'\), got 'foo')"
+      )
+    }
+  )
+
+  withr::with_envvar(
+    c(POLARS_AUTO_STRUCTIFY = "0"),
+    {
+      expect_error_lazy(
+        .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
+        "`keep`, `suffix`, `prefix` should be last expression"
+      )
+
+      expect_equal_lazy(
+        withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
+          .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd"))
+        }),
+        as_polars_lf(.data)$select(
+          is_odd = pl$struct(((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
+        )$collect()
+      )
+    }
+  )
+})
+
+test_that("slice/head/tail work", {
+  .data <- pl$LazyFrame(
+    foo = 1:5,
+    bar = 6:10,
+  )
+
+  # slice
+  expect_equal_lazy(
+    .data$slice(1),
+    pl$LazyFrame(foo = 2:5, bar = 7:10)
+  )
+  expect_equal_lazy(
+    .data$slice(1, 2),
+    pl$LazyFrame(foo = 2:3, bar = 7:8)
+  )
+  expect_equal_lazy(
+    .data$slice(1, 2),
+    pl$LazyFrame(foo = 2:3, bar = 7:8)
+  )
+  expect_equal_lazy(
+    .data$slice(4, 100),
+    pl$LazyFrame(foo = 5L, bar = 10L)
+  )
+  if (is_in_lazy_test()) {
+    expect_error_lazy(
+      .data$slice(0, -2),
+      r"(-2.0 is out of range that can be safely converted to u32)"
+    )
+  } else {
+    expect_equal_lazy(
+      .data$slice(0, -2),
+      pl$LazyFrame(foo = 1:3, bar = 6:8)
+    )
+  }
+
+  # head
+  expect_equal_lazy(
+    .data$head(1),
+    pl$LazyFrame(foo = 1L, bar = 6L)
+  )
+  expect_equal_lazy(
+    .data$head(100),
+    .data
+  )
+  if (is_in_lazy_test()) {
+    expect_error_lazy(
+      .data$head(-4),
+      r"(-4.0 is out of range that can be safely converted to u32)"
+    )
+  } else {
+    expect_equal_lazy(
+      .data$head(-4),
+      pl$LazyFrame(foo = 1L, bar = 6L)
+    )
+  }
+
+  # tail
+  expect_equal_lazy(
+    .data$tail(1),
+    pl$LazyFrame(foo = 5L, bar = 10L)
+  )
+  expect_equal_lazy(
+    .data$tail(100),
+    .data
+  )
+  if (is_in_lazy_test()) {
+    expect_error_lazy(
+      .data$tail(-4),
+      r"(-4\.0 is out of range that can be safely converted to u32)"
+    )
+  } else {
+    expect_equal_lazy(
+      .data$tail(-4),
+      pl$LazyFrame(foo = 5L, bar = 10L)
+    )
+  }
+})
+
+Sys.setenv(POLARS_IN_LAZY_TEST = FALSE)
\ No newline at end of file
diff --git a/tests/testthat/test-frame.R b/tests/testthat/test-frame.R
new file mode 100644
index 00000000..7c6f1d1a
--- /dev/null
+++ b/tests/testthat/test-frame.R
@@ -0,0 +1,198 @@
+patrick::with_parameters_test_that(
+  "use pl$DataFrame() to construct a DataFrame",
+  .cases = {
+    tibble::tribble(
+      ~.test_name, ~object, ~expected,
+      "simple", pl$DataFrame(a = 1, b = list("b"), ), as_polars_df(list(a = 1, b = list("b"))),
+      "!!! for list", pl$DataFrame(!!!list(a = 1, b = list("b")), c = 1), as_polars_df(list(a = 1, b = list("b"), c = 1)),
+      "!!! for data.frame", pl$DataFrame(!!!data.frame(a = 1, b = "b"), c = 1), as_polars_df(list(a = 1, b = "b", c = 1)),
+      "empty", pl$DataFrame(), as_polars_df(list()),
+    )
+  },
+  code = {
+    expect_equal(object, expected)
+  }
+)
+
+test_that("pl$DataFrame() requires series the same length", {
+  expect_error(pl$DataFrame(a = 1:2, b = "foo"), "has length 2")
+})
+
+test_that("pl$DataFrame() rejects expressions", {
+  expect_error(
+    pl$DataFrame(a = 1:2, b = pl$lit("foo")),
+    r"(Try evaluating the expression first using `pl\$select\(\)`)"
+  )
+})
+
+test_that("to_struct()", {
+  skip_if(is_in_lazy_test())
+  expect_equal(
+    as_polars_df(mtcars)$to_struct("foo"),
+    as_polars_series(mtcars, "foo")
+  )
+})
+
+test_that("get_columns()", {
+  skip_if(is_in_lazy_test())
+  expect_equal(
+    pl$DataFrame(a = 1:2, b = c("foo", "bar"))$get_columns(),
+    list(
+      a = as_polars_series(1:2, "a"),
+      b = as_polars_series(c("foo", "bar"), "b")
+    )
+  )
+})
+
+test_that("to_series()", {
+  skip_if(is_in_lazy_test())
+  data <- data.frame(
+    a = 1:2,
+    b = c("foo", "bar")
+  )
+
+  expect_equal(
+    as_polars_df(data)$to_series(),
+    as_polars_series(data$a, "a")
+  )
+  expect_equal(
+    as_polars_df(data)$to_series(1),
+    as_polars_series(data$b, "b")
+  )
+})
+
+test_that("select works lazy/eager", {
+  .data <- pl$DataFrame(
+    int32 = 1:5,
+    int64 = as_polars_series(1:5)$cast(pl$Int64),
+    string = letters[1:5],
+  )
+
+  expect_equal(
+    .data$select("int32"),
+    pl$DataFrame(int32 = 1:5)
+  )
+  expect_equal(
+    .data$select(pl$lit("int32")),
+    pl$DataFrame(literal = "int32")
+  )
+  expect_equal(
+    .data$select(foo = "int32"),
+    pl$DataFrame(foo = 1:5)
+  )
+})
+
+test_that("POLARS_AUTO_STRUCTIFY works for select", {
+  .data <- pl$DataFrame(
+    foo = 1:3,
+    bar = 6:8,
+    ham = letters[1:3],
+  )
+
+  withr::with_envvar(
+    c(POLARS_AUTO_STRUCTIFY = "foo"),
+    {
+      expect_error(
+        .data$select(1),
+        r"(Environment variable `POLARS_AUTO_STRUCTIFY` must be one of \('0', '1'\), got 'foo')"
+      )
+    }
+  )
+
+  withr::with_envvar(
+    c(POLARS_AUTO_STRUCTIFY = "0"),
+    {
+      expect_error(
+        .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
+        "`keep`, `suffix`, `prefix` should be last expression"
+      )
+
+      expect_equal(
+        withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
+          .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd"))
+        }),
+        as_polars_lf(.data)$select(
+          is_odd = pl$struct(((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
+        )$collect()
+      )
+    }
+  )
+})
+
+test_that("slice/head/tail work", {
+  .data <- pl$DataFrame(
+    foo = 1:5,
+    bar = 6:10,
+  )
+
+  # slice
+  expect_equal(
+    .data$slice(1),
+    pl$DataFrame(foo = 2:5, bar = 7:10)
+  )
+  expect_equal(
+    .data$slice(1, 2),
+    pl$DataFrame(foo = 2:3, bar = 7:8)
+  )
+  expect_equal(
+    .data$slice(1, 2),
+    pl$DataFrame(foo = 2:3, bar = 7:8)
+  )
+  expect_equal(
+    .data$slice(4, 100),
+    pl$DataFrame(foo = 5L, bar = 10L)
+  )
+  if (is_in_lazy_test()) {
+    expect_error(
+      .data$slice(0, -2),
+      r"(-2.0 is out of range that can be safely converted to u32)"
+    )
+  } else {
+    expect_equal(
+      .data$slice(0, -2),
+      pl$DataFrame(foo = 1:3, bar = 6:8)
+    )
+  }
+
+  # head
+  expect_equal(
+    .data$head(1),
+    pl$DataFrame(foo = 1L, bar = 6L)
+  )
+  expect_equal(
+    .data$head(100),
+    .data
+  )
+  if (is_in_lazy_test()) {
+    expect_error(
+      .data$head(-4),
+      r"(-4.0 is out of range that can be safely converted to u32)"
+    )
+  } else {
+    expect_equal(
+      .data$head(-4),
+      pl$DataFrame(foo = 1L, bar = 6L)
+    )
+  }
+
+  # tail
+  expect_equal(
+    .data$tail(1),
+    pl$DataFrame(foo = 5L, bar = 10L)
+  )
+  expect_equal(
+    .data$tail(100),
+    .data
+  )
+  if (is_in_lazy_test()) {
+    expect_error(
+      .data$tail(-4),
+      r"(-4\.0 is out of range that can be safely converted to u32)"
+    )
+  } else {
+    expect_equal(
+      .data$tail(-4),
+      pl$DataFrame(foo = 5L, bar = 10L)
+    )
+  }
+})
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
deleted file mode 100644
index 147dd624..00000000
--- a/tests/testthat/test-lazyframe-frame.R
+++ /dev/null
@@ -1,134 +0,0 @@
-test_that("select works lazy/eager", {
-  .data <- pl$DataFrame(
-    int32 = 1:5,
-    int64 = as_polars_series(1:5)$cast(pl$Int64),
-    string = letters[1:5],
-  )
-
-  expect_query_equal(
-    .input$select("int32"),
-    .data,
-    pl$DataFrame(int32 = 1:5)
-  )
-  expect_query_equal(
-    .input$select(pl$lit("int32")),
-    .data,
-    pl$DataFrame(literal = "int32")
-  )
-  expect_query_equal(
-    .input$select(foo = "int32"),
-    .data,
-    pl$DataFrame(foo = 1:5)
-  )
-})
-
-test_that("POLARS_AUTO_STRUCTIFY works for select", {
-  .data <- pl$DataFrame(
-    foo = 1:3,
-    bar = 6:8,
-    ham = letters[1:3],
-  )
-
-  withr::with_envvar(
-    c(POLARS_AUTO_STRUCTIFY = "foo"),
-    {
-      expect_query_error(
-        .input$select(1),
-        .data,
-        r"(Environment variable `POLARS_AUTO_STRUCTIFY` must be one of \('0', '1'\), got 'foo')"
-      )
-    }
-  )
-
-  withr::with_envvar(
-    c(POLARS_AUTO_STRUCTIFY = "0"),
-    {
-      expect_query_error(
-        .input$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
-        .data,
-        "`keep`, `suffix`, `prefix` should be last expression"
-      )
-
-      expect_query_equal(
-        withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
-          .input$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd"))
-        }),
-        .data,
-        as_polars_lf(.data)$select(
-          is_odd = pl$struct(((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
-        )$collect()
-      )
-    }
-  )
-})
-
-test_that("slice/head/tail works lazy/eager", {
-  .data <- pl$DataFrame(
-    foo = 1:5,
-    bar = 6:10,
-  )
-
-  # slice
-  expect_query_equal(
-    .input$slice(1),
-    .data,
-    pl$DataFrame(foo = 2:5, bar = 7:10)
-  )
-  expect_query_equal(
-    .input$slice(1, 2),
-    .data,
-    pl$DataFrame(foo = 2:3, bar = 7:8)
-  )
-  expect_query_equal(
-    .input$slice(1, 2),
-    .data,
-    pl$DataFrame(foo = 2:3, bar = 7:8)
-  )
-  expect_query_equal(
-    .input$slice(4, 100),
-    .data,
-    pl$DataFrame(foo = 5L, bar = 10L)
-  )
-  expect_eager_equal_lazy_error(
-    .input$slice(0, -2),
-    .data,
-    pl$DataFrame(foo = 1:3, bar = 6:8),
-    r"(negative slice length \(-2\) are invalid for LazyFrame)"
-  )
-
-  # head
-  expect_query_equal(
-    .input$head(1),
-    .data,
-    pl$DataFrame(foo = 1L, bar = 6L)
-  )
-  expect_query_equal(
-    .input$head(100),
-    .data,
-    .data
-  )
-  expect_eager_equal_lazy_error(
-    .input$head(-4),
-    .data,
-    pl$DataFrame(foo = 1L, bar = 6L),
-    r"(negative slice length \(-4\) are invalid for LazyFrame)"
-  )
-
-  # tail
-  expect_query_equal(
-    .input$tail(1),
-    .data,
-    pl$DataFrame(foo = 5L, bar = 10L)
-  )
-  expect_query_equal(
-    .input$tail(100),
-    .data,
-    .data
-  )
-  expect_eager_equal_lazy_error(
-    .input$tail(-4),
-    .data,
-    pl$DataFrame(foo = 5L, bar = 10L),
-    r"(-4\.0 is out of range that can be safely converted to u32)"
-  )
-})

From 0b3f944618d0dc2bcd1c32f141e7cb0a5a8a56ee Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 21 Nov 2024 16:42:37 +0100
Subject: [PATCH 22/71] minor

---
 tests/testthat/helper-expections.R | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/testthat/helper-expections.R b/tests/testthat/helper-expections.R
index 44b84bfd..d6a71945 100644
--- a/tests/testthat/helper-expections.R
+++ b/tests/testthat/helper-expections.R
@@ -16,11 +16,6 @@ expect_equal_lazy <- function(x, y, ...) {
   if (inherits(y, "polars_lazy_frame")) {
     y <- y$collect()
   }
-  dots <- list2(...)
-  if (isTRUE(dots$skip_for_lazy)) {
-    skip("Test skipped for LazyFrame")
-    return(invisible())
-  }
   expect_equal(x, y, ...)
 }
 

From a530ef5f9a4fcfd1836e66e0befe8bfc4a845c87 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 21 Nov 2024 16:43:19 +0100
Subject: [PATCH 23/71] typo

---
 tests/testthat/setup.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index a3c62f44..675817d8 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -3,7 +3,6 @@ Sys.setenv(POLARS_IN_LAZY_TEST = FALSE)
 to_duplicate <- test_path("test-frame.R")
 
 for (i in to_duplicate) {
-  print(i)
   tmp <- readLines(i)
   out <- gsub("pl\\$DataFrame", "pl\\$LazyFrame", tmp)
   out <- gsub("as_polars_df\\(", "as_polars_lf(", out)

From ae09900165341fa3d65e8a48adaa3ec1122381d1 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Fri, 22 Nov 2024 15:00:33 +0100
Subject: [PATCH 24/71] add unnest for dataframe

---
 R/000-wrappers.R                  |  7 ++++
 R/dataframe-frame.R               | 23 +++++++++++++
 src/init.c                        |  6 ++++
 src/rust/api.h                    |  1 +
 src/rust/src/dataframe/general.rs | 10 ++++++
 tests/testthat/test-frame.R       | 57 +++++++++++++++++++++++++++++++
 6 files changed, 104 insertions(+)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index ef16e717..e1ba1cab 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -475,6 +475,12 @@ class(`PlRChainedWhen`) <- c("PlRChainedWhen__bundle", "savvy_neopolars__sealed"
   }
 }
 
+`PlRDataFrame_unnest` <- function(self) {
+  function(`columns`) {
+    .savvy_wrap_PlRDataFrame(.Call(savvy_PlRDataFrame_unnest__impl, `self`, `columns`))
+  }
+}
+
 `.savvy_wrap_PlRDataFrame` <- function(ptr) {
   e <- new.env(parent = emptyenv())
   e$.ptr <- ptr
@@ -496,6 +502,7 @@ class(`PlRChainedWhen`) <- c("PlRChainedWhen__bundle", "savvy_neopolars__sealed"
   e$`to_struct` <- `PlRDataFrame_to_struct`(ptr)
   e$`n_chunks` <- `PlRDataFrame_n_chunks`(ptr)
   e$`rechunk` <- `PlRDataFrame_rechunk`(ptr)
+  e$`unnest` <- `PlRDataFrame_unnest`(ptr)
 
   class(e) <- c("PlRDataFrame", "savvy_neopolars__sealed")
   e
diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R
index d76e6027..3994cfcf 100644
--- a/R/dataframe-frame.R
+++ b/R/dataframe-frame.R
@@ -539,3 +539,26 @@ dataframe__rechunk <- function() {
     self$`_df`$rechunk()
   })
 }
+
+#' @inherit lazyframe__unnest title description params
+#' @inherit as_polars_df return
+#' @examples
+#' df <- pl$DataFrame(
+#'   a = 1:5,
+#'   b = c("one", "two", "three", "four", "five"),
+#'   c = 6:10
+#' )$
+#'   select(
+#'   pl$struct("b"),
+#'   a_and_c = pl$struct(c("a", "c"))
+#' )
+#' df
+#'
+#' df$unnest("a_and_c")
+dataframe__unnest <- function(...) {
+  wrap({
+    columns <- unlist(list2(...))
+    check_character(columns, allow_na = FALSE)
+    self$`_df`$unnest(columns)
+  })
+}
diff --git a/src/init.c b/src/init.c
index df2b47b3..8a8c7e83 100644
--- a/src/init.c
+++ b/src/init.c
@@ -359,6 +359,11 @@ SEXP savvy_PlRDataFrame_rechunk__impl(SEXP self__) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRDataFrame_unnest__impl(SEXP self__, SEXP c_arg__columns) {
+    SEXP res = savvy_PlRDataFrame_unnest__ffi(self__, c_arg__columns);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRDataType_new_from_name__impl(SEXP c_arg__name) {
     SEXP res = savvy_PlRDataType_new_from_name__ffi(c_arg__name);
     return handle_result(res);
@@ -2116,6 +2121,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRDataFrame_to_struct__impl", (DL_FUNC) &savvy_PlRDataFrame_to_struct__impl, 2},
     {"savvy_PlRDataFrame_n_chunks__impl", (DL_FUNC) &savvy_PlRDataFrame_n_chunks__impl, 1},
     {"savvy_PlRDataFrame_rechunk__impl", (DL_FUNC) &savvy_PlRDataFrame_rechunk__impl, 1},
+    {"savvy_PlRDataFrame_unnest__impl", (DL_FUNC) &savvy_PlRDataFrame_unnest__impl, 2},
     {"savvy_PlRDataType_new_from_name__impl", (DL_FUNC) &savvy_PlRDataType_new_from_name__impl, 1},
     {"savvy_PlRDataType_new_decimal__impl", (DL_FUNC) &savvy_PlRDataType_new_decimal__impl, 2},
     {"savvy_PlRDataType_new_datetime__impl", (DL_FUNC) &savvy_PlRDataType_new_datetime__impl, 2},
diff --git a/src/rust/api.h b/src/rust/api.h
index 8fd352cd..65e625a3 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -69,6 +69,7 @@ SEXP savvy_PlRDataFrame_lazy__ffi(SEXP self__);
 SEXP savvy_PlRDataFrame_to_struct__ffi(SEXP self__, SEXP c_arg__name);
 SEXP savvy_PlRDataFrame_n_chunks__ffi(SEXP self__);
 SEXP savvy_PlRDataFrame_rechunk__ffi(SEXP self__);
+SEXP savvy_PlRDataFrame_unnest__ffi(SEXP self__, SEXP c_arg__columns);
 
 // methods and associated functions for PlRDataType
 SEXP savvy_PlRDataType_new_from_name__ffi(SEXP c_arg__name);
diff --git a/src/rust/src/dataframe/general.rs b/src/rust/src/dataframe/general.rs
index 783d8f9f..78435aed 100644
--- a/src/rust/src/dataframe/general.rs
+++ b/src/rust/src/dataframe/general.rs
@@ -160,4 +160,14 @@ impl PlRDataFrame {
         df.as_single_chunk_par();
         Ok(df.into())
     }
+
+    pub fn unnest(&self, columns: StringSexp) -> Result<PlRDataFrame> {
+        let columns = columns.to_vec();
+        Ok(self
+            .df
+            .clone()
+            .unnest(columns)
+            .map_err(RPolarsErr::from)?
+            .into())
+    }
 }
diff --git a/tests/testthat/test-frame.R b/tests/testthat/test-frame.R
index 7c6f1d1a..7a6af638 100644
--- a/tests/testthat/test-frame.R
+++ b/tests/testthat/test-frame.R
@@ -196,3 +196,60 @@ test_that("slice/head/tail work", {
     )
   }
 })
+
+
+test_that("unnest works correctly", {
+  df <- pl$DataFrame(
+    a = 1:5,
+    b = c("one", "two", "three", "four", "five"),
+    c = 6:10
+  )$
+    select(
+    foo = pl$lit(1),
+    pl$struct("b"),
+    pl$struct(c("a", "c"))$alias("a_and_c")
+  )
+
+  expect_identical(
+    df$unnest("b", "a_and_c"),
+    df$unnest(c("b", "a_and_c"))
+  )
+
+  # wrong input
+  expect_snapshot(
+    df$unnest("b", pl$col("a_and_c")),
+    error = TRUE
+  )
+  expect_snapshot(df$unnest(1), error = TRUE)
+
+  # wrong datatype
+  expect_snapshot(df$unnest("foo"), error = TRUE)
+})
+
+
+make_cases <- function() {
+  tibble::tribble(
+    ~.test_name, ~pola, ~base,
+    "max", "max", max,
+    "mean", "mean", mean,
+    "median", "median", median,
+    "max", "max", max,
+    "min", "min", min,
+    "std", "std", sd,
+    "sum", "sum", sum,
+    "var", "var", var,
+    "first", "first", function(x) head(x, 1),
+    "last", "last", function(x) tail(x, 1)
+  )
+}
+
+patrick::with_parameters_test_that(
+  "simple translations: eager",
+  {
+    browser()
+    a <- as_polars_df(mtcars)[[pola]]()
+    b <- as_polars_df(!!!lapply(mtcars, base))
+    expect_equal(a, b)
+  },
+  .cases = make_cases()
+)

From 6b4cc9c8410bee79efe45a5fba3d7227ded85d28 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 23 Nov 2024 22:42:46 +0100
Subject: [PATCH 25/71] rename files

---
 tests/testthat/_snaps/dataframe-frame.md      | 30 +++++++++
 tests/testthat/setup.R                        | 38 ++++++-----
 .../{test-frame.R => test-dataframe-frame.R}  | 28 --------
 ...st-frame-lazy.R => test-lazyframe-frame.R} | 66 +++++++++++++++++--
 4 files changed, 112 insertions(+), 50 deletions(-)
 create mode 100644 tests/testthat/_snaps/dataframe-frame.md
 rename tests/testthat/{test-frame.R => test-dataframe-frame.R} (89%)
 rename tests/testthat/{test-frame-lazy.R => test-lazyframe-frame.R} (78%)

diff --git a/tests/testthat/_snaps/dataframe-frame.md b/tests/testthat/_snaps/dataframe-frame.md
new file mode 100644
index 00000000..0393bb8a
--- /dev/null
+++ b/tests/testthat/_snaps/dataframe-frame.md
@@ -0,0 +1,30 @@
+# unnest works correctly
+
+    Code
+      df$unnest("b", pl$col("a_and_c"))
+    Condition
+      Error in `df$unnest()`:
+      ! Evaluation failed in `$unnest()`.
+      Caused by error in `df$unnest()`:
+      ! `columns` must be a character vector, not a list.
+
+---
+
+    Code
+      df$unnest(1)
+    Condition
+      Error in `df$unnest()`:
+      ! Evaluation failed in `$unnest()`.
+      Caused by error in `df$unnest()`:
+      ! `columns` must be a character vector, not the number 1.
+
+---
+
+    Code
+      df$unnest("foo")
+    Condition
+      Error in `df$unnest()`:
+      ! Evaluation failed in `$unnest()`.
+      Caused by error:
+      ! invalid series dtype: expected `Struct`, got `f64`
+
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index 675817d8..9aa80877 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -1,21 +1,23 @@
 Sys.setenv(POLARS_IN_LAZY_TEST = FALSE)
 
-to_duplicate <- test_path("test-frame.R")
+orig <- test_path("test-dataframe-frame.R")
+dest <- test_path("test-lazyframe-frame.R")
 
-for (i in to_duplicate) {
-  tmp <- readLines(i)
-  out <- gsub("pl\\$DataFrame", "pl\\$LazyFrame", tmp)
-  out <- gsub("as_polars_df\\(", "as_polars_lf(", out)
-  out <- gsub("expect_equal\\(", "expect_equal_lazy(", out)
-  out <- gsub("expect_error\\(", "expect_error_lazy(", out)
-  out <- gsub("expect_snapshot", "expect_snapshot_lazy", out)
-  out <- paste0(
-    "#############################################\n",
-    "### [GENERATED AUTOMATICALLY] Update ", i, " instead.\n",
-    "#############################################\n\n",
-    "Sys.setenv(POLARS_IN_LAZY_TEST = TRUE)\n\n",
-    paste(out, collapse = "\n"),
-    "\n\nSys.setenv(POLARS_IN_LAZY_TEST = FALSE)"
-  )
-  cat(out, file = gsub("\\.R$", "-lazy\\.R", i))
-}
+tmp <- readLines(orig)
+out <- gsub("pl\\$DataFrame", "pl\\$LazyFrame", tmp)
+out <- gsub("as_polars_df\\(", "as_polars_lf(", out)
+out <- gsub("expect_equal\\(", "expect_equal_lazy(", out)
+out <- gsub("expect_error\\(", "expect_error_lazy(", out)
+out <- gsub("expect_snapshot", "expect_snapshot_lazy", out)
+out <- paste0(
+  "#############################################\n",
+  "### [GENERATED AUTOMATICALLY] Update ", orig, " instead.\n",
+  "#############################################\n\n",
+  "withr::with_envvar(
+    list(POLARS_IN_LAZY_TEST = TRUE),
+    {\n",
+  paste(out, collapse = "\n"),
+  "\n}
+  )"
+)
+cat(out, file = dest)
diff --git a/tests/testthat/test-frame.R b/tests/testthat/test-dataframe-frame.R
similarity index 89%
rename from tests/testthat/test-frame.R
rename to tests/testthat/test-dataframe-frame.R
index 7a6af638..062199f0 100644
--- a/tests/testthat/test-frame.R
+++ b/tests/testthat/test-dataframe-frame.R
@@ -225,31 +225,3 @@ test_that("unnest works correctly", {
   # wrong datatype
   expect_snapshot(df$unnest("foo"), error = TRUE)
 })
-
-
-make_cases <- function() {
-  tibble::tribble(
-    ~.test_name, ~pola, ~base,
-    "max", "max", max,
-    "mean", "mean", mean,
-    "median", "median", median,
-    "max", "max", max,
-    "min", "min", min,
-    "std", "std", sd,
-    "sum", "sum", sum,
-    "var", "var", var,
-    "first", "first", function(x) head(x, 1),
-    "last", "last", function(x) tail(x, 1)
-  )
-}
-
-patrick::with_parameters_test_that(
-  "simple translations: eager",
-  {
-    browser()
-    a <- as_polars_df(mtcars)[[pola]]()
-    b <- as_polars_df(!!!lapply(mtcars, base))
-    expect_equal(a, b)
-  },
-  .cases = make_cases()
-)
diff --git a/tests/testthat/test-frame-lazy.R b/tests/testthat/test-lazyframe-frame.R
similarity index 78%
rename from tests/testthat/test-frame-lazy.R
rename to tests/testthat/test-lazyframe-frame.R
index 013405de..f0304d00 100644
--- a/tests/testthat/test-frame-lazy.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -1,9 +1,10 @@
 #############################################
-### [GENERATED AUTOMATICALLY] Update test-frame.R instead.
+### [GENERATED AUTOMATICALLY] Update test-dataframe-frame.R instead.
 #############################################
 
-Sys.setenv(POLARS_IN_LAZY_TEST = TRUE)
-
+withr::with_envvar(
+    list(POLARS_IN_LAZY_TEST = TRUE),
+    {
 patrick::with_parameters_test_that(
   "use pl$LazyFrame() to construct a DataFrame",
   .cases = {
@@ -203,4 +204,61 @@ test_that("slice/head/tail work", {
   }
 })
 
-Sys.setenv(POLARS_IN_LAZY_TEST = FALSE)
\ No newline at end of file
+
+test_that("unnest works correctly", {
+  df <- pl$LazyFrame(
+    a = 1:5,
+    b = c("one", "two", "three", "four", "five"),
+    c = 6:10
+  )$
+    select(
+    foo = pl$lit(1),
+    pl$struct("b"),
+    pl$struct(c("a", "c"))$alias("a_and_c")
+  )
+
+  expect_identical(
+    df$unnest("b", "a_and_c"),
+    df$unnest(c("b", "a_and_c"))
+  )
+
+  # wrong input
+  expect_snapshot_lazy(
+    df$unnest("b", pl$col("a_and_c")),
+    error = TRUE
+  )
+  expect_snapshot_lazy(df$unnest(1), error = TRUE)
+
+  # wrong datatype
+  expect_snapshot_lazy(df$unnest("foo"), error = TRUE)
+})
+
+
+make_cases <- function() {
+  tibble::tribble(
+    ~.test_name, ~pola, ~base,
+    "max", "max", max,
+    "mean", "mean", mean,
+    "median", "median", median,
+    "max", "max", max,
+    "min", "min", min,
+    "std", "std", sd,
+    "sum", "sum", sum,
+    "var", "var", var,
+    "first", "first", function(x) head(x, 1),
+    "last", "last", function(x) tail(x, 1)
+  )
+}
+
+patrick::with_parameters_test_that(
+  "simple translations: eager",
+  {
+    browser()
+    a <- as_polars_lf(mtcars)[[pola]]()
+    b <- as_polars_lf(!!!lapply(mtcars, base))
+    expect_equal_lazy(a, b)
+  },
+  .cases = make_cases()
+)
+}
+  )
\ No newline at end of file

From a93abf5db847e771ca39ca7f76b7759eca2b14cb Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 23 Nov 2024 22:46:40 +0100
Subject: [PATCH 26/71] minor

---
 tests/testthat/_snaps/lazyframe-frame.md | 28 ++++++++++++++++++++++++
 tests/testthat/test-lazyframe-frame.R    | 28 ------------------------
 2 files changed, 28 insertions(+), 28 deletions(-)
 create mode 100644 tests/testthat/_snaps/lazyframe-frame.md

diff --git a/tests/testthat/_snaps/lazyframe-frame.md b/tests/testthat/_snaps/lazyframe-frame.md
new file mode 100644
index 00000000..681c5d46
--- /dev/null
+++ b/tests/testthat/_snaps/lazyframe-frame.md
@@ -0,0 +1,28 @@
+# unnest works correctly
+
+    Code
+      current$collect()
+    Condition
+      Error in `current$collect()`:
+      ! Evaluation failed in `$collect()`.
+      Caused by error:
+      ! Invalid operation: invalid selector expression: dyn float: 1.0
+      
+      Resolved plan until failure:
+      
+      	---> FAILED HERE RESOLVING THIS_NODE <---
+       SELECT [dyn float: 1.0.alias("foo"), col("b").as_struct(), col("a").as_struct([col("c")]).alias("a_and_c")] FROM
+        DF ["a", "b", "c"]; PROJECT */3 COLUMNS; SELECTION: None
+
+---
+
+    Code
+      current$collect()
+    Output
+      polars: closing concurrent R handler
+    Condition
+      Error in `current$collect()`:
+      ! Evaluation failed in `$collect()`.
+      Caused by error:
+      ! A polars sub-thread panicked. See panic msg, which is likely more informative than this error: Any { .. }
+
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index f0304d00..83aeca5c 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -232,33 +232,5 @@ test_that("unnest works correctly", {
   # wrong datatype
   expect_snapshot_lazy(df$unnest("foo"), error = TRUE)
 })
-
-
-make_cases <- function() {
-  tibble::tribble(
-    ~.test_name, ~pola, ~base,
-    "max", "max", max,
-    "mean", "mean", mean,
-    "median", "median", median,
-    "max", "max", max,
-    "min", "min", min,
-    "std", "std", sd,
-    "sum", "sum", sum,
-    "var", "var", var,
-    "first", "first", function(x) head(x, 1),
-    "last", "last", function(x) tail(x, 1)
-  )
-}
-
-patrick::with_parameters_test_that(
-  "simple translations: eager",
-  {
-    browser()
-    a <- as_polars_lf(mtcars)[[pola]]()
-    b <- as_polars_lf(!!!lapply(mtcars, base))
-    expect_equal_lazy(a, b)
-  },
-  .cases = make_cases()
-)
 }
   )
\ No newline at end of file

From 3fcdcffd954b3a8d4677c431430b0b02285d189e Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 23 Nov 2024 22:59:34 +0100
Subject: [PATCH 27/71] back to previous way

---
 tests/testthat/helper-expections.R    |  72 +++++++
 tests/testthat/test-dataframe-frame.R | 168 -----------------
 tests/testthat/test-lazyframe-frame.R | 258 ++++++++------------------
 3 files changed, 150 insertions(+), 348 deletions(-)

diff --git a/tests/testthat/helper-expections.R b/tests/testthat/helper-expections.R
index d6a71945..ee9adb33 100644
--- a/tests/testthat/helper-expections.R
+++ b/tests/testthat/helper-expections.R
@@ -1,3 +1,75 @@
+#' Compare the query result with LazyFrame and DataFrame
+#'
+#' Inspired by `compare_dplyr_binding` of the arrow package.
+#' @param object A polars query, must be started with `.input`.
+#' See the examples for details.
+#' @param input R object will be converted to a DataFrame or LazyFrame
+#' by `as_polars_df` or `as_polars_lf`.
+#' @param expected A polars DataFrame, the expected result of the query.
+#' @examples
+#' expect_query_equal(
+#'   .input$select("foo"),
+#'   pl$DataFrame(foo = NULL, bar = NULL),
+#'   pl$DataFrame(foo = NULL)
+#' )
+#' @noRd
+expect_query_equal <- function(object, input, expected) {
+  query <- rlang::enquo(object)
+  out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect()
+  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input))))
+
+  expect_equal(out_lazy, expected)
+  expect_equal(out_eager, expected)
+
+  invisible(NULL)
+}
+
+#' Compare the query error with LazyFrame and DataFrame
+#'
+#' Same as `expect_query_equal()`, but for `expect_error()`.
+#' @param object A polars query, must be started with `.input`.
+#' See `expect_query_equal()` for details.
+#' @param input R object will be converted to a DataFrame or LazyFrame
+#' by `as_polars_df()` or `as_polars_lf()`.
+#' @param regexp passed to `expect_error()`.
+#' @param class passed to `expect_error()`.
+#' @param ... passed to `expect_error()`.
+#' @noRd
+expect_query_error <- function(object, input, regexp = NULL, class = NULL, ...) {
+  query <- rlang::enquo(object)
+  expect_error(
+    rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect(),
+    regexp = regexp,
+    class = class,
+    ...
+  )
+  expect_error(
+    rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input)))),
+    regexp = regexp,
+    class = class,
+    ...
+  )
+
+  invisible(NULL)
+}
+
+#' Mix of `expect_query_equal()` and `expect_query_error()`
+#'
+#' The query only succeeds for DataFrame, but fails for LazyFrame.
+expect_eager_equal_lazy_error <- function(object, input, expected, regexp = NULL, class = NULL, ...) {
+  query <- rlang::enquo(object)
+  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input))))
+
+  expect_equal(out_eager, expected)
+  expect_error(
+    rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect(),
+    regexp = regexp,
+    class = class,
+    ...
+  )
+  invisible(NULL)
+}
+
 ### Helper to detect whether an expectation runs in a test where data is in
 ### lazy mode.
 is_in_lazy_test <- function() {
diff --git a/tests/testthat/test-dataframe-frame.R b/tests/testthat/test-dataframe-frame.R
index 062199f0..feba824f 100644
--- a/tests/testthat/test-dataframe-frame.R
+++ b/tests/testthat/test-dataframe-frame.R
@@ -26,7 +26,6 @@ test_that("pl$DataFrame() rejects expressions", {
 })
 
 test_that("to_struct()", {
-  skip_if(is_in_lazy_test())
   expect_equal(
     as_polars_df(mtcars)$to_struct("foo"),
     as_polars_series(mtcars, "foo")
@@ -34,7 +33,6 @@ test_that("to_struct()", {
 })
 
 test_that("get_columns()", {
-  skip_if(is_in_lazy_test())
   expect_equal(
     pl$DataFrame(a = 1:2, b = c("foo", "bar"))$get_columns(),
     list(
@@ -45,7 +43,6 @@ test_that("get_columns()", {
 })
 
 test_that("to_series()", {
-  skip_if(is_in_lazy_test())
   data <- data.frame(
     a = 1:2,
     b = c("foo", "bar")
@@ -60,168 +57,3 @@ test_that("to_series()", {
     as_polars_series(data$b, "b")
   )
 })
-
-test_that("select works lazy/eager", {
-  .data <- pl$DataFrame(
-    int32 = 1:5,
-    int64 = as_polars_series(1:5)$cast(pl$Int64),
-    string = letters[1:5],
-  )
-
-  expect_equal(
-    .data$select("int32"),
-    pl$DataFrame(int32 = 1:5)
-  )
-  expect_equal(
-    .data$select(pl$lit("int32")),
-    pl$DataFrame(literal = "int32")
-  )
-  expect_equal(
-    .data$select(foo = "int32"),
-    pl$DataFrame(foo = 1:5)
-  )
-})
-
-test_that("POLARS_AUTO_STRUCTIFY works for select", {
-  .data <- pl$DataFrame(
-    foo = 1:3,
-    bar = 6:8,
-    ham = letters[1:3],
-  )
-
-  withr::with_envvar(
-    c(POLARS_AUTO_STRUCTIFY = "foo"),
-    {
-      expect_error(
-        .data$select(1),
-        r"(Environment variable `POLARS_AUTO_STRUCTIFY` must be one of \('0', '1'\), got 'foo')"
-      )
-    }
-  )
-
-  withr::with_envvar(
-    c(POLARS_AUTO_STRUCTIFY = "0"),
-    {
-      expect_error(
-        .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
-        "`keep`, `suffix`, `prefix` should be last expression"
-      )
-
-      expect_equal(
-        withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
-          .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd"))
-        }),
-        as_polars_lf(.data)$select(
-          is_odd = pl$struct(((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
-        )$collect()
-      )
-    }
-  )
-})
-
-test_that("slice/head/tail work", {
-  .data <- pl$DataFrame(
-    foo = 1:5,
-    bar = 6:10,
-  )
-
-  # slice
-  expect_equal(
-    .data$slice(1),
-    pl$DataFrame(foo = 2:5, bar = 7:10)
-  )
-  expect_equal(
-    .data$slice(1, 2),
-    pl$DataFrame(foo = 2:3, bar = 7:8)
-  )
-  expect_equal(
-    .data$slice(1, 2),
-    pl$DataFrame(foo = 2:3, bar = 7:8)
-  )
-  expect_equal(
-    .data$slice(4, 100),
-    pl$DataFrame(foo = 5L, bar = 10L)
-  )
-  if (is_in_lazy_test()) {
-    expect_error(
-      .data$slice(0, -2),
-      r"(-2.0 is out of range that can be safely converted to u32)"
-    )
-  } else {
-    expect_equal(
-      .data$slice(0, -2),
-      pl$DataFrame(foo = 1:3, bar = 6:8)
-    )
-  }
-
-  # head
-  expect_equal(
-    .data$head(1),
-    pl$DataFrame(foo = 1L, bar = 6L)
-  )
-  expect_equal(
-    .data$head(100),
-    .data
-  )
-  if (is_in_lazy_test()) {
-    expect_error(
-      .data$head(-4),
-      r"(-4.0 is out of range that can be safely converted to u32)"
-    )
-  } else {
-    expect_equal(
-      .data$head(-4),
-      pl$DataFrame(foo = 1L, bar = 6L)
-    )
-  }
-
-  # tail
-  expect_equal(
-    .data$tail(1),
-    pl$DataFrame(foo = 5L, bar = 10L)
-  )
-  expect_equal(
-    .data$tail(100),
-    .data
-  )
-  if (is_in_lazy_test()) {
-    expect_error(
-      .data$tail(-4),
-      r"(-4\.0 is out of range that can be safely converted to u32)"
-    )
-  } else {
-    expect_equal(
-      .data$tail(-4),
-      pl$DataFrame(foo = 5L, bar = 10L)
-    )
-  }
-})
-
-
-test_that("unnest works correctly", {
-  df <- pl$DataFrame(
-    a = 1:5,
-    b = c("one", "two", "three", "four", "five"),
-    c = 6:10
-  )$
-    select(
-    foo = pl$lit(1),
-    pl$struct("b"),
-    pl$struct(c("a", "c"))$alias("a_and_c")
-  )
-
-  expect_identical(
-    df$unnest("b", "a_and_c"),
-    df$unnest(c("b", "a_and_c"))
-  )
-
-  # wrong input
-  expect_snapshot(
-    df$unnest("b", pl$col("a_and_c")),
-    error = TRUE
-  )
-  expect_snapshot(df$unnest(1), error = TRUE)
-
-  # wrong datatype
-  expect_snapshot(df$unnest("foo"), error = TRUE)
-})
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index 83aeca5c..c73a5048 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -1,96 +1,29 @@
-#############################################
-### [GENERATED AUTOMATICALLY] Update test-dataframe-frame.R instead.
-#############################################
-
-withr::with_envvar(
-    list(POLARS_IN_LAZY_TEST = TRUE),
-    {
-patrick::with_parameters_test_that(
-  "use pl$LazyFrame() to construct a DataFrame",
-  .cases = {
-    tibble::tribble(
-      ~.test_name, ~object, ~expected,
-      "simple", pl$LazyFrame(a = 1, b = list("b"), ), as_polars_lf(list(a = 1, b = list("b"))),
-      "!!! for list", pl$LazyFrame(!!!list(a = 1, b = list("b")), c = 1), as_polars_lf(list(a = 1, b = list("b"), c = 1)),
-      "!!! for data.frame", pl$LazyFrame(!!!data.frame(a = 1, b = "b"), c = 1), as_polars_lf(list(a = 1, b = "b", c = 1)),
-      "empty", pl$LazyFrame(), as_polars_lf(list()),
-    )
-  },
-  code = {
-    expect_equal_lazy(object, expected)
-  }
-)
-
-test_that("pl$LazyFrame() requires series the same length", {
-  expect_error_lazy(pl$LazyFrame(a = 1:2, b = "foo"), "has length 2")
-})
-
-test_that("pl$LazyFrame() rejects expressions", {
-  expect_error_lazy(
-    pl$LazyFrame(a = 1:2, b = pl$lit("foo")),
-    r"(Try evaluating the expression first using `pl\$select\(\)`)"
-  )
-})
-
-test_that("to_struct()", {
-  skip_if(is_in_lazy_test())
-  expect_equal_lazy(
-    as_polars_lf(mtcars)$to_struct("foo"),
-    as_polars_series(mtcars, "foo")
-  )
-})
-
-test_that("get_columns()", {
-  skip_if(is_in_lazy_test())
-  expect_equal_lazy(
-    pl$LazyFrame(a = 1:2, b = c("foo", "bar"))$get_columns(),
-    list(
-      a = as_polars_series(1:2, "a"),
-      b = as_polars_series(c("foo", "bar"), "b")
-    )
-  )
-})
-
-test_that("to_series()", {
-  skip_if(is_in_lazy_test())
-  data <- data.frame(
-    a = 1:2,
-    b = c("foo", "bar")
-  )
-
-  expect_equal_lazy(
-    as_polars_lf(data)$to_series(),
-    as_polars_series(data$a, "a")
-  )
-  expect_equal_lazy(
-    as_polars_lf(data)$to_series(1),
-    as_polars_series(data$b, "b")
-  )
-})
-
 test_that("select works lazy/eager", {
-  .data <- pl$LazyFrame(
+  .data <- pl$DataFrame(
     int32 = 1:5,
     int64 = as_polars_series(1:5)$cast(pl$Int64),
     string = letters[1:5],
   )
 
-  expect_equal_lazy(
-    .data$select("int32"),
-    pl$LazyFrame(int32 = 1:5)
+  expect_query_equal(
+    .input$select("int32"),
+    .data,
+    pl$DataFrame(int32 = 1:5)
   )
-  expect_equal_lazy(
-    .data$select(pl$lit("int32")),
-    pl$LazyFrame(literal = "int32")
+  expect_query_equal(
+    .input$select(pl$lit("int32")),
+    .data,
+    pl$DataFrame(literal = "int32")
   )
-  expect_equal_lazy(
-    .data$select(foo = "int32"),
-    pl$LazyFrame(foo = 1:5)
+  expect_query_equal(
+    .input$select(foo = "int32"),
+    .data,
+    pl$DataFrame(foo = 1:5)
   )
 })
 
 test_that("POLARS_AUTO_STRUCTIFY works for select", {
-  .data <- pl$LazyFrame(
+  .data <- pl$DataFrame(
     foo = 1:3,
     bar = 6:8,
     ham = letters[1:3],
@@ -99,8 +32,9 @@ test_that("POLARS_AUTO_STRUCTIFY works for select", {
   withr::with_envvar(
     c(POLARS_AUTO_STRUCTIFY = "foo"),
     {
-      expect_error_lazy(
-        .data$select(1),
+      expect_query_error(
+        .input$select(1),
+        .data,
         r"(Environment variable `POLARS_AUTO_STRUCTIFY` must be one of \('0', '1'\), got 'foo')"
       )
     }
@@ -109,15 +43,17 @@ test_that("POLARS_AUTO_STRUCTIFY works for select", {
   withr::with_envvar(
     c(POLARS_AUTO_STRUCTIFY = "0"),
     {
-      expect_error_lazy(
-        .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
+      expect_query_error(
+        .input$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
+        .data,
         "`keep`, `suffix`, `prefix` should be last expression"
       )
 
-      expect_equal_lazy(
+      expect_query_equal(
         withr::with_envvar(c(POLARS_AUTO_STRUCTIFY = "1"), {
-          .data$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd"))
+          .input$select(is_odd = ((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd"))
         }),
+        .data,
         as_polars_lf(.data)$select(
           is_odd = pl$struct(((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
         )$collect()
@@ -126,111 +62,73 @@ test_that("POLARS_AUTO_STRUCTIFY works for select", {
   )
 })
 
-test_that("slice/head/tail work", {
-  .data <- pl$LazyFrame(
+test_that("slice/head/tail works lazy/eager", {
+  .data <- pl$DataFrame(
     foo = 1:5,
     bar = 6:10,
   )
 
   # slice
-  expect_equal_lazy(
-    .data$slice(1),
-    pl$LazyFrame(foo = 2:5, bar = 7:10)
-  )
-  expect_equal_lazy(
-    .data$slice(1, 2),
-    pl$LazyFrame(foo = 2:3, bar = 7:8)
-  )
-  expect_equal_lazy(
-    .data$slice(1, 2),
-    pl$LazyFrame(foo = 2:3, bar = 7:8)
-  )
-  expect_equal_lazy(
-    .data$slice(4, 100),
-    pl$LazyFrame(foo = 5L, bar = 10L)
-  )
-  if (is_in_lazy_test()) {
-    expect_error_lazy(
-      .data$slice(0, -2),
-      r"(-2.0 is out of range that can be safely converted to u32)"
-    )
-  } else {
-    expect_equal_lazy(
-      .data$slice(0, -2),
-      pl$LazyFrame(foo = 1:3, bar = 6:8)
-    )
-  }
-
-  # head
-  expect_equal_lazy(
-    .data$head(1),
-    pl$LazyFrame(foo = 1L, bar = 6L)
+  expect_query_equal(
+    .input$slice(1),
+    .data,
+    pl$DataFrame(foo = 2:5, bar = 7:10)
   )
-  expect_equal_lazy(
-    .data$head(100),
-    .data
+  expect_query_equal(
+    .input$slice(1, 2),
+    .data,
+    pl$DataFrame(foo = 2:3, bar = 7:8)
   )
-  if (is_in_lazy_test()) {
-    expect_error_lazy(
-      .data$head(-4),
-      r"(-4.0 is out of range that can be safely converted to u32)"
-    )
-  } else {
-    expect_equal_lazy(
-      .data$head(-4),
-      pl$LazyFrame(foo = 1L, bar = 6L)
-    )
-  }
-
-  # tail
-  expect_equal_lazy(
-    .data$tail(1),
-    pl$LazyFrame(foo = 5L, bar = 10L)
+  expect_query_equal(
+    .input$slice(1, 2),
+    .data,
+    pl$DataFrame(foo = 2:3, bar = 7:8)
   )
-  expect_equal_lazy(
-    .data$tail(100),
-    .data
+  expect_query_equal(
+    .input$slice(4, 100),
+    .data,
+    pl$DataFrame(foo = 5L, bar = 10L)
   )
-  if (is_in_lazy_test()) {
-    expect_error_lazy(
-      .data$tail(-4),
-      r"(-4\.0 is out of range that can be safely converted to u32)"
-    )
-  } else {
-    expect_equal_lazy(
-      .data$tail(-4),
-      pl$LazyFrame(foo = 5L, bar = 10L)
-    )
-  }
-})
-
-
-test_that("unnest works correctly", {
-  df <- pl$LazyFrame(
-    a = 1:5,
-    b = c("one", "two", "three", "four", "five"),
-    c = 6:10
-  )$
-    select(
-    foo = pl$lit(1),
-    pl$struct("b"),
-    pl$struct(c("a", "c"))$alias("a_and_c")
+  expect_eager_equal_lazy_error(
+    .input$slice(0, -2),
+    .data,
+    pl$DataFrame(foo = 1:3, bar = 6:8),
+    r"(-2.0 is out of range that can be safely converted to u32)"
   )
 
-  expect_identical(
-    df$unnest("b", "a_and_c"),
-    df$unnest(c("b", "a_and_c"))
+  # head
+  expect_query_equal(
+    .input$head(1),
+    .data,
+    pl$DataFrame(foo = 1L, bar = 6L)
+  )
+  expect_query_equal(
+    .input$head(100),
+    .data,
+    .data
   )
-
-  # wrong input
-  expect_snapshot_lazy(
-    df$unnest("b", pl$col("a_and_c")),
-    error = TRUE
+  expect_eager_equal_lazy_error(
+    .input$head(-4),
+    .data,
+    pl$DataFrame(foo = 1L, bar = 6L),
+    r"(-4.0 is out of range that can be safely converted to u32)"
   )
-  expect_snapshot_lazy(df$unnest(1), error = TRUE)
 
-  # wrong datatype
-  expect_snapshot_lazy(df$unnest("foo"), error = TRUE)
+  # tail
+  expect_query_equal(
+    .input$tail(1),
+    .data,
+    pl$DataFrame(foo = 5L, bar = 10L)
+  )
+  expect_query_equal(
+    .input$tail(100),
+    .data,
+    .data
+  )
+  expect_eager_equal_lazy_error(
+    .input$tail(-4),
+    .data,
+    pl$DataFrame(foo = 5L, bar = 10L),
+    r"(-4.0 is out of range that can be safely converted to u32)"
+  )
 })
-}
-  )
\ No newline at end of file

From 81b954c37bd8ef721d2608bf27000d68af0080b4 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 23 Nov 2024 23:20:32 +0100
Subject: [PATCH 28/71] improve custom expectations

---
 tests/testthat/helper-expections.R    | 62 ++++++++++++---------------
 tests/testthat/setup.R                | 23 ----------
 tests/testthat/test-lazyframe-frame.R | 38 ++++++++++++++++
 3 files changed, 65 insertions(+), 58 deletions(-)
 delete mode 100644 tests/testthat/setup.R

diff --git a/tests/testthat/helper-expections.R b/tests/testthat/helper-expections.R
index ee9adb33..5230f9c4 100644
--- a/tests/testthat/helper-expections.R
+++ b/tests/testthat/helper-expections.R
@@ -3,20 +3,41 @@
 #' Inspired by `compare_dplyr_binding` of the arrow package.
 #' @param object A polars query, must be started with `.input`.
 #' See the examples for details.
-#' @param input R object will be converted to a DataFrame or LazyFrame
-#' by `as_polars_df` or `as_polars_lf`.
-#' @param expected A polars DataFrame, the expected result of the query.
+#' @param ... Dynamic dots taking the various inputs specified in `object`. The
+#' last element MUST be the expected output.
 #' @examples
 #' expect_query_equal(
 #'   .input$select("foo"),
 #'   pl$DataFrame(foo = NULL, bar = NULL),
 #'   pl$DataFrame(foo = NULL)
 #' )
+#'
+#' a <- pl$DataFrame(x = 1:2, y = 3:4)
+#' b <- pl$DataFrame(x = 2, z = 5)
+#' expect_query_equal(
+#'   .input$join(.input2, on = "x", how = "inner"),
+#'   .input = a, .input2 = b,
+#'   pl$DataFrame(a = 2, y = 4, z = 5)
+#' )
 #' @noRd
-expect_query_equal <- function(object, input, expected) {
+expect_query_equal <- function(object, ...) {
   query <- rlang::enquo(object)
-  out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_lf(input))))$collect()
-  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(.input = as_polars_df(input))))
+  inputs <- list2(...)
+
+  # Otherwise the expected output needs to be named in all expect_query_equal()
+  expected <- inputs[[length(inputs)]]
+  inputs[[length(inputs)]] <- NULL
+
+  # Just a convenience to avoid naming `.input` when it's the only input in the
+  # query
+  inputs_lazy <- lapply(inputs, \(x) x$lazy())
+  if (length(inputs) == 1 && is.null(names(inputs))) {
+    names(inputs) <- ".input"
+    names(inputs_lazy) <- ".input"
+  }
+
+  out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs_lazy)))$collect()
+  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs)))
 
   expect_equal(out_lazy, expected)
   expect_equal(out_eager, expected)
@@ -69,32 +90,3 @@ expect_eager_equal_lazy_error <- function(object, input, expected, regexp = NULL
   )
   invisible(NULL)
 }
-
-### Helper to detect whether an expectation runs in a test where data is in
-### lazy mode.
-is_in_lazy_test <- function() {
-  nzchar(Sys.getenv("POLARS_IN_LAZY_TEST")) && Sys.getenv("POLARS_IN_LAZY_TEST") == "TRUE"
-}
-
-### Those helpers are equivalent to their counterparts without the "_lazy"
-### suffix but they run on lazyframes
-###
-### They shouldn't be used manually. Instead they are automatically inserted in
-### some test files by the code in setup.R.
-expect_equal_lazy <- function(x, y, ...) {
-  if (inherits(x, "polars_lazy_frame")) {
-    x <- x$collect()
-  }
-  if (inherits(y, "polars_lazy_frame")) {
-    y <- y$collect()
-  }
-  expect_equal(x, y, ...)
-}
-
-expect_error_lazy <- function(current, pattern = ".*", ...) {
-  expect_error(current$collect(), pattern, ...)
-}
-
-expect_snapshot_lazy <- function(current, ...) {
-  expect_snapshot(current$collect(), ...)
-}
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
deleted file mode 100644
index 9aa80877..00000000
--- a/tests/testthat/setup.R
+++ /dev/null
@@ -1,23 +0,0 @@
-Sys.setenv(POLARS_IN_LAZY_TEST = FALSE)
-
-orig <- test_path("test-dataframe-frame.R")
-dest <- test_path("test-lazyframe-frame.R")
-
-tmp <- readLines(orig)
-out <- gsub("pl\\$DataFrame", "pl\\$LazyFrame", tmp)
-out <- gsub("as_polars_df\\(", "as_polars_lf(", out)
-out <- gsub("expect_equal\\(", "expect_equal_lazy(", out)
-out <- gsub("expect_error\\(", "expect_error_lazy(", out)
-out <- gsub("expect_snapshot", "expect_snapshot_lazy", out)
-out <- paste0(
-  "#############################################\n",
-  "### [GENERATED AUTOMATICALLY] Update ", orig, " instead.\n",
-  "#############################################\n\n",
-  "withr::with_envvar(
-    list(POLARS_IN_LAZY_TEST = TRUE),
-    {\n",
-  paste(out, collapse = "\n"),
-  "\n}
-  )"
-)
-cat(out, file = dest)
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index c73a5048..df0a69ff 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -132,3 +132,41 @@ test_that("slice/head/tail works lazy/eager", {
     r"(-4.0 is out of range that can be safely converted to u32)"
   )
 })
+
+test_that("shift works lazy/eager", {
+  .data <- as_polars_df(mtcars[1:3, 1:2])
+  expect_query_equal(
+    .input$shift(2),
+    .data,
+    pl$DataFrame(mpg = c(NA, NA, 21), cyl = c(NA, NA, 6))
+  )
+  expect_query_equal(
+    .input$shift(2, fill_value = 999),
+    .data,
+    pl$DataFrame(mpg = c(999, 999, 21), cyl = c(999, 999, 6))
+  )
+})
+
+test_that("joins work lazy/eager", {
+  df <- pl$DataFrame(
+    foo = 1:3,
+    bar = c(6, 7, 8),
+    ham = c("a", "b", "c")
+  )
+
+  other_df <- pl$DataFrame(
+    apple = c("x", "y", "z"),
+    ham = c("a", "b", "d")
+  )
+
+  expect_query_equal(
+    .input$join(.input2, on = "ham"),
+    .input = df, .input2 = other_df,
+    pl$DataFrame(
+      foo = 1:2,
+      bar = c(6, 7),
+      ham = c("a", "b"),
+      apple = c("x", "y")
+    )
+  )
+})

From 4009a6f0094df6c6288589706c00839f1e2cea72 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sun, 24 Nov 2024 15:25:37 +0100
Subject: [PATCH 29/71] more

---
 R/lazyframe-frame.R                   |  38 +++----
 tests/testthat/test-lazyframe-frame.R | 145 +++++++++++++++++++++++++-
 2 files changed, 163 insertions(+), 20 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 962eeffb..51cb8f78 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -595,21 +595,22 @@ lazyframe__sort <- function(
     nulls_last = FALSE,
     multithreaded = TRUE,
     maintain_order = FALSE) {
-  wrap({
-    check_dots_unnamed()
+  check_dots_unnamed()
 
-    by <- parse_into_list_of_expressions(...)
-    descending <- extend_bool(descending, length(by), "descending", "...")
-    nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...")
-
-    self$`_ldf`$sort_by_exprs(
-      by,
-      descending = descending,
-      nulls_last = nulls_last,
-      multithreaded = multithreaded,
-      maintain_order = maintain_order
-    )
-  })
+  by <- parse_into_list_of_expressions(...)
+  if (length(by) == 0) {
+    abort("`...` must contain at least one element.")
+  }
+  descending <- extend_bool(descending, length(by), "descending", "...")
+  nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...")
+
+  self$`_ldf`$sort_by_exprs(
+    by,
+    descending = descending,
+    nulls_last = nulls_last,
+    multithreaded = multithreaded,
+    maintain_order = maintain_order
+  ) |> wrap()
 }
 
 #' Modify/append column(s) of a LazyFrame
@@ -661,12 +662,11 @@ lazyframe__sort <- function(
 #'   })
 #' }
 lazyframe__with_columns <- function(...) {
-  wrap({
-    structify <- parse_env_auto_structify()
+  structify <- parse_env_auto_structify()
 
-    parse_into_list_of_expressions(..., `__structify` = structify) |>
-      self$`_ldf`$with_columns()
-  })
+  parse_into_list_of_expressions(..., `__structify` = structify) |>
+    self$`_ldf`$with_columns() |>
+    wrap()
 }
 
 #' Modify/append column(s) of a LazyFrame
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index df0a69ff..3c5a2a60 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -56,7 +56,7 @@ test_that("POLARS_AUTO_STRUCTIFY works for select", {
         .data,
         as_polars_lf(.data)$select(
           is_odd = pl$struct(((pl$col(pl$Int32) %% 2) == 1)$name$suffix("_is_odd")),
-        )$collect()
+        )
       )
     }
   )
@@ -147,6 +147,7 @@ test_that("shift works lazy/eager", {
   )
 })
 
+# TODO-REWRITE: add $join() for DataFrame
 test_that("joins work lazy/eager", {
   df <- pl$DataFrame(
     foo = 1:3,
@@ -170,3 +171,145 @@ test_that("joins work lazy/eager", {
     )
   )
 })
+
+test_that("sort works with lazy/eager", {
+  expect_query_error(
+    .input$sort(complex(1)),
+    pl$DataFrame(x = 1),
+    "Unsupported class"
+  )
+  expect_query_error(
+    .input$sort(by = complex(1)),
+    pl$DataFrame(x = 1),
+    "must be passed by position"
+  )
+  expect_query_error(
+    .input$sort(),
+    pl$DataFrame(x = 1),
+    "at least one element"
+  )
+  # `descending` and `nulls_last` need either 1 or as many booleans as items
+  expect_query_error(
+    .input$sort("cyl", "mpg", "drat", descending = c(TRUE, FALSE)),
+    pl$DataFrame(x = 1),
+    "does not match"
+  )
+  expect_query_error(
+    .input$sort("cyl", "mpg", "drat", nulls_last = c(TRUE, FALSE)),
+    pl$DataFrame(x = 1),
+    "does not match"
+  )
+
+  # `descending` and `nulls_last` can only take booleans
+  expect_query_error(
+    .input$sort("cyl", "mpg", "drat", descending = 42),
+    pl$DataFrame(x = 1),
+    "must be a logical vector"
+  )
+  expect_query_error(
+    .input$sort("cyl", "mpg", "drat", descending = NULL),
+    pl$DataFrame(x = 1),
+    "must be a logical vector"
+  )
+  expect_query_error(
+    .input$sort("cyl", "mpg", "drat", nulls_last = 42),
+    pl$DataFrame(x = 1),
+    "must be a logical vector"
+  )
+  expect_query_error(
+    .input$sort("cyl", "mpg", "drat", nulls_last = NULL),
+    pl$DataFrame(x = 1),
+    "must be a logical vector"
+  )
+
+  df <- pl$DataFrame(
+    x = c(3, 3, 4, 1, 2),
+    y = c(2, 1, 5, 4, 3)
+  )
+  expect_query_equal(
+    .input$sort("x", maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = c(1, 2, 3, 3, 4), y = c(4, 3, 2, 1, 5))
+  )
+  expect_query_equal(
+    .input$sort(pl$col("x"), maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = c(1, 2, 3, 3, 4), y = c(4, 3, 2, 1, 5))
+  )
+  # several columns
+  expect_query_equal(
+    .input$sort("x", "y", maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = c(1, 2, 3, 3, 4), y = c(4, 3, 1, 2, 5))
+  )
+  expect_query_equal(
+    .input$sort(pl$col("x"), pl$col("y"), maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = c(1, 2, 3, 3, 4), y = c(4, 3, 1, 2, 5))
+  )
+  expect_query_equal(
+    .input$sort(c("x", "y"), maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = c(1, 2, 3, 3, 4), y = c(4, 3, 1, 2, 5))
+  )
+
+  # descending arg
+  expect_query_equal(
+    .input$sort("x", "y", maintain_order = TRUE, descending = TRUE),
+    df,
+    pl$DataFrame(x = c(4, 3, 3, 2, 1), y = c(5, 2, 1, 3, 4))
+  )
+
+  # descending arg: vector of boolean
+  expect_query_equal(
+    .input$sort("x", "y", maintain_order = TRUE, descending = c(TRUE, FALSE)),
+    df,
+    pl$DataFrame(x = c(4, 3, 3, 2, 1), y = c(5, 1, 2, 3, 4))
+  )
+
+  # expr: one increasing and one decreasing
+  expect_query_equal(
+    .input$sort(-pl$col("x"), pl$col("y"), maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = c(4, 3, 3, 2, 1), y = c(5, 1, 2, 3, 4))
+  )
+
+  # nulls_last
+  df <- pl$DataFrame(
+    x = c(NA, 3, 4, 1, 2),
+    y = c(2, 1, 5, 4, 3)
+  )
+  expect_query_equal(
+    .input$sort("x", "y", maintain_order = TRUE, nulls_last = TRUE),
+    df,
+    pl$DataFrame(x = c(1, 2, 3, 4, NA), y = c(4, 3, 1, 5, 2))
+  )
+  expect_query_equal(
+    .input$sort("x", "y", maintain_order = TRUE, nulls_last = FALSE),
+    df,
+    pl$DataFrame(x = c(NA, 1, 2, 3, 4), y = c(2, 4, 3, 1, 5))
+  )
+})
+
+# TODO-REWRITE: add $rename() for DataFrame
+patrick::with_parameters_test_that(
+  "rename works with lazy/eager",
+  {
+    dat <- do.call(fun, list(mtcars))
+    dat2 <- dat$rename(mpg = "miles_per_gallon", hp = "horsepower")
+    if (is_polars_lf(dat2)) {
+      dat2 <- dat2$collect()
+    }
+    nms <- names(dat2)
+    expect_false("hp" %in% nms)
+    expect_false("mpg" %in% nms)
+    expect_true("miles_per_gallon" %in% nms)
+    expect_true("horsepower" %in% nms)
+
+    expect_error(
+      dat$rename(),
+      "must be character, not NULL"
+    )
+  },
+  fun = c("as_polars_df", "as_polars_lf")
+)

From f4e657d18171a5e788e7e4fc7b5d6b0a95cf8de3 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sun, 24 Nov 2024 22:47:12 +0100
Subject: [PATCH 30/71] more tests

---
 tests/testthat/test-lazyframe-frame.R | 83 +++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index 3c5a2a60..c5e0328e 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -313,3 +313,86 @@ patrick::with_parameters_test_that(
   },
   fun = c("as_polars_df", "as_polars_lf")
 )
+
+# TODO-REWRITE: requires $name$map()
+# patrick::with_parameters_test_that(
+#   "rename works with lazy/eager",
+#   {
+#     dat <- do.call(fun, list(data.frame(foo = 1:3, bar = 6:8, ham = letters[1:3])))
+#     dat2 <- dat$rename(
+#       \(column_name) paste0("c", substr(column_name, 2, 100))
+#     )
+#     if (is_polars_lf(dat2)) {
+#       dat2 <- dat2$collect()
+#     }
+#     expect_named(dat2, c("coo", "car", "cam"))
+#   },
+#   fun = c("as_polars_df", "as_polars_lf")
+# )
+
+test_that("explode", {
+  df <- pl$DataFrame(
+    letters = c("a", "a", "b", "c"),
+    numbers = list(1, c(2, 3), c(4, 5), c(6, 7, 8)),
+    jumpers = list(1, c(2, 3), c(4, 5), c(6, 7, 8))
+  )
+
+  expected_df <- pl$DataFrame(
+    letters = c(rep("a", 3), "b", "b", rep("c", 3)),
+    numbers = 1:8,
+    jumpers = 1:8
+  )
+
+  # as vector
+  expect_query_equal(
+    .input$explode(c("numbers", "jumpers")),
+    df,
+    expected_df
+  )
+
+  # as ...
+  expect_query_equal(
+    df$explode("numbers", pl$col("jumpers")),
+    df,
+    expected_df
+  )
+
+
+  # empty values -> NA
+  df <- pl$DataFrame(
+    letters = c("a", "a", "b", "c"),
+    numbers = list(1, NULL, c(4, 5), c(6, 7, 8))
+  )
+  expect_query_equal(
+    .input$explode("numbers"),
+    df,
+    pl$DataFrame(
+      letters = c(rep("a", 2), "b", "b", rep("c", 3)),
+      numbers = c(1, NA, 4:8)
+    )
+  )
+
+  # several cols to explode test2
+  df <- pl$DataFrame(
+    letters = c("a", "a", "b", "c"),
+    numbers = list(1, NULL, c(4, 5), c(6, 7, 8)),
+    numbers2 = list(1, NULL, c(4, 5), c(6, 7, 8))
+  )
+  expect_query_equal(
+    .input$explode("numbers", pl$col("numbers2")),
+    df,
+    pl$DataFrame(
+      letters = c(rep("a", 2), "b", "b", rep("c", 3)),
+      numbers = c(1, NA, 4:8),
+      numbers2 = c(1, NA, 4:8)
+    )
+  )
+})
+
+test_that("with_row_index", {
+  expect_query_equal(
+    .input$with_row_index("idx", 42),
+    pl$DataFrame(x = 1:3),
+    pl$DataFrame(idx = 42:44, x = 1:3)$cast(idx = pl$UInt32)
+  )
+})

From 4dc4d0cb014f9db2f98339f82b4e8b8204fa6615 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 27 Nov 2024 15:48:08 +0100
Subject: [PATCH 31/71] minor

---
 tests/testthat/test-lazyframe-frame.R | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index c5e0328e..4d98cb97 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -343,21 +343,17 @@ test_that("explode", {
     jumpers = 1:8
   )
 
-  # as vector
   expect_query_equal(
     .input$explode(c("numbers", "jumpers")),
     df,
     expected_df
   )
-
-  # as ...
   expect_query_equal(
     df$explode("numbers", pl$col("jumpers")),
     df,
     expected_df
   )
 
-
   # empty values -> NA
   df <- pl$DataFrame(
     letters = c("a", "a", "b", "c"),

From ec91129f8cff08f8165bb0107ec8f3b8555e1461 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 28 Nov 2024 21:50:57 +0100
Subject: [PATCH 32/71] more

---
 R/lazyframe-frame.R                   | 14 +++++++-----
 src/rust/src/conversion/mod.rs        | 13 -----------
 tests/testthat/test-lazyframe-frame.R | 32 ++++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 51cb8f78..054271d4 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1779,8 +1779,12 @@ lazyframe__rolling <- function(
   wrap({
     check_dots_empty0(...)
     closed <- arg_match0(closed, values = c("both", "left", "right", "none"))
-    period <- parse_as_polars_duration_string(period)
-    offset <- parse_as_polars_duration_string(offset) %||% negate_duration_string(period)
+    period <- parse_as_duration_string(period)
+    if (!is.null(offset)) {
+      offset <- parse_as_duration_string(offset)
+    } else {
+      offset <- negate_duration_string(period)
+    }
     by <- parse_into_list_of_expressions(!!!group_by)
     self$`_ldf`$rolling(
       as_polars_expr(index_column)$`_rexpr`, period, offset, closed, by
@@ -1944,9 +1948,9 @@ lazyframe__group_by_dynamic <- function(
         "friday", "saturday", "sunday"
       )
     )
-    every <- parse_as_polars_duration_string(every)
-    offset <- parse_as_polars_duration_string(offset) %||% "0ns"
-    period <- parse_as_polars_duration_string(period) %||% every
+    every <- parse_as_duration_string(every)
+    offset <- parse_as_duration_string(offset) %||% "0ns"
+    period <- parse_as_duration_string(period) %||% every
     group_by <- parse_into_list_of_expressions(!!!group_by)
 
     self$`_ldf`$group_by_dynamic(
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index 89e3deb4..09022719 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -286,19 +286,6 @@ impl TryFrom<NumericScalar> for Wrap<usize> {
     }
 }
 
-impl TryFrom<NumericScalar> for Wrap<NonZeroUsize> {
-    type Error = savvy::Error;
-
-    fn try_from(n: NumericScalar) -> Result<Self, savvy::Error> {
-        let n = n.as_usize()?;
-        if n == 0 {
-            Err("Cannot convert to non-zero usize.".into())
-        } else {
-            Ok(Wrap(NonZeroUsize::new(n).unwrap()))
-        }
-    }
-}
-
 impl TryFrom<NumericSexp> for Wrap<Vec<usize>> {
     type Error = savvy::Error;
 
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index 4d98cb97..69df6de4 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -172,7 +172,7 @@ test_that("joins work lazy/eager", {
   )
 })
 
-test_that("sort works with lazy/eager", {
+test_that("sort()", {
   expect_query_error(
     .input$sort(complex(1)),
     pl$DataFrame(x = 1),
@@ -293,7 +293,7 @@ test_that("sort works with lazy/eager", {
 
 # TODO-REWRITE: add $rename() for DataFrame
 patrick::with_parameters_test_that(
-  "rename works with lazy/eager",
+  "rename()",
   {
     dat <- do.call(fun, list(mtcars))
     dat2 <- dat$rename(mpg = "miles_per_gallon", hp = "horsepower")
@@ -316,7 +316,7 @@ patrick::with_parameters_test_that(
 
 # TODO-REWRITE: requires $name$map()
 # patrick::with_parameters_test_that(
-#   "rename works with lazy/eager",
+#   "rename()",
 #   {
 #     dat <- do.call(fun, list(data.frame(foo = 1:3, bar = 6:8, ham = letters[1:3])))
 #     dat2 <- dat$rename(
@@ -392,3 +392,29 @@ test_that("with_row_index", {
     pl$DataFrame(idx = 42:44, x = 1:3)$cast(idx = pl$UInt32)
   )
 })
+
+test_that("rolling: date variable", {
+  df <- pl$DataFrame(
+    dt = c(
+      "2020-01-01", "2020-01-01", "2020-01-01",
+      "2020-01-02", "2020-01-03", "2020-01-08"
+    ),
+    a = c(3, 7, 5, 9, 2, 1)
+  )$with_columns(
+    pl$col("dt")$str$strptime(pl$Date, format = NULL)
+  )
+
+  expect_query_equal(
+    .input$rolling(index_column = "dt", period = "2d")$agg(
+      pl$sum("a")$alias("sum_a"),
+      pl$min("a")$alias("min_a"),
+      pl$max("a")$alias("max_a")
+    )$select("sum_a", "min_a", "max_a"),
+    df,
+    pl$DataFrame(
+      sum_a = c(15, 15, 15, 24, 11, 1),
+      min_a = c(3, 3, 3, 3, 2, 1),
+      max_a = c(7, 7, 7, 9, 9, 1)
+    )
+  )
+})

From ffafb75eb1ffa3dc411b42756163a8bbd8d36fce Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 28 Nov 2024 23:22:11 +0100
Subject: [PATCH 33/71] redoc

---
 man/dataframe__unnest.Rd | 33 +++++++++++++++++++++++++++++++++
 man/pl.Rd                |  2 +-
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 man/dataframe__unnest.Rd

diff --git a/man/dataframe__unnest.Rd b/man/dataframe__unnest.Rd
new file mode 100644
index 00000000..3eb99644
--- /dev/null
+++ b/man/dataframe__unnest.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dataframe-frame.R
+\name{dataframe__unnest}
+\alias{dataframe__unnest}
+\title{Decompose struct columns into separate columns for each of their fields}
+\usage{
+dataframe__unnest(...)
+}
+\arguments{
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name of the struct column(s)
+that should be unnested.}
+}
+\value{
+A polars \link{DataFrame}
+}
+\description{
+The new columns will be inserted into the LazyFrame at the location of the
+struct column.
+}
+\examples{
+df <- pl$DataFrame(
+  a = 1:5,
+  b = c("one", "two", "three", "four", "five"),
+  c = 6:10
+)$
+  select(
+  pl$struct("b"),
+  a_and_c = pl$struct(c("a", "c"))
+)
+df
+
+df$unnest("a_and_c")
+}
diff --git a/man/pl.Rd b/man/pl.Rd
index 0c796695..33e6b48d 100644
--- a/man/pl.Rd
+++ b/man/pl.Rd
@@ -5,7 +5,7 @@
 \alias{pl}
 \title{Polars top-level function namespace}
 \format{
-An object of class \code{polars_object} of length 76.
+An object of class \code{polars_object} of length 75.
 }
 \usage{
 pl

From 4d3f654b8755ebe34c6e261de42976bbabe8953c Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Fri, 29 Nov 2024 21:22:37 +0100
Subject: [PATCH 34/71] more tests for rolling

---
 R/lazyframe-frame.R                   |   3 +
 tests/testthat/test-lazyframe-frame.R | 175 ++++++++++++++++++++++++++
 2 files changed, 178 insertions(+)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 054271d4..bc2ccb87 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1785,6 +1785,9 @@ lazyframe__rolling <- function(
     } else {
       offset <- negate_duration_string(period)
     }
+    if (!is.null(group_by) && !is.list(group_by)) {
+      group_by <- list(group_by)
+    }
     by <- parse_into_list_of_expressions(!!!group_by)
     self$`_ldf`$rolling(
       as_polars_expr(index_column)$`_rexpr`, period, offset, closed, by
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index 69df6de4..f329fdb1 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -418,3 +418,178 @@ test_that("rolling: date variable", {
     )
   )
 })
+
+test_that("rolling: datetime variable", {
+  df <- pl$DataFrame(
+    dt = c(
+      "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09",
+      "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43"
+    ),
+    a = c(3, 7, 5, 9, 2, 1)
+  )$with_columns(
+    pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)
+  )
+
+  expect_query_equal(
+    .input$rolling(index_column = "dt", period = "2d")$agg(
+      pl$sum("a")$alias("sum_a"),
+      pl$min("a")$alias("min_a"),
+      pl$max("a")$alias("max_a")
+    )$select("sum_a", "min_a", "max_a"),
+    df,
+    pl$DataFrame(
+      sum_a = c(3, 10, 15, 24, 11, 1),
+      min_a = c(3, 3, 3, 3, 2, 1),
+      max_a = c(3, 7, 7, 9, 9, 1)
+    )
+  )
+})
+
+test_that("rolling: integer variable", {
+  df <- pl$DataFrame(
+    index = c(1L, 2L, 3L, 4L, 8L, 9L),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+
+  expect_query_equal(
+    .input$rolling(index_column = "index", period = "2i")$agg(
+      pl$sum("a")$alias("sum_a"),
+      pl$min("a")$alias("min_a"),
+      pl$max("a")$alias("max_a")
+    )$select("sum_a", "min_a", "max_a"),
+    df,
+    pl$DataFrame(
+      sum_a = c(3, 10, 12, 14, 2, 3),
+      min_a = c(3, 3, 5, 5, 2, 1),
+      max_a = c(3, 7, 7, 9, 2, 2)
+    )
+  )
+})
+
+test_that("rolling: using difftime as period", {
+  df <- pl$DataFrame(
+    dt = as.Date(c(
+      "2020-01-01", "2020-01-01", "2020-01-01",
+      "2020-01-02", "2020-01-03", "2020-01-08"
+    )),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+  expected <- pl$DataFrame(
+    dt = as.Date(c(
+      "2020-01-01", "2020-01-01", "2020-01-01",
+      "2020-01-02", "2020-01-03", "2020-01-08"
+    )),
+    sum_a = c(15, 15, 15, 24, 11, 1)
+  )
+
+  expect_query_equal(
+    .input$rolling(index_column = "dt", period = "2d")$agg(
+      pl$sum("a")$alias("sum_a")
+    ),
+    df,
+    expected
+  )
+  expect_query_equal(
+    .input$rolling(index_column = "dt", period = as.difftime(2, units = "days"))$agg(
+      pl$sum("a")$alias("sum_a")
+    ),
+    df,
+    expected
+  )
+})
+
+test_that("rolling: error if period is negative", {
+  df <- pl$DataFrame(
+    index = c(1L, 2L, 3L, 4L, 8L, 9L),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+  expect_query_error(
+    .input$rolling(index_column = "index", period = "-2i")$agg(pl$col("a")),
+    df,
+    "rolling window period should be strictly positive"
+  )
+})
+
+test_that("rolling: argument 'group_by' works", {
+  df <- pl$DataFrame(
+    index = c(1L, 2L, 3L, 4L, 8L, 9L),
+    grp = c("a", "a", rep("b", 4)),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+  expect_query_equal(
+    .input$rolling(index_column = "index", period = "2i", group_by = pl$col("grp"))$agg(
+      pl$sum("a")$alias("sum_a"),
+      pl$min("a")$alias("min_a"),
+      pl$max("a")$alias("max_a")
+    )$select("sum_a", "min_a", "max_a"),
+    df,
+    pl$DataFrame(
+      sum_a = c(3, 10, 5, 14, 2, 3),
+      min_a = c(3, 3, 5, 5, 2, 1),
+      max_a = c(3, 7, 5, 9, 2, 2)
+    )
+  )
+
+  # string is parsed as column name in "group_by"
+  expect_query_equal(
+    .input$rolling(index_column = "index", period = "2i", group_by = "grp")$agg(
+      pl$sum("a")$alias("sum_a"),
+      pl$min("a")$alias("min_a"),
+      pl$max("a")$alias("max_a")
+    )$select("sum_a", "min_a", "max_a"),
+    df,
+    pl$DataFrame(
+      sum_a = c(3, 10, 5, 14, 2, 3),
+      min_a = c(3, 3, 5, 5, 2, 1),
+      max_a = c(3, 7, 5, 9, 2, 2)
+    )
+  )
+})
+test_that("rolling for LazyFrame: error if index not int or date/time", {
+  df <- pl$LazyFrame(
+    index = c(1:5, 6.0),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+
+  expect_grepl_error(
+    df$rolling(index_column = "index", period = "2i")$agg(
+      pl$sum("a")$alias("sum_a")
+    )$collect()
+  )
+})
+
+test_that("rolling: arg 'offset' works", {
+  df <- pl$DataFrame(
+    dt = as.Date(c(
+      "2020-01-01", "2020-01-01", "2020-01-01",
+      "2020-01-02", "2020-01-03", "2020-01-08"
+    )),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+
+  expect_query_equal(
+    .input$rolling(index_column = "dt", period = "2d", offset = "1d")$agg(
+      pl$sum("a")$alias("sum_a"),
+      pl$min("a")$alias("min_a"),
+      pl$max("a")$alias("max_a")
+    )$select("sum_a", "min_a", "max_a"),
+    df,
+    pl$DataFrame(
+      sum_a = c(2, 2, 2, NA, NA, NA),
+      min_a = c(2, 2, 2, NA, NA, NA),
+      max_a = c(2, 2, 2, NA, NA, NA)
+    )
+  )
+})
+
+test_that("rolling: can be ungrouped", {
+  df <- pl$DataFrame(
+    index = c(1:5, 6.0),
+    a = c(3, 7, 5, 9, 2, 1)
+  )
+  expect_query_equal(
+    .input$rolling(index_column = "dt", period = "2i")$ungroup(),
+    df,
+    df
+  )
+})

From 4e608da9aa279454338e71c4e0d3a5285cbb6e51 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Fri, 29 Nov 2024 21:39:45 +0100
Subject: [PATCH 35/71] more tests

---
 tests/testthat/helper-expections.R    |   8 +-
 tests/testthat/test-lazyframe-frame.R | 152 ++++++++++++++++++++++++++
 2 files changed, 157 insertions(+), 3 deletions(-)

diff --git a/tests/testthat/helper-expections.R b/tests/testthat/helper-expections.R
index 5230f9c4..55d55833 100644
--- a/tests/testthat/helper-expections.R
+++ b/tests/testthat/helper-expections.R
@@ -37,10 +37,12 @@ expect_query_equal <- function(object, ...) {
   }
 
   out_lazy <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs_lazy)))$collect()
-  out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs)))
-
   expect_equal(out_lazy, expected)
-  expect_equal(out_eager, expected)
+
+  # TODO-REWRITE: uncomment when eager functions are implemented
+  # out_eager <- rlang::eval_tidy(query, rlang::new_data_mask(rlang::env(!!!inputs)))
+  # expect_equal(out_eager, expected)
+
 
   invisible(NULL)
 }
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index f329fdb1..e439d884 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -593,3 +593,155 @@ test_that("rolling: can be ungrouped", {
     df
   )
 })
+
+test_that("with_columns_seq", {
+  df <- pl$DataFrame(x = 1:2)
+
+  expect_query_equal(
+    .input$with_columns_seq(y = list(1:2, 3:4)),
+    df,
+    pl$DataFrame(x = 1:2, y = list(1:2, 3:4))
+  )
+
+  expect_query_equal(
+    .input$with_columns_seq(y = list(1:2, 3:4), z = list(c("a", "b"), c("c", "d"))),
+    df,
+    pl$DataFrame(x = 1:2, y = list(1:2, 3:4), z = list(c("a", "b"), c("c", "d")))
+  )
+})
+
+test_that("$clear() works", {
+  df <- pl$DataFrame(
+    a = c(NA, 2),
+    b = c("a", NA),
+    c = c(TRUE, TRUE)
+  )
+
+  expect_query_equal(
+    .input$clear(),
+    df,
+    pl$DataFrame(a = numeric(0), b = character(0), c = logical(0))
+  )
+
+  # n > number of rows
+  expect_query_equal(
+    .input$clear(3),
+    df,
+    pl$DataFrame(a = rep(NA_real_, 3), b = rep(NA_character_, 3), c = rep(NA, 3))
+  )
+
+  # error
+  expect_query_error(
+    .input$clear(-1),
+    df,
+    "greater or equal to 0"
+  )
+})
+
+test_that("$explain() works", {
+  lazy_query <- as_polars_lf(iris)$sort("Species")$filter(pl$col("Species") != "setosa")
+
+  expect_error(
+    lazy_query$explain(format = "foobar"),
+    "`format` must be one of"
+  )
+  expect_error(
+    lazy_query$explain(format = 1),
+    "`format` must be a string or character vector"
+  )
+
+  expect_snapshot(cat(lazy_query$explain(optimized = FALSE)))
+  expect_snapshot(cat(lazy_query$explain()))
+
+  expect_snapshot(cat(lazy_query$explain(format = "tree", optimized = FALSE)))
+  expect_snapshot(cat(lazy_query$explain(format = "tree", )))
+})
+
+test_that("$gather_every() works", {
+  df <- pl$DataFrame(a = 1:4, b = 5:8)
+
+  expect_query_equal(
+    .input$gather_every(2),
+    df,
+    pl$DataFrame(a = c(1L, 3L), b = c(5L, 7L))
+  )
+  expect_query_equal(
+    .input$gather_every(2, offset = 1),
+    df,
+    pl$DataFrame(a = c(2L, 4L), b = c(6L, 8L))
+  )
+
+  # must specify n
+  expect_query_error(
+    .input$gather_every(),
+    df,
+    r"(argument "n" is missing)"
+  )
+
+  # offset must be positive
+  expect_query_error(
+    .input$gather_every(2, offset = -1),
+    df,
+    "cannot be less than zero"
+  )
+  expect_query_error(
+    .input$gather_every(2, offset = "a"),
+    df,
+    "Expected a value of type"
+  )
+})
+
+test_that("$cast() works", {
+  df <- pl$DataFrame(
+    foo = 1:3,
+    bar = c(6, 7, 8),
+    ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06"))
+  )
+
+  expect_query_equal(
+    .input$cast(foo = pl$Float32, bar = pl$UInt8),
+    df,
+    pl$DataFrame(
+      foo = 1:3,
+      bar = c(6, 7, 8),
+      ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")),
+      .schema_overrides = list(foo = pl$Float32, bar = pl$UInt8, ham = pl$Date)
+    )
+  )
+
+  expect_query_equal(
+    .input$cast(pl$String),
+    df,
+    pl$DataFrame(
+      foo = 1:3,
+      bar = c(6, 7, 8),
+      ham = as.Date(c("2020-01-02", "2020-03-04", "2020-05-06")),
+      .schema_overrides = list(foo = pl$String, bar = pl$String, ham = pl$String)
+    )
+  )
+
+  expect_query_equal(
+    .input$cast(),
+    df,
+    df
+  )
+
+  expect_query_error(.input$cast(1), df)
+  expect_query_error(.input$cast("a"), df)
+  expect_query_error(.input$cast(list(foo = "a")), df)
+  expect_query_error(.input$cast(list(), strict = 1), df)
+
+  # Test overflow error
+  df <- pl$DataFrame(x = 1024)
+
+  expect_query_error(
+    .input$cast(pl$Int8),
+    df,
+    "conversion from `f64` to `i8` failed"
+  )
+  expect_query_equal(
+    .input$cast(pl$Int8, .strict = FALSE),
+    df,
+    pl$DataFrame(x = NA_integer_, .schema_overrides = list(x = pl$Int8))
+  )
+})

From ff1c6d85ab57880deba7ea00e89aa4b51dae9c9b Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Fri, 29 Nov 2024 23:21:29 +0100
Subject: [PATCH 36/71] join_asof tests

---
 tests/testthat/test-lazyframe-frame.R | 194 +++++++++++++++++++++++++-
 1 file changed, 190 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index e439d884..a2477bf5 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -1,4 +1,4 @@
-test_that("select works lazy/eager", {
+test_that("select", {
   .data <- pl$DataFrame(
     int32 = 1:5,
     int64 = as_polars_series(1:5)$cast(pl$Int64),
@@ -62,7 +62,7 @@ test_that("POLARS_AUTO_STRUCTIFY works for select", {
   )
 })
 
-test_that("slice/head/tail works lazy/eager", {
+test_that("slice/head/tail", {
   .data <- pl$DataFrame(
     foo = 1:5,
     bar = 6:10,
@@ -133,7 +133,7 @@ test_that("slice/head/tail works lazy/eager", {
   )
 })
 
-test_that("shift works lazy/eager", {
+test_that("shift", {
   .data <- as_polars_df(mtcars[1:3, 1:2])
   expect_query_equal(
     .input$shift(2),
@@ -148,7 +148,7 @@ test_that("shift works lazy/eager", {
 })
 
 # TODO-REWRITE: add $join() for DataFrame
-test_that("joins work lazy/eager", {
+test_that("join", {
   df <- pl$DataFrame(
     foo = 1:3,
     bar = c(6, 7, 8),
@@ -745,3 +745,189 @@ test_that("$cast() works", {
     pl$DataFrame(x = NA_integer_, .schema_overrides = list(x = pl$Int8))
   )
 })
+
+test_that("inequality joins work", {
+  east <- pl$DataFrame(
+    id = c(100, 101, 102),
+    dur = c(120, 140, 160),
+    rev = c(12, 14, 16),
+    cores = c(2, 8, 4)
+  )
+  west <- pl$DataFrame(
+    t_id = c(404, 498, 676, 742),
+    time = c(90, 130, 150, 170),
+    cost = c(9, 13, 15, 16),
+    cores = c(4, 2, 1, 4)
+  )
+
+  expect_query_equal(
+    .input$join_where(
+      .input2,
+      pl$col("dur") < pl$col("time"),
+      pl$col("rev") < pl$col("cost")
+    ),
+    .input = east, .input2 = west,
+    pl$DataFrame(
+      id = rep(c(100, 101), 3:2),
+      dur = rep(c(120, 140), 3:2),
+      rev = rep(c(12, 14), 3:2),
+      cores = rep(c(2, 8), 3:2),
+      t_id = c(498, 676, 742, 676, 742),
+      time = c(130, 150, 170, 150, 170),
+      cost = c(13, 15, 16, 15, 16),
+      cores_right = c(2, 1, 4, 1, 4)
+    )
+  )
+
+  expect_query_error(
+    east$join_where(
+      mtcars,
+      pl$col("dur") < pl$col("time"),
+      pl$col("rev") < pl$col("cost")
+    ),
+    "`other` must be a LazyFrame"
+  )
+})
+
+test_that("inequality joins require suffix when identical column names", {
+  east <- pl$DataFrame(
+    id = c(100, 101, 102),
+    dur = c(120, 140, 160),
+    rev = c(12, 14, 16),
+    cores = c(2, 8, 4)
+  )
+  west <- pl$DataFrame(
+    t_id = c(404, 498, 676, 742),
+    dur = c(90, 130, 150, 170),
+    rev = c(9, 13, 15, 16),
+    cores = c(4, 2, 1, 4)
+  )
+
+  expect_query_equal(
+    .input$join_where(
+      .input2,
+      pl$col("dur") < pl$col("dur_right"),
+      pl$col("rev") < pl$col("rev_right")
+    ),
+    .input = east, .input2 = west,
+    pl$DataFrame(
+      id = rep(c(100, 101), 3:2),
+      dur = rep(c(120, 140), 3:2),
+      rev = rep(c(12, 14), 3:2),
+      cores = rep(c(2, 8), 3:2),
+      t_id = c(498, 676, 742, 676, 742),
+      dur_right = c(130, 150, 170, 150, 170),
+      rev_right = c(13, 15, 16, 15, 16),
+      cores_right = c(2, 1, 4, 1, 4)
+    )
+  )
+})
+
+test_that("join_asof", {
+  l_gdp <- pl$DataFrame(
+    date = as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1")),
+    gdp = c(4164, 4411, 4566, 4696),
+    group = c("a", "a", "b", "b")
+  )$sort("date")
+
+  l_pop <- pl$DataFrame(
+    date = as.Date(c("2016-5-12", "2017-5-12", "2018-5-12", "2019-5-12")),
+    population = c(82.19, 82.66, 83.12, 83.52),
+    group_right = c("b", "b", "a", "a")
+  )$sort("date")
+
+  # strategy param
+  expect_query_equal(
+    .input$join_asof(.input2, on = "date", strategy = "backward"),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = c(NA, 82.19, 82.66, 83.12),
+      group_right = c(NA, "b", "b", "a")
+    )
+  )
+  expect_query_equal(
+    .input$join_asof(.input2, on = "date", strategy = "forward"),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = c(82.19, 82.66, 83.12, 83.52),
+      group_right = c("b", "b", "a", "a")
+    )
+  )
+  expect_snapshot(
+    l_gdp$lazy()$join_asof(l_pop$lazy(), on = "date", strategy = "fruitcake"),
+    error = TRUE
+  )
+
+  # left_on / right_on
+  expect_query_equal(
+    .input$join_asof(.input2, left_on = "date", right_on = "date", strategy = "forward"),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = c(82.19, 82.66, 83.12, 83.52),
+      group_right = c("b", "b", "a", "a")
+    )
+  )
+
+  # test by
+  expect_query_equal(
+    .input$join_asof(
+      .input2,
+      on = "date", by_left = "group",
+      by_right = "group_right", strategy = "backward"
+    ),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-01-01", "2017-01-01", "2018-01-01", "2019-01-01")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = c(NA, NA, 82.66, 82.66),
+    )
+  )
+  expect_query_equal(
+    .input$join_asof(
+      .input2,
+      on = "date", by_left = "group",
+      by_right = "group_right", strategy = "forward"
+    ),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-01-01", "2017-01-01", "2018-01-01", "2019-01-01")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = c(83.12, 83.12, NA, NA),
+    )
+  )
+
+  # tolerance exceeding 18w
+  expect_query_equal(
+    .input$join_asof(.input2, on = "date", strategy = "backward", tolerance = "18w"),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = rep(NA_real_, 4),
+      group_right = rep(NA_character_, 4)
+    )
+  )
+  expect_query_equal(
+    .input$join_asof(.input2, on = "date", strategy = "backward", tolerance = 18 * 7),
+    .input = l_gdp, .input2 = l_pop,
+    pl$DataFrame(
+      date = as.Date(c("2016-1-1", "2017-1-1", "2018-1-1", "2019-1-1")),
+      gdp = c(4164, 4411, 4566, 4696),
+      group = c("a", "a", "b", "b"),
+      population = rep(NA_real_, 4),
+      group_right = rep(NA_character_, 4)
+    )
+  )
+})

From e54253e7c3b82a0b9e441f0fa2e3771cbf67a7c5 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Fri, 29 Nov 2024 23:37:37 +0100
Subject: [PATCH 37/71] test unique, fill_null

---
 R/000-wrappers.R                      |  8 +++++
 src/init.c                            |  6 ++++
 src/rust/api.h                        |  1 +
 src/rust/src/lazyframe/general.rs     |  5 +++
 tests/testthat/test-lazyframe-frame.R | 52 +++++++++++++++++++++++++++
 5 files changed, 72 insertions(+)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index 6dd19950..02212926 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -2560,6 +2560,13 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   }
 }
 
+`PlRLazyFrame_fill_null` <- function(self) {
+  function(`fill_value`) {
+    `fill_value` <- .savvy_extract_ptr(`fill_value`, "PlRExpr")
+    .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_fill_null__impl, `self`, `fill_value`))
+  }
+}
+
 `PlRLazyFrame_min` <- function(self) {
   function() {
     .savvy_wrap_PlRLazyFrame(.Call(savvy_PlRLazyFrame_min__impl, `self`))
@@ -2713,6 +2720,7 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
   e$`reverse` <- `PlRLazyFrame_reverse`(ptr)
   e$`shift` <- `PlRLazyFrame_shift`(ptr)
   e$`fill_nan` <- `PlRLazyFrame_fill_nan`(ptr)
+  e$`fill_null` <- `PlRLazyFrame_fill_null`(ptr)
   e$`min` <- `PlRLazyFrame_min`(ptr)
   e$`max` <- `PlRLazyFrame_max`(ptr)
   e$`sum` <- `PlRLazyFrame_sum`(ptr)
diff --git a/src/init.c b/src/init.c
index 2d2aa76d..234b94ee 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1764,6 +1764,11 @@ SEXP savvy_PlRLazyFrame_fill_nan__impl(SEXP self__, SEXP c_arg__fill_value) {
     return handle_result(res);
 }
 
+SEXP savvy_PlRLazyFrame_fill_null__impl(SEXP self__, SEXP c_arg__fill_value) {
+    SEXP res = savvy_PlRLazyFrame_fill_null__ffi(self__, c_arg__fill_value);
+    return handle_result(res);
+}
+
 SEXP savvy_PlRLazyFrame_min__impl(SEXP self__) {
     SEXP res = savvy_PlRLazyFrame_min__ffi(self__);
     return handle_result(res);
@@ -2422,6 +2427,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_reverse__impl", (DL_FUNC) &savvy_PlRLazyFrame_reverse__impl, 1},
     {"savvy_PlRLazyFrame_shift__impl", (DL_FUNC) &savvy_PlRLazyFrame_shift__impl, 3},
     {"savvy_PlRLazyFrame_fill_nan__impl", (DL_FUNC) &savvy_PlRLazyFrame_fill_nan__impl, 2},
+    {"savvy_PlRLazyFrame_fill_null__impl", (DL_FUNC) &savvy_PlRLazyFrame_fill_null__impl, 2},
     {"savvy_PlRLazyFrame_min__impl", (DL_FUNC) &savvy_PlRLazyFrame_min__impl, 1},
     {"savvy_PlRLazyFrame_max__impl", (DL_FUNC) &savvy_PlRLazyFrame_max__impl, 1},
     {"savvy_PlRLazyFrame_sum__impl", (DL_FUNC) &savvy_PlRLazyFrame_sum__impl, 1},
diff --git a/src/rust/api.h b/src/rust/api.h
index eafeea19..c3fa97bf 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -356,6 +356,7 @@ SEXP savvy_PlRLazyFrame_rename__ffi(SEXP self__, SEXP c_arg__existing, SEXP c_ar
 SEXP savvy_PlRLazyFrame_reverse__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_shift__ffi(SEXP self__, SEXP c_arg__n, SEXP c_arg__fill_value);
 SEXP savvy_PlRLazyFrame_fill_nan__ffi(SEXP self__, SEXP c_arg__fill_value);
+SEXP savvy_PlRLazyFrame_fill_null__ffi(SEXP self__, SEXP c_arg__fill_value);
 SEXP savvy_PlRLazyFrame_min__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_max__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_sum__ffi(SEXP self__);
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index 21de28a6..c9a21f04 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -688,6 +688,11 @@ impl PlRLazyFrame {
         Ok(ldf.fill_nan(fill_value.inner.clone()).into())
     }
 
+    fn fill_null(&self, fill_value: &PlRExpr) -> Result<PlRLazyFrame> {
+        let ldf = self.ldf.clone();
+        Ok(ldf.fill_null(fill_value.inner.clone()).into())
+    }
+
     fn min(&self) -> Result<PlRLazyFrame> {
         let ldf = self.ldf.clone();
         let out = ldf.min();
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index a2477bf5..b24f1d27 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -931,3 +931,55 @@ test_that("join_asof", {
     )
   )
 })
+
+test_that("fill_null", {
+  df <- pl$DataFrame(
+    a = c(1.5, 2, NA, 4),
+    b = c(1.5, NA, NA, 4)
+  )
+  expect_query_equal(
+    .input$fill_null(99),
+    df,
+    pl$DataFrame(
+      a = c(1.5, 2, 99, 4),
+      b = c(1.5, 99, 99, 4)
+    )
+  )
+})
+
+test_that("unique", {
+  df <- pl$DataFrame(
+    x = c(1, 1, 2, 3),
+    y = c(1, 1, 3, 3),
+    z = c("a", "b", "c", "d")
+  )
+  expect_query_equal(
+    .input$unique("x")$sort("x"),
+    df,
+    pl$DataFrame(x = c(1, 2, 3), y = c(1, 3, 3), z = c("a", "c", "d"))
+  )
+  expect_query_equal(
+    .input$unique("x", keep = "first")$sort("x"),
+    df,
+    pl$DataFrame(x = c(1, 2, 3), y = c(1, 3, 3), z = c("a", "c", "d"))
+  )
+  expect_query_equal(
+    .input$unique("x", keep = "last")$sort("x"),
+    df,
+    pl$DataFrame(x = c(1, 2, 3), y = c(1, 3, 3), z = c("b", "c", "d"))
+  )
+  expect_query_equal(
+    .input$unique("x", keep = "none")$sort("x"),
+    df,
+    pl$DataFrame(x = c(2, 3), y = c(3, 3), z = c("c", "d"))
+  )
+})
+
+test_that("unique: maintain_order", {
+  df <- pl$DataFrame(x = rep(1:100, each = 2), y = 1:200)
+  expect_query_equal(
+    .input$unique("x", maintain_order = TRUE),
+    df,
+    pl$DataFrame(x = 1:100, y = seq(1, 200, 2))$cast(y = pl$Int32)
+  )
+})

From 9603ad899b52bfcf170b31ea1c337d264eddfdf3 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 30 Nov 2024 00:04:16 +0100
Subject: [PATCH 38/71] tests fill_nan, drop, drop_nulls, filter

---
 R/dataframe-frame.R                   |   4 +-
 R/lazyframe-frame.R                   |  10 +-
 man/dataframe__drop.Rd                |   2 +-
 man/lazyframe__drop.Rd                |   4 +-
 tests/testthat/test-lazyframe-frame.R | 192 +++++++++++++++++++++++++-
 5 files changed, 201 insertions(+), 11 deletions(-)

diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R
index 3994cfcf..d397728a 100644
--- a/R/dataframe-frame.R
+++ b/R/dataframe-frame.R
@@ -419,8 +419,8 @@ dataframe__tail <- function(n = 5) {
 #'
 #' # equivalent
 #' as_polars_df(mtcars)$drop("mpg", "hp")
-dataframe__drop <- function(..., strict = TRUE) {
-  self$lazy()$drop(..., strict = strict)$collect(`_eager` = TRUE) |>
+dataframe__drop <- function(..., .strict = TRUE) {
+  self$lazy()$drop(..., .strict = .strict)$collect(`_eager` = TRUE) |>
     wrap()
 }
 
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index bc2ccb87..ceebe249 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -733,7 +733,7 @@ lazyframe__with_columns_seq <- function(...) {
 #'
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that
 #' should be removed from the dataframe. Accepts column selector input.
-#' @param strict Validate that all column names exist in the current schema,
+#' @param .strict Validate that all column names exist in the current schema,
 #' and throw an exception if any do not.
 #'
 #' @inherit as_polars_lf return
@@ -749,11 +749,11 @@ lazyframe__with_columns_seq <- function(...) {
 #'
 #' # Drop multiple columns by passing a selector
 #' lf$drop(cs$all())$collect()
-lazyframe__drop <- function(..., strict = TRUE) {
+lazyframe__drop <- function(..., .strict = TRUE) {
   wrap({
     check_dots_unnamed()
     parse_into_list_of_expressions(...) |>
-      self$`_ldf`$drop(strict)
+      self$`_ldf`$drop(.strict)
   })
 }
 
@@ -1067,7 +1067,9 @@ lazyframe__tail <- function(n = 5L) {
 #' lf$drop_nulls(subset = cs$integer())$collect()
 lazyframe__drop_nulls <- function(subset = NULL) {
   wrap({
-    subset <- parse_into_list_of_expressions(!!!subset)
+    if (!is.null(subset)) {
+      subset <- parse_into_list_of_expressions(!!!subset)
+    }
     self$`_ldf`$drop_nulls(subset)
   })
 }
diff --git a/man/dataframe__drop.Rd b/man/dataframe__drop.Rd
index b979acc5..4c4d313f 100644
--- a/man/dataframe__drop.Rd
+++ b/man/dataframe__drop.Rd
@@ -4,7 +4,7 @@
 \alias{dataframe__drop}
 \title{Drop columns of a DataFrame}
 \usage{
-dataframe__drop(..., strict = TRUE)
+dataframe__drop(..., .strict = TRUE)
 }
 \arguments{
 \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Characters of column names to
diff --git a/man/lazyframe__drop.Rd b/man/lazyframe__drop.Rd
index 95552531..ca33049d 100644
--- a/man/lazyframe__drop.Rd
+++ b/man/lazyframe__drop.Rd
@@ -4,13 +4,13 @@
 \alias{lazyframe__drop}
 \title{Remove columns from the DataFrame}
 \usage{
-lazyframe__drop(..., strict = TRUE)
+lazyframe__drop(..., .strict = TRUE)
 }
 \arguments{
 \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Names of the columns that
 should be removed from the dataframe. Accepts column selector input.}
 
-\item{strict}{Validate that all column names exist in the current schema,
+\item{.strict}{Validate that all column names exist in the current schema,
 and throw an exception if any do not.}
 }
 \value{
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index b24f1d27..a07eba34 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -932,16 +932,31 @@ test_that("join_asof", {
   )
 })
 
+test_that("fill_nan", {
+  df <- pl$DataFrame(
+    a = c(1.5, 2, NaN, NA),
+    b = c(1.5, NaN, NaN, 4)
+  )
+  expect_query_equal(
+    .input$fill_nan(99),
+    df,
+    pl$DataFrame(
+      a = c(1.5, 2, 99, NA),
+      b = c(1.5, 99, 99, 4)
+    )
+  )
+})
+
 test_that("fill_null", {
   df <- pl$DataFrame(
-    a = c(1.5, 2, NA, 4),
+    a = c(1.5, 2, NA, NaN),
     b = c(1.5, NA, NA, 4)
   )
   expect_query_equal(
     .input$fill_null(99),
     df,
     pl$DataFrame(
-      a = c(1.5, 2, 99, 4),
+      a = c(1.5, 2, 99, NaN),
       b = c(1.5, 99, 99, 4)
     )
   )
@@ -983,3 +998,176 @@ test_that("unique: maintain_order", {
     pl$DataFrame(x = 1:100, y = seq(1, 200, 2))$cast(y = pl$Int32)
   )
 })
+
+test_that("drop_nulls", {
+  df <- pl$DataFrame(x = c(1, NA, 2), y = c(NA, 1, 2))
+  expect_query_equal(
+    .input$drop_nulls(),
+    df,
+    pl$DataFrame(x = 2, y = 2)
+  )
+  expect_query_equal(
+    .input$drop_nulls(c("x", "y")),
+    df,
+    pl$DataFrame(x = 2, y = 2)
+  )
+  expect_query_equal(
+    .input$drop_nulls("x"),
+    df,
+    pl$DataFrame(x = c(1, 2), y = c(NA, 2))
+  )
+})
+
+test_that("drop", {
+  df <- pl$DataFrame(x = c(1, NA, 2), y = c(NA, 1, 2))
+  expect_query_equal(
+    .input$drop("x"),
+    df,
+    pl$DataFrame(y = c(NA, 1, 2))
+  )
+  expect_query_equal(
+    .input$drop("x", "y"),
+    df,
+    pl$DataFrame()
+  )
+
+  # arg 'strict' works
+  expect_query_error(
+    .input$drop("foo"),
+    df,
+    r"("foo" not found)"
+  )
+  expect_query_equal(
+    .input$drop("foo", .strict = FALSE),
+    df,
+    df
+  )
+})
+
+test_that("quantile", {
+  df <- pl$DataFrame(x = c(1, 2, 3, 1, 5, 6), y = 1:6)
+  expect_query_equal(
+    .input$quantile(1),
+    df,
+    pl$DataFrame(x = 6, y = 6)
+  )
+  expect_query_equal(
+    .input$quantile(0.5),
+    df,
+    pl$DataFrame(x = 3, y = 4)
+  )
+  expect_query_equal(
+    .input$quantile(0.5, "higher"),
+    df,
+    pl$DataFrame(x = 3, y = 4)
+  )
+  expect_query_equal(
+    .input$quantile(0.5, "lower"),
+    df,
+    pl$DataFrame(x = 2, y = 3)
+  )
+  expect_query_equal(
+    .input$quantile(0.5, "midpoint"),
+    df,
+    pl$DataFrame(x = 2.5, y = 3.5)
+  )
+  expect_query_equal(
+    .input$quantile(0.5, "linear"),
+    df,
+    pl$DataFrame(x = 2.5, y = 3.5)
+  )
+})
+
+
+test_that("lazy filter", {
+  df <- pl$DataFrame(
+    x = c(1, 2, 3, 4, 5),
+    y = letters[1:5],
+    z = c(TRUE, TRUE, FALSE, TRUE, FALSE)
+  )
+
+  # using ==
+  expect_query_equal(
+    .input$filter(pl$col("x") == 1),
+    df,
+    pl$DataFrame(x = 1, y = "a", z = TRUE)
+  )
+  expect_query_equal(
+    .input$filter(pl$col("z")),
+    df,
+    pl$DataFrame(x = c(1, 2, 4), y = c("a", "b", "d"), z = c(TRUE, TRUE, TRUE))
+  )
+  expect_query_equal(
+    .input$filter(!pl$col("z")),
+    df,
+    pl$DataFrame(x = c(3, 5), y = c("c", "e"), z = c(FALSE, FALSE))
+  )
+
+  # using inequality operators
+  expect_query_equal(
+    .input$filter(pl$col("x") > 4),
+    df,
+    pl$DataFrame(x = 5, y = "e", z = FALSE)
+  )
+  expect_query_equal(
+    .input$filter(pl$col("x") >= 4),
+    df,
+    pl$DataFrame(x = c(4, 5), y = c("d", "e"), z = c(TRUE, FALSE))
+  )
+  expect_query_equal(
+    .input$filter(pl$col("x") < 2),
+    df,
+    pl$DataFrame(x = 1, y = "a", z = TRUE)
+  )
+  expect_query_equal(
+    .input$filter(pl$col("x") <= 2),
+    df,
+    pl$DataFrame(x = c(1, 2), y = c("a", "b"), z = c(TRUE, TRUE))
+  )
+  expect_query_equal(
+    .input$filter(pl$col("x") != 3),
+    df,
+    pl$DataFrame(
+      x = c(1, 2, 4, 5),
+      y = c("a", "b", "d", "e"),
+      z = c(TRUE, TRUE, TRUE, FALSE)
+    )
+  )
+
+  # using &
+  expect_query_equal(
+    .input$filter(pl$col("x") <= 3 & pl$col("z")),
+    df,
+    pl$DataFrame(x = c(1, 2), y = c("a", "b"), z = c(TRUE, TRUE))
+  )
+  expect_query_equal(
+    .input$filter(pl$col("x") <= 3, pl$col("z")),
+    df,
+    pl$DataFrame(x = c(1, 2), y = c("a", "b"), z = c(TRUE, TRUE))
+  )
+
+  # using |
+  expect_query_equal(
+    .input$filter(pl$col("x") <= 3 | pl$col("z")),
+    df,
+    pl$DataFrame(
+      x = c(1, 2, 3, 4),
+      y = c("a", "b", "c", "d"),
+      z = c(TRUE, TRUE, FALSE, TRUE)
+    )
+  )
+})
+
+test_that("filter with nulls", {
+  df <- pl$DataFrame(x = c(1, 2, NA))
+  expect_query_equal(
+    .input$filter(pl$col("x") == 1),
+    df,
+    pl$DataFrame(x = 1)
+  )
+  expect_query_equal(
+    .input$filter(pl$col("x")$is_null()),
+    df,
+    pl$DataFrame(x = NA_real_)
+  )
+})

From 077ba1a2e42a7bb09bf2a73e6fe6fadfa8ae3e88 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 30 Nov 2024 11:29:59 +0100
Subject: [PATCH 39/71] redoc

---
 man/lazyframe__fill_nan.Rd     | 24 ++++++++++++++++++++++++
 man/lazyframe__rolling.Rd      |  5 +++++
 man/pl.Rd                      |  2 +-
 src/rust/src/conversion/mod.rs | 23 +++--------------------
 4 files changed, 33 insertions(+), 21 deletions(-)
 create mode 100644 man/lazyframe__fill_nan.Rd

diff --git a/man/lazyframe__fill_nan.Rd b/man/lazyframe__fill_nan.Rd
new file mode 100644
index 00000000..05e05a2c
--- /dev/null
+++ b/man/lazyframe__fill_nan.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lazyframe-frame.R
+\name{lazyframe__fill_nan}
+\alias{lazyframe__fill_nan}
+\title{Fill floating point \code{NaN} value with a fill value}
+\usage{
+lazyframe__fill_nan(value)
+}
+\arguments{
+\item{value}{Value used to fill \code{NaN} values.}
+}
+\value{
+A polars \link{LazyFrame}
+}
+\description{
+Fill floating point \code{NaN} value with a fill value
+}
+\examples{
+lf <- pl$LazyFrame(
+  a = c(1.5, 2, NaN, 4),
+  b = c(1.5, NaN, NaN, 4)
+)
+lf$fill_nan(99)$collect()
+}
diff --git a/man/lazyframe__rolling.Rd b/man/lazyframe__rolling.Rd
index dec217e3..62768257 100644
--- a/man/lazyframe__rolling.Rd
+++ b/man/lazyframe__rolling.Rd
@@ -56,6 +56,11 @@ whereas if you pass a non-default \code{offset}, then the windows will be:
 \item \verb{(t_n + offset, t_n + offset + period]}
 }
 }
+\details{
+If you want to compute multiple aggregation statistics over the same dynamic
+window, consider using \code{\link[=expr__rolling]{$rolling()}} - this method can cache
+the window size computation.
+}
 \examples{
 dates <- c(
   "2020-01-01 13:45:48",
diff --git a/man/pl.Rd b/man/pl.Rd
index 327dee16..33e6b48d 100644
--- a/man/pl.Rd
+++ b/man/pl.Rd
@@ -5,7 +5,7 @@
 \alias{pl}
 \title{Polars top-level function namespace}
 \format{
-An object of class \code{polars_object} of length 74.
+An object of class \code{polars_object} of length 75.
 }
 \usage{
 pl
diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index 91465e1f..e2d60dc6 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -620,15 +620,15 @@ impl TryFrom<&str> for Wrap<Roll> {
 impl TryFrom<&str> for Wrap<QuantileMethod> {
     type Error = String;
 
-    fn try_from(interpolation: &str) -> Result<Self, String> {
-        let parsed = match interpolation {
+    fn try_from(roll: &str) -> Result<Self, String> {
+        let parsed = match roll {
             "nearest" => QuantileMethod::Nearest,
             "higher" => QuantileMethod::Higher,
             "lower" => QuantileMethod::Lower,
             "midpoint" => QuantileMethod::Midpoint,
             "linear" => QuantileMethod::Linear,
             "equiprobable" => QuantileMethod::Equiprobable,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -824,23 +824,6 @@ impl TryFrom<&str> for Wrap<AsofStrategy> {
     }
 }
 
-impl TryFrom<&str> for Wrap<QuantileMethod> {
-    type Error = String;
-
-    fn try_from(roll: &str) -> Result<Self, String> {
-        let parsed = match roll {
-            "nearest" => QuantileMethod::Nearest,
-            "higher" => QuantileMethod::Higher,
-            "lower" => QuantileMethod::Lower,
-            "midpoint" => QuantileMethod::Midpoint,
-            "linear" => QuantileMethod::Linear,
-            "equiprobable" => QuantileMethod::Equiprobable,
-            _ => return Err("unreachable".to_string()),
-        };
-        Ok(Wrap(parsed))
-    }
-}
-
 impl TryFrom<&str> for Wrap<CsvEncoding> {
     type Error = String;
 

From 96fec3033ff8b668d1394498f9a23351dd5da4a0 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 30 Nov 2024 12:11:50 +0100
Subject: [PATCH 40/71] more tests for join

---
 R/lazyframe-frame.R                   |   2 +-
 src/rust/src/lazyframe/general.rs     |   1 +
 tests/testthat/test-lazyframe-frame.R | 178 +++++++++++++++++++++++++-
 3 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index ceebe249..1b0a7bc6 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1226,7 +1226,7 @@ lazyframe__join <- function(
       }
       return(
         self$`_ldf`$join(
-          other$`_ldf`, as.list(NULL), as.list(NULL),
+          other$`_ldf`, list(), list(),
           how = how, validate = validate,
           join_nulls = join_nulls, suffix = suffix,
           allow_parallel = allow_parallel, force_parallel = force_parallel,
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index c9a21f04..a797e015 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -616,6 +616,7 @@ impl PlRLazyFrame {
         let other = other.ldf.clone();
         let left_on = <Wrap<Vec<Expr>>>::from(left_on).0;
         let right_on = <Wrap<Vec<Expr>>>::from(right_on).0;
+
         let how = <Wrap<JoinType>>::try_from(how)?.0;
         let validate = <Wrap<JoinValidation>>::try_from(validate)?.0;
         Ok(ldf
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index a07eba34..bd1f873d 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -147,19 +147,18 @@ test_that("shift", {
   )
 })
 
-# TODO-REWRITE: add $join() for DataFrame
 test_that("join", {
   df <- pl$DataFrame(
     foo = 1:3,
     bar = c(6, 7, 8),
     ham = c("a", "b", "c")
   )
-
   other_df <- pl$DataFrame(
     apple = c("x", "y", "z"),
     ham = c("a", "b", "d")
   )
 
+  # inner default
   expect_query_equal(
     .input$join(.input2, on = "ham"),
     .input = df, .input2 = other_df,
@@ -170,8 +169,183 @@ test_that("join", {
       apple = c("x", "y")
     )
   )
+
+  # outer
+  expect_query_equal(
+    .input$join(.input2, on = "ham", how = "full"),
+    .input = df, .input2 = other_df,
+    pl$DataFrame(
+      foo = c(1L, 2L, NA, 3L),
+      bar = c(6, 7, NA, 8),
+      ham = c("a", "b", NA, "c"),
+      apple = c("x", "y", "z", NA),
+      ham_right = c("a", "b", "d", NA)
+    )
+  )
+
+  # error on invalid 'how'
+  expect_query_error(
+    df$lazy()$join(other_df$lazy(), on = "ham", how = "foobar"),
+    "must be one of"
+  )
+  expect_query_error(
+    df$lazy()$join(other_df$lazy(), on = "ham", how = 42),
+    "must be a string or character vector"
+  )
+  # 'other' must be of same class
+  expect_query_error(
+    df$lazy()$join(other_df, on = "ham"),
+    "must be a polars lazy frame"
+  )
 })
 
+test_that("right join works", {
+  a <- pl$DataFrame(a = c(1, 2, 3), b = c(1, 2, 4))
+  b <- pl$DataFrame(a = c(1, 3), b = c(1, 3), c = c(1, 3))
+  expect_query_equal(
+    .input$join(.input2, on = "a", how = "right", coalesce = TRUE),
+    .input = a, .input2 = b,
+    pl$DataFrame(
+      b = c(1, 4),
+      a = c(1, 3),
+      b_right = c(1, 3),
+      c = c(1, 3)
+    )
+  )
+  expect_query_equal(
+    .input$join(.input2, on = "a", how = "right", coalesce = TRUE) |>
+      names(),
+    .input = a, .input2 = b,
+    c("a", "b", "a_right", "b_right", "c")
+  )
+})
+
+test_that("semi and anti join", {
+  df_a <- pl$DataFrame(key = 1:3, payload = c("f", "i", NA))
+  df_b <- pl$DataFrame(key = c(3L, 4L, 5L, NA))
+
+  expect_query_equal(
+    .input$join(.input2, on = "key", how = "anti"),
+    .input = df_a, .input2 = df_b,
+    pl$DataFrame(key = 1:2, payload = c("f", "i"))
+  )
+  expect_query_equal(
+    .input$join(.input2, on = "key", how = "semi"),
+    .input = df_a, .input2 = df_b,
+    pl$DataFrame(key = 3L, payload = NA_character_)
+  )
+
+  df_a <- pl$DataFrame(a = c(1:3, 1L), b = c("a", "b", "c", "a"), payload = c(10L, 20L, 30L, 40L))
+  df_b <- pl$DataFrame(a = c(3L, 3L, 4L, 5L), b = c("c", "c", "d", "e"))
+
+  expect_query_equal(
+    .input$join(.input2, on = c("a", "b"), how = "anti"),
+    .input = df_a, .input2 = df_b,
+    pl$DataFrame(a = c(1:2, 1L), b = c("a", "b", "a"), payload = c(10L, 20L, 40L))
+  )
+  expect_query_equal(
+    .input$join(.input2, on = c("a", "b"), how = "semi"),
+    .input = df_a, .input2 = df_b,
+    pl$DataFrame(a = 3L, b = "c", payload = 30L)
+  )
+})
+
+# TODO-REWRITE: panics
+# test_that("cross join", {
+#   dat <- pl$DataFrame(x = letters[1:3])
+#   dat2 <- pl$DataFrame(y = 1:4)
+
+#   expect_query_equal(
+#     .input$join(.input2, how = "cross"),
+#     .input = dat, .input2 = dat2,
+#     pl$DataFrame(
+#       x = rep(letters[1:3], each = 4),
+#       y = rep(1:4, 3)
+#     )
+#   )
+
+#   expect_query_error(
+#     dat$lazy()$join(.input2$lazy(), how = "cross", on = "foo"),
+#     "cross join should not pass join keys"
+#   )
+#   expect_query_error(
+#     dat$lazy()$join(.input2$lazy(), how = "cross", left_on = "foo", right_on = "foo2"),
+#     "cross join should not pass join keys"
+#   )
+
+#   # one empty dataframe
+#   dat_empty <- pl$DataFrame(y = character())
+#   expect_query_equal(
+#     .input$join(dat_empty, how = "cross"),
+#     .input = dat, .input2 = dat2,
+#     pl$DataFrame(x = character(), y = character())
+#   )
+#   expect_query_equal(
+#     dat_empty$join(dat, how = "cross"),
+#     pl$DataFrame(y = character(), x = character())
+#   )
+
+#   # suffix works
+#   expect_query_equal(
+#     .input$join(.input, how = "cross"),
+#     .input = dat,
+#     pl$DataFrame(
+#       x = rep(letters[1:3], each = 3),
+#       x_right = rep(letters[1:3], 3)
+#     )
+#   )
+# })
+
+test_that("argument 'validate' works", {
+  df1 <- pl$DataFrame(x = letters[1:5], y = 1:5)
+  df2 <- pl$DataFrame(x = c("a", letters[1:4]), y2 = 6:10)
+
+  expect_query_error(
+    df1$lazy()$join(df2$lazy(), on = "x", validate = "1:1"),
+    "join keys did not fulfill 1:1 validation"
+  )
+  expect_query_error(
+    df1$lazy()$join(df2$lazy(), on = "x", validate = "m:1"),
+    "join keys did not fulfill m:1 validation"
+  )
+  expect_query_error(
+    df2$lazy()$join(df1$lazy(), on = "x", validate = "1:m"),
+    "join keys did not fulfill 1:m validation"
+  )
+  expect_query_error(
+    df2$lazy()$join(df1$lazy(), on = "x", validate = "foobar"),
+    "must be one of"
+  )
+})
+
+test_that("argument 'join_nulls' works", {
+  df1 <- pl$DataFrame(x = c(NA, letters[1:2]), y = 1:3)
+  df2 <- pl$DataFrame(x = c(NA, letters[2:3]), y2 = 4:6)
+
+  # discard nulls by default
+  expect_query_equal(
+    .input$join(.input2, on = "x"),
+    .input = df1, .input2 = df2,
+    pl$DataFrame(x = "b", y = 3L, y2 = 5L)
+  )
+
+  # consider nulls as a valid key
+  expect_query_equal(
+    .input$join(.input2, on = "x", join_nulls = TRUE),
+    .input = df1, .input2 = df2,
+    pl$DataFrame(x = c(NA, "b"), y = c(1L, 3L), y2 = c(4L, 5L))
+  )
+
+  # several nulls
+  df3 <- pl$DataFrame(x = c(NA, letters[2:3], NA), y2 = 4:7)
+  expect_query_equal(
+    .input$join(.input2, on = "x", join_nulls = TRUE),
+    .input = df1, .input2 = df3,
+    pl$DataFrame(x = c(NA, "b", NA), y = c(1L, 3L, 1L), y2 = c(4L, 5L, 7L))
+  )
+})
+
+
 test_that("sort()", {
   expect_query_error(
     .input$sort(complex(1)),

From 2995da23964add2c6a376653f174bee14c7a7a80 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sat, 30 Nov 2024 12:26:56 +0100
Subject: [PATCH 41/71] test for unnest

---
 tests/testthat/test-lazyframe-frame.R | 33 +++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index bd1f873d..826caa53 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -1345,3 +1345,36 @@ test_that("filter with nulls", {
     pl$DataFrame(x = NA_real_)
   )
 })
+
+test_that("unnest", {
+  df <- pl$DataFrame(
+    a = 1:5,
+    b = c("one", "two", "three", "four", "five"),
+    c = rep(TRUE, 5),
+    d = rep(42.0, 5),
+    e = rep(NaN, 5),
+    f = rep(NA_real_, 5)
+  )
+
+  df2 <- df$
+    select(
+    pl$struct(c("a", "b", "c"))$alias("first_struct"),
+    pl$struct(c("d", "e", "f"))$alias("second_struct")
+  )
+
+  expect_query_equal(
+    .input$unnest("first_struct", "second_struct"),
+    .input = df2,
+    df
+  )
+
+  expect_query_equal(
+    .input$unnest("first_struct"),
+    .input = df2,
+    df$
+      select(
+      pl$col("a", "b", "c"),
+      pl$struct(c("d", "e", "f"))$alias("second_struct")
+    )
+  )
+})

From 0245b952e947de1352b387a2a152628d4cb9b599 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Sat, 30 Nov 2024 12:30:19 +0100
Subject: [PATCH 42/71] docs: fix math notation in `emw_` functions

---
 R/expr-expr.R         | 4 ++--
 man/expr__ewm_mean.Rd | 4 ++--
 man/expr__ewm_std.Rd  | 4 ++--
 man/expr__ewm_var.Rd  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/R/expr-expr.R b/R/expr-expr.R
index dcfe8f28..3336c9b1 100644
--- a/R/expr-expr.R
+++ b/R/expr-expr.R
@@ -3322,8 +3322,8 @@ expr__rolling_var_by <- function(
 #' account for imbalance in relative weightings:
 #' * when `TRUE` (default), the EW function is calculated using weights
 #'  \eqn{w_i = (1 - \alpha)^i};
-#' * when `FALSE`, the EW function is calculated recursively by \deqn{y_0 &= x_0
-#'  \\y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t}
+#' * when `FALSE`, the EW function is calculated recursively by \deqn{y_0 = x_0
+#'  ; y_t = (1 - \alpha)y_{t - 1} + \alpha x_t}
 #' @param bias If `FALSE` (default), apply a correction to make the estimate
 #' statistically unbiased.
 #' @param ignore_nulls Ignore missing values when calculating weights.
diff --git a/man/expr__ewm_mean.Rd b/man/expr__ewm_mean.Rd
index fe00e4b2..5bf0ff4d 100644
--- a/man/expr__ewm_mean.Rd
+++ b/man/expr__ewm_mean.Rd
@@ -36,8 +36,8 @@ account for imbalance in relative weightings:
 \itemize{
 \item when \code{TRUE} (default), the EW function is calculated using weights
 \eqn{w_i = (1 - \alpha)^i};
-\item when \code{FALSE}, the EW function is calculated recursively by \deqn{y_0 &= x_0
- \\y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t}
+\item when \code{FALSE}, the EW function is calculated recursively by \deqn{y_0 = x_0
+ ; y_t = (1 - \alpha)y_{t - 1} + \alpha x_t}
 }}
 
 \item{min_periods}{The number of values in the window that should be
diff --git a/man/expr__ewm_std.Rd b/man/expr__ewm_std.Rd
index cf06831b..5b0add64 100644
--- a/man/expr__ewm_std.Rd
+++ b/man/expr__ewm_std.Rd
@@ -37,8 +37,8 @@ account for imbalance in relative weightings:
 \itemize{
 \item when \code{TRUE} (default), the EW function is calculated using weights
 \eqn{w_i = (1 - \alpha)^i};
-\item when \code{FALSE}, the EW function is calculated recursively by \deqn{y_0 &= x_0
- \\y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t}
+\item when \code{FALSE}, the EW function is calculated recursively by \deqn{y_0 = x_0
+ ; y_t = (1 - \alpha)y_{t - 1} + \alpha x_t}
 }}
 
 \item{bias}{If \code{FALSE} (default), apply a correction to make the estimate
diff --git a/man/expr__ewm_var.Rd b/man/expr__ewm_var.Rd
index b8cff09e..4faf9073 100644
--- a/man/expr__ewm_var.Rd
+++ b/man/expr__ewm_var.Rd
@@ -37,8 +37,8 @@ account for imbalance in relative weightings:
 \itemize{
 \item when \code{TRUE} (default), the EW function is calculated using weights
 \eqn{w_i = (1 - \alpha)^i};
-\item when \code{FALSE}, the EW function is calculated recursively by \deqn{y_0 &= x_0
- \\y_t &= (1 - \alpha)y_{t - 1} + \alpha x_t}
+\item when \code{FALSE}, the EW function is calculated recursively by \deqn{y_0 = x_0
+ ; y_t = (1 - \alpha)y_{t - 1} + \alpha x_t}
 }}
 
 \item{bias}{If \code{FALSE} (default), apply a correction to make the estimate

From 72493c5cd2a63bb13b81439424b390831092bd6a Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Sun, 1 Dec 2024 17:32:05 +0100
Subject: [PATCH 43/71] most comments

---
 R/dataframe-frame.R                   |   4 +-
 R/lazyframe-frame.R                   |  56 +++++++++++---------------
 foo.parquet                           | Bin 0 -> 18052 bytes
 man/dataframe__drop.Rd                |   2 +-
 man/lazyframe__drop.Rd                |   4 +-
 src/rust/src/lazyframe/general.rs     |   2 +-
 tests/testthat/test-lazyframe-frame.R |   2 +-
 7 files changed, 31 insertions(+), 39 deletions(-)
 create mode 100644 foo.parquet

diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R
index d397728a..3994cfcf 100644
--- a/R/dataframe-frame.R
+++ b/R/dataframe-frame.R
@@ -419,8 +419,8 @@ dataframe__tail <- function(n = 5) {
 #'
 #' # equivalent
 #' as_polars_df(mtcars)$drop("mpg", "hp")
-dataframe__drop <- function(..., .strict = TRUE) {
-  self$lazy()$drop(..., .strict = .strict)$collect(`_eager` = TRUE) |>
+dataframe__drop <- function(..., strict = TRUE) {
+  self$lazy()$drop(..., strict = strict)$collect(`_eager` = TRUE) |>
     wrap()
 }
 
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 1b0a7bc6..3490a638 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -276,12 +276,12 @@ lazyframe__collect <- function(
 #'   pl$col("bar")$cast(pl$Int64)
 #' )$collect_schema()
 lazyframe__collect_schema <- function() {
-  wrap({
-    lapply(self$`_ldf`$collect_schema(), function(x) {
+  self$`_ldf`$collect_schema() |>
+    lapply(function(x) {
       .savvy_wrap_PlRDataType(x) |>
         wrap()
-    })
-  })
+    }) |>
+    wrap()
 }
 
 #' Collect and profile a lazy query.
@@ -471,15 +471,6 @@ lazyframe__explain <- function(
   })
 }
 
-lazyframe__collect_schema <- function() {
-  self$`_ldf`$collect_schema() |>
-    lapply(function(x) {
-      .savvy_wrap_PlRDataType(x) |>
-        wrap()
-    }) |>
-    wrap()
-}
-
 #' Cast LazyFrame column(s) to the specified dtype(s)
 #'
 #' This allows to convert all columns to a datatype or to convert only specific
@@ -595,22 +586,23 @@ lazyframe__sort <- function(
     nulls_last = FALSE,
     multithreaded = TRUE,
     maintain_order = FALSE) {
-  check_dots_unnamed()
-
-  by <- parse_into_list_of_expressions(...)
-  if (length(by) == 0) {
-    abort("`...` must contain at least one element.")
-  }
-  descending <- extend_bool(descending, length(by), "descending", "...")
-  nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...")
-
-  self$`_ldf`$sort_by_exprs(
-    by,
-    descending = descending,
-    nulls_last = nulls_last,
-    multithreaded = multithreaded,
-    maintain_order = maintain_order
-  ) |> wrap()
+  wrap({
+    check_dots_unnamed()
+    by <- parse_into_list_of_expressions(...)
+    if (length(by) == 0) {
+      abort("`...` must contain at least one element.")
+    }
+    descending <- extend_bool(descending, length(by), "descending", "...")
+    nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...")
+
+    self$`_ldf`$sort_by_exprs(
+      by,
+      descending = descending,
+      nulls_last = nulls_last,
+      multithreaded = multithreaded,
+      maintain_order = maintain_order
+    )
+  })
 }
 
 #' Modify/append column(s) of a LazyFrame
@@ -733,7 +725,7 @@ lazyframe__with_columns_seq <- function(...) {
 #'
 #' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Names of the columns that
 #' should be removed from the dataframe. Accepts column selector input.
-#' @param .strict Validate that all column names exist in the current schema,
+#' @param strict Validate that all column names exist in the current schema,
 #' and throw an exception if any do not.
 #'
 #' @inherit as_polars_lf return
@@ -749,11 +741,11 @@ lazyframe__with_columns_seq <- function(...) {
 #'
 #' # Drop multiple columns by passing a selector
 #' lf$drop(cs$all())$collect()
-lazyframe__drop <- function(..., .strict = TRUE) {
+lazyframe__drop <- function(..., strict = TRUE) {
   wrap({
     check_dots_unnamed()
     parse_into_list_of_expressions(...) |>
-      self$`_ldf`$drop(.strict)
+      self$`_ldf`$drop(strict)
   })
 }
 
diff --git a/foo.parquet b/foo.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..cf15bd52001e6eb42eecc37e4c91c8c28bc6cfca
GIT binary patch
literal 18052
zcmd5^dvsLQx!-3plT0R)kjI>znIvLFnDTyP6lk%{8G=wDRY(v-(wYF7fIJd;iCo(|
zpp7Y@iy~o}G;&j}auHK*t|$#YxC2I3;D)TMh%~SwO+hIur7nyt7Sxn=zrDXXubDFk
z)c)Zt8&2k&^V`4k{q}Ev-`;1Zv;9RAL?X5e;<KYgHyp)_@i`O8aC?0n`-qY|e3T{;
zLWaLH?w`4VKM3T81%7i20hd%?mm!^MPnQDe5)spU{F5Oil39t)-?zM4`@%-rMkWQ*
zN?)^07ag8qB;Mn87ME9*tr%NazN%vFQqf7>w9!I41+V>qSPdh?je3UI)*xVrM+qv!
z*9Fpv42e@Bb|9GuB<)VgHz+9~If)c(b(V$Zezd7$UQzG4rW>|dp&`<7+cueC5)z!h
zY>CRRL-4j+I6<R!O$dIS@3x<6XP;~NN(g~qZ#ab4^8?rOPqC9Xd|zYKZ{|D5C@T>P
z?s;Xw3-7I{NUVx;3oGp*yG+nIyL@d_1cBj8HVs34!^p5mA3|*IEHK2Q!zx3YKx&X-
zecg%tv-!S{51+`lk;a$b==&(~;>nAoN|4zkAsbz|_6pvv1dX2hHbkp9Q!FWj&L&4>
z*a(5d!!K*f*H*46bHLI1^3`jWEv{TqxjK>kwrq99vQ^6#m%($SibL3Aq{@9YQ90?3
z$NObeyPuG7KX8jq&;2Am)#FYmt}0)=tb7gcd94DyEO@UZGO9*67`6y+U=PStdY;}J
z;hyKqkbG&G(CsL6KT!&}0)g}-QvAth&eGvlU*eUfq*dhJ$+Jd1<IlE?x5%-K<&BV2
zqpZGVq_{;ek~R%ELNMTCm*5Bb_jAR+?AuhJmj=dgN`VxgFDV%qJ0vHWl@_w)J^Ju$
zI<N1-7tY$uYoRfZKSSDDf@w0<D3|Q9&|bk?JxC+FzI|kBXk*q{8BId2s(lb#(yh3p
zzV6F>-&f!gZkE9HudgPN#<mV2Fob+wJv08?rrJ$T@X4}G3c*+|IhUr1rM_WgxI!OF
z<kf5Bl9d7p$u2pZf8wm~;}eWNe0uZkr1&29lfl|&CLfj3wtp%f9o?(QY-E{`;N9)w
z%wz<uEx&J;>7tBkY|B&`#gIaG#IX>vh1?x~DUe2tAy|gESTX^_5DI4LF0I%zZ_KW)
ztp_)Zv<UGfq1r{gB}s?m9%yKmR>8a1qY0zFO%!;8o*Yt2HS)@<0%-}8OJbSC=fqqR
zNVPjjvAE!#X5XMcZ5YuUOk8zO=L0W~ZV}F<npgx9UAy4z&F1`!nl<DX;GbL20deF3
z7s$Z_er;ik+{&-7y9xFPT$is(BEH1Y50X^(mXKr9tLu&?ANX`jUg=>?CN|0=K(KKz
zCur0z68uJaWP|Jw$Rj7t9zKykN*z@RCGqvaD+lZWAybyq$P|!AF644DM!D+yW{QC!
zjB0FC)I<!@Xmr*?@#!>)<WqrIEf_e$GDt=mq!5Xu!w+8u3X?7+k%hIPyGDFc`q$@v
ze@$@o#veGf<%#ARqEs@`4uL`Xyx{E|#$#&KJ_;O+V2UlK08>1&NN_KvlMj?y28yjy
z?Y!XfB}oAbuue<~q$H79;>dN2-@LGXV%)Bl<JB9JYvTSI{G7Vg0wS{B0+MG0?`y*~
zy-?pYl3b;?vD#gu>aBGG*@wYol@=6S<OZ1K)^#YfrliL=Rld1o@|*LwpBi2L-Xd>V
z)rXc6xlbDAkYj@Pi;)^`%$aJF$Ze^f8(Q+BQ4aaJuF^Vu!uRpTvmZl^+D-a5wFd%k
z6(8F&_`rll+h_0PO(qI4BeT2@L*E$A%c!~Un_13`V^qJXS#lJ!K(dg#AeiMU%Igy$
zh2%ib^JLR3UVK4P0na!|L0ah31zSkVrh{+qsH_N#8d0@tUjOEa)>5mYb=15B&9Xm_
z#}rzFkG_4>I-)#8FDDy$1*T5n^02-xSqeC$M3|OX;vJGNF2M$NE@*peN9LbWF8^`I
z+1Vqi=zPcTewf^FP|A@hM%7$6bGmgOXK2(gGTa$qXS^~)p$LYMW0`CrbLD}DB@-yO
zAX&4VoRKOqi#f1!4PtbmnjJ3+1!EpQd7*ttDDSdBg}kwvh(_fXh-jbSJvK?hSI@#E
zqBj-y@FGi@jD}}YY@WiM4Hmz76$b-ma3~U=U6?#!&DfXzvL&xG=iwUhm|$Ls!CZ9D
zk2DPR4SzQuxkN7;8SY^6;+P{(UIg2~Amiyvn?SD!-b)WKvay{MIXA$uDht%-U7A2%
zk=2K}flF4OP3|)#;+6swoRsCubYoE6p!}>{a`Nwm)ywmW$8~f$gsQk}WUF?9YGfst
z9308#VKpLv!bfIE?5K$fRiaV_(<@YQk}pWU1n`rMSQB|q4FuvP2k~XBJM_xT_g=66
z%Zh~~1ekO+mO0IH4<yK)Q#JbQn@6J65spGru)d5jg)}I~^b>(x4rg+fAX8z^mF|St
zrtwu1-=GYbZ4GC>5A08<T2y^<D;uE#d7UXuqshIp^P>|HSg8lJpT>zAHIGEs>Jt%$
zegLe5JT=p;7X{KB<_X`1y?+hnT?0^Oh;<{!bqvW}G{Wlp+veP!4}6KwWsgzktw!mn
zP4KqN(CDpi^t;(8T^xFcHcEU*u@R_lxCn4uV1v@NhZxt`E;3d;TTz_q6qoanM+&$>
z>nzFV%VsVDKT)4A6D0p+npj#M*EY%LS^C04_vya6%A>kfD8t$f6hB$0QC#1=;oyln
z+kq!`k~ozoq@(ckl~^S(ix1Y&ppvo1)z{gaWYn-K0WwgXrQ)gK>9}GMoEY4eWMUpn
zM-7i~hDHq=yBx<Aml8^idBEl2#YTCh(U&Hrv56vhD3Fpoh~@Sv2ZPTR5Bu=`godV;
ztr<SwpI?3MXmV+*Ss90^y>||0YScb5jZB;vLj9saxDM7QPFSkIQq$-A8j|Z(n4Eoy
zgW1h|HhhjhICkvFVZk?QHanV&GJD8$NxKXbWhcy=Z_m|8uP5=nv(t|GjB9K=ZI2j0
zjdDUbvi}swec`0UmeL?2F(diFS~L)FXTp*J`%p+r#iD2ApI=|S`tirkj-UIB&iL)Z
z=3q8ev%R6GE(zYt3wU6SIAA}BoDX3?NP-@VeaE7j>I0?5W9tP;z}3zsL&R>ia$6PD
zhHMR6S_>^x{x7-q>`X_PrFJkio)5K_Xw=s?jU<0%G}Nan933jhchor>FYa*B1*Vv(
zsp(MRr6!R#hg5HWd8ci5Pe(`fOkrbiSZP}BMT@$IA640b^}ApN`Uy=S^-Uwk$XZ19
zgh4)nT11u|vJuSJ!i#@kt-y7#6l(Allz7FTED=isKd*Xjd1bk~)YC@%nj+N5OHhiO
zDdS9y+W+2pY5x+&_0GM7eR41;kB_IBDrvmLmZqV&g9QaoW)8_3%u0MnQ|xmWc&SC2
z@>SJ~e?Of!_ekxM7Vr9>Y|B_&UPW4zio~eK11|+Dcwmh<U>As(F5!g^W$?GE-Bg_}
zL1tpfiDfAV@U+?O#0v8daeq(uf}XjrCf8ajmJJ)XG!9-iJ+96_VoaBMmT4I38%Bna
zsU9h+1+!)`T`lfdWe2$lOdm*f^V*qarF(dGDy*&a{^k$;$+gakj3pC&HqLW&$fkrc
z%1v0@HLlPQ)i;kspN|+s3IA9X;lEVpb*B`|v%Cb^hz0W&kp={bNRXr04Egw{Lj30T
zy;&<OU{w0Ss=$PjrK3(;QWIpJMpil}c>7mznnsO(H!Hctk#|_-Im_fK&tiLY|9)az
z=(P*<s^ASh#puR%Q{)hZS%JApPBieO%BvK4m5!yP9EoH>@AOE%Y??V3<H2T2qaF+e
z->^F$T@?rvyiR@`8Zvppy!eD*k{nmVv;<>R!&)9#BLb*@<a7x;DqAZ<<Ab&`RVbvw
zH7N+smg!)1&IYIL%u|6Bs{lQ9`$y^N>khg5juvfvYGu<6`;&j!;%oMd9->Y|q8x>F
zRR4NSN7XlvL?d4MAUm~q)yzQEmp;BssGVVSVrx1Sk~|A-HVpQ)#9t+(ywg3?QS!Hj
zr1?2<7sfW5SP2pnyc%tNhBGv37#Z%0X(iP0hH53zLe_@skyGsh`4CS6R?3IDDXimI
zqycc3Am5Zkrp)TK2yrjWtX`hdJMr{=RoU)@js>|Xvi3%vf;oN7vz#x&Fhk3LgnTzo
zeVdt+#ceS*6waJ3|B7+Fb3>sBcf&(rf7Q&KAm=OPel{*V42Jxal>><iCQ><f-3{Gz
z)PlgJg)@((R!<XCmmJzr9{5FP+-S=Z>+KD7WRsxj26_hA4OZ)Gs7OOG$ArEAYvt7o
zdnFWzw?TasexMZC0O3n3eDsC)x{LO08a*YU>9lQ<X1tGb5%kgHf_K;R$j+!yWEWY6
zss$_NZB3$wGPD<G^Y{unL&}0UyV=_Vn(6kys8mp1S65k)SMu^3*EUbAIu`s@MM}Ki
za+|CK>+)UINYkisq-mJpEy}0_g`6Bj$naskS&7Wu1TMds&+LRB2&EK1Ha8e5w9bpa
zXgRQx1WMGM5K-=dNW$`6<8QQ))VKV-v(KfioNNsLG+D$37@CGcWaYg%YCb^KVUaEz
z&s}3I_{w~!z|#<4eK^%Vn7FfH{Vr2VC&jtV1*J!=AtIITOP-hh`jZoi|6Ejes_^JL
z3;RBL_=AugY7;wA-pNH~rlE9m9PNjNyqC0nfaOM&3NQuIZ04*bjpy041$9-+yj?JJ
z?q*$er~qpa@E@!|urkV*1X;|M=tv_{$o)VlIO62Wdk)sRCT|T|bH~3-hW`;(PEu;(
zetY!A*zb}!Us&AP_gmy^!~*$71{qE0R-R0?77H@2JOW|L3TQXHqJ#GgFgbO(_?8cs
z%OwpW(y?tvY!!!gKyAA3+1$m&@32{Z!{ph`b#DnzWp*pO0V3D-<RI7V)b?UM6(oH!
zBFJ)R<&jlg*WdLykdDJBLwb_<`kn>y6+|0SSO5$cU>0P_j1c&eeFpL~$dtHXV}np&
z3kK_i+@I3T!N#4WIL*DkWz=Tj-pLc#{uODuIxC1UB!84fi?@FlJ_=yNhvhSjFC+Lb
zFyzoXSx}c(0*4Bgp@#WvC=9}EC9tNRlQlA=C(YuKi>owAITR(4cqk!w8ar76JB^2f
zbf+=4*YcIqU*~J@#_V{QoyNnn+SB13P9C?)PJ1)9$C=?_rmkrWzg;DsW^nrI9WX<n
z-fzGRpE{j2jW<(+ra3dbGF5jPALc8@3_H#(ZPVdEeao0(lzGgIm|=SsMjQ`Qbj&nr
z-i8@I!$mFe+o*}mGSrym=$h6r!z+^wPG46$Qikp{-f}`cee)YI!$7$hGYmkM4jWGs
zb{d;WG&t>Kr!gpcW_UO6h6q0JjhRsuJ6qQ@wim}vV>5Q$Y5a~e!|1vUPRBCHZPY~U
zm@(6+i6oPBP2*_?BaT5y*PYhH3?u$c)Wkb@Kf`7Q8(>>m(=rR~m?OtgIj!#t2>vDO
zXLzMm3x1oJ;iGaj@lekUu?3Zx-kX|;<CXfpfTwSKJ7yRO&J3~d);?Ca+uLF>le@j5
zrcumpGG^G0MGJnLm?3ti+Eer~lSKe*qkg7ghE2P*O^505K}81A^o7{Wa;RPG6d*+s
zPY!Ljh#m6rOE&emD?!{Ve}BOtt`1)wa;mph4-$LjOU|XKx9)a{A^H2ZOy$<S9uc<q
zVynH`u+Kew+&EZ0zL2Z7+JCQdYv(ZWynOj<!^Jc5@fRau13?&O?-=oY`M7<&a_jy)
z_10VWspOAM68q%K_xwoxzURl{`Y_^=eD(QuPE}zpO%r?MTU%y`@ZujIs&A&aHhg%p
zP`#w#5f#06jtcYkT$Mr1W6G_U7l@Z+#MTn^SzmfW?2^AfQ>HR&ELUy~R;ai3EK^%;
zT%oe*UnQQCVGgYpH--r}tW^>F*Q+F3pHX$Kc~-n4-+KPvl~!RQuigvuW)Uhk3<%6l
zRsJv!R2lVb6V&@(62yM_7ML7pqOwzS2Fg|uW|WA0t3G_cXz{H0sdaT_<yz4}M8`0T
z{!be4EA1RVe35pQH+&I(#kbDGnk68~yVHk}!W?LpHd#FFo)T}jpzHyPoRZ!C&nIZ-
zVL_y{w?PnpLcH}>1xm|N{zDA&KXjm%0Lo2wzsjL{;uWaKISglI{wE3a7RTyAtQ&R(
zD`F18p<pAZ$sZKZUGFkl^}9K((@8nT%T>4A?N1K?45i%w^8lf(4(j&W=|2)&Fld!{
zx2GrsV@Y7LOP{uKhCBa*GrW|lU`1&FE2^;cYqE`=_Hd{!gxZ&(K*iF)WWzhX#<8|H
za$4s_1<Oc-km(L}y6rutK?g#;>o(OO_V#i58o*Gx?e|=R<4l8)lkT;<EKG#HEQO(=
z2<E%L>ARd_8(@PGn(k7tqC^1Gn{U*nT^y=<4~M#Pw*qA-g880m+Q+e45vyj1f@LHE
z7{O#mG~N6@6Jhgza$27aH5DQDc4b-*FqDP>MmEA;5A92ES(yk2exNXn76D%fm~1en
z&79%@QvAmV1uIGffHL0(OxrlryRY%5Z62*a8H!-OZI||PtQ&}RVXT5>B!c-)TY7^-
z9eSP9Ix@jjgxK3)>Gltp2-g4xL}+79=p07t?XGwxLd|^&!&oAiY+R*#0ozTR_i~CI
z6BVo|5pY4(WQQp|!lAxEsHPt(P|+fQpys<j=^2jI^k1A-{}ctwNCfk3n)C|})rU}B
zKZzm2?RHSo>LW~qefziw?M#IIdGxg1WoIJn_A3kxMSy|aWXmIM0Bkq?46tE1>Mu~R
zqC`MbnQTg=Z*!<UZ*Zt%(-kN~5zO}u(sqv3i&(*j6f7eV%s2Vb&p6cX{hZeMSusSw
zWw+QH@aO=*P}&19-~=whk$l>2cR82{dmmO98j4`PXO8apUnasUZ*q!PW-C}xBA^pY
zHn-9J9O?{09Vu3z3`KxlrY76Z=rNA9vx(E%Jx{@k5&=zRvNepJ<4|1)HSn0J2(h<q
z(SCrTbo(Dv5l$A;^LCe$iO{i7VQ44<Yz#5kWJPO2OoVp8W)<o$6s#x_FejL7g`$ld
zYRdr*)my4SMT-D}nr~U6EgY*2u?{U(u#7|iubb>CqMaP7`oB4?odHu3V(<5%mjQ;-
zR)CS6aCrgkw!2c82$z;B3}cC4vO9;~;uM3;oMO{*1uIGfbb`q?8oKK!)AI;I4Xjk4
zqD25f!JsBPUFZRhHGo*{RSK4o2<E#{=y4A9?*H&7Y+e&XgxhTsq2~dH(i;E+B6P8w
z5Uij*c2_DBVPKuY&`<>ECnozk=rvC9?YB6^&J7Ayln9s;Om<$-ZO51h{Rp+=-xR25
z5kOG$JreXaj@5jS)4KLc1<ObT^X&`teGc^nLUjdUW^1x5JNDiJdKz#jZF*a^Lc>~m
z!R|_DD(rsFY{m?+W2xC>HvsJe^elQA&|yUC`L&7`B?JWTHZSk#^B*#>z3*_aeKjgr
zv=lhNnZ0ADyE)oth<0_0iWVgX8rAG=IBnrzdk$fLdlA4))G&EfO}hZdO?v?dOmUK#
z0^&;#*<CKCNAq@-W~43T_nl@hm1!U6+3-isbH~dnT7({V^2V1w-^%njhhYDxRly?l
zkg1uyilw_b+AA$ctxiRY(Bn>ChSC-eb_T)ve+OU&dfdUAO4<cLZn_iJSU``CWIB|+
zw5Q#PE(%MqZl}Mf=#lw9TC;d*`N}ePw{;x*Oj<tG@Ap3l|MO2?55I4i!w%th{?#nM
ze~rJu?}vZq_(#o#^G`p`&OhY$SA?&76#kw1Ae{D(W8W0AA5$y&Pxj-HC*h_aa?m37
zJAlDw0smP97|{Ck3TSNV9RKt=4EJel87?qvZ}uX<o6H-BOJ*=0Q~lE>!<XX#3P>-8
zucyvkJY(LIW%K84^gptZT`!Y{Q~cJc&<wk{m^~N#3)J}cKjXao(<U-J_zLhBFpB)C
z9-T37{DS#UubY>*V)>F875+&q9vk@`{ARcqJ`YZVA9MUuA7b~xzYm5VnN#rlXRl^_
zAPRuXo|_SYhkyLBqNR%p?*G~2^Y359u-WHdgTIF1&wc!{sSN+IviU{h%ly;Uu;=3s
ztJPx;e8Yms^?-l59)3T2Aodx>`p47oTs$sw@cZcxuUI@|`kKP!8&)h@xdgENKPK+@
c*_A8GR<H4tz<Q?kKQaUU-#=!<|NqSY1E5ZB761SM

literal 0
HcmV?d00001

diff --git a/man/dataframe__drop.Rd b/man/dataframe__drop.Rd
index 4c4d313f..b979acc5 100644
--- a/man/dataframe__drop.Rd
+++ b/man/dataframe__drop.Rd
@@ -4,7 +4,7 @@
 \alias{dataframe__drop}
 \title{Drop columns of a DataFrame}
 \usage{
-dataframe__drop(..., .strict = TRUE)
+dataframe__drop(..., strict = TRUE)
 }
 \arguments{
 \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Characters of column names to
diff --git a/man/lazyframe__drop.Rd b/man/lazyframe__drop.Rd
index ca33049d..95552531 100644
--- a/man/lazyframe__drop.Rd
+++ b/man/lazyframe__drop.Rd
@@ -4,13 +4,13 @@
 \alias{lazyframe__drop}
 \title{Remove columns from the DataFrame}
 \usage{
-lazyframe__drop(..., .strict = TRUE)
+lazyframe__drop(..., strict = TRUE)
 }
 \arguments{
 \item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Names of the columns that
 should be removed from the dataframe. Accepts column selector input.}
 
-\item{.strict}{Validate that all column names exist in the current schema,
+\item{strict}{Validate that all column names exist in the current schema,
 and throw an exception if any do not.}
 }
 \value{
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index a797e015..d31d1576 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -52,7 +52,7 @@ impl PlRLazyFrame {
         cluster_with_columns: bool,
         streaming: bool,
         _eager: bool,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let ldf = self
             .ldf
             .clone()
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index 826caa53..de0f7b75 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -1212,7 +1212,7 @@ test_that("drop", {
     r"("foo" not found)"
   )
   expect_query_equal(
-    .input$drop("foo", .strict = FALSE),
+    .input$drop("foo", strict = FALSE),
     df,
     df
   )

From 1ba97b21589e26b5434c77476b00e1f8cbf94e06 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:41:56 +0100
Subject: [PATCH 44/71] remove old parquet file

---
 foo.parquet | Bin 18052 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 foo.parquet

diff --git a/foo.parquet b/foo.parquet
deleted file mode 100644
index cf15bd52001e6eb42eecc37e4c91c8c28bc6cfca..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 18052
zcmd5^dvsLQx!-3plT0R)kjI>znIvLFnDTyP6lk%{8G=wDRY(v-(wYF7fIJd;iCo(|
zpp7Y@iy~o}G;&j}auHK*t|$#YxC2I3;D)TMh%~SwO+hIur7nyt7Sxn=zrDXXubDFk
z)c)Zt8&2k&^V`4k{q}Ev-`;1Zv;9RAL?X5e;<KYgHyp)_@i`O8aC?0n`-qY|e3T{;
zLWaLH?w`4VKM3T81%7i20hd%?mm!^MPnQDe5)spU{F5Oil39t)-?zM4`@%-rMkWQ*
zN?)^07ag8qB;Mn87ME9*tr%NazN%vFQqf7>w9!I41+V>qSPdh?je3UI)*xVrM+qv!
z*9Fpv42e@Bb|9GuB<)VgHz+9~If)c(b(V$Zezd7$UQzG4rW>|dp&`<7+cueC5)z!h
zY>CRRL-4j+I6<R!O$dIS@3x<6XP;~NN(g~qZ#ab4^8?rOPqC9Xd|zYKZ{|D5C@T>P
z?s;Xw3-7I{NUVx;3oGp*yG+nIyL@d_1cBj8HVs34!^p5mA3|*IEHK2Q!zx3YKx&X-
zecg%tv-!S{51+`lk;a$b==&(~;>nAoN|4zkAsbz|_6pvv1dX2hHbkp9Q!FWj&L&4>
z*a(5d!!K*f*H*46bHLI1^3`jWEv{TqxjK>kwrq99vQ^6#m%($SibL3Aq{@9YQ90?3
z$NObeyPuG7KX8jq&;2Am)#FYmt}0)=tb7gcd94DyEO@UZGO9*67`6y+U=PStdY;}J
z;hyKqkbG&G(CsL6KT!&}0)g}-QvAth&eGvlU*eUfq*dhJ$+Jd1<IlE?x5%-K<&BV2
zqpZGVq_{;ek~R%ELNMTCm*5Bb_jAR+?AuhJmj=dgN`VxgFDV%qJ0vHWl@_w)J^Ju$
zI<N1-7tY$uYoRfZKSSDDf@w0<D3|Q9&|bk?JxC+FzI|kBXk*q{8BId2s(lb#(yh3p
zzV6F>-&f!gZkE9HudgPN#<mV2Fob+wJv08?rrJ$T@X4}G3c*+|IhUr1rM_WgxI!OF
z<kf5Bl9d7p$u2pZf8wm~;}eWNe0uZkr1&29lfl|&CLfj3wtp%f9o?(QY-E{`;N9)w
z%wz<uEx&J;>7tBkY|B&`#gIaG#IX>vh1?x~DUe2tAy|gESTX^_5DI4LF0I%zZ_KW)
ztp_)Zv<UGfq1r{gB}s?m9%yKmR>8a1qY0zFO%!;8o*Yt2HS)@<0%-}8OJbSC=fqqR
zNVPjjvAE!#X5XMcZ5YuUOk8zO=L0W~ZV}F<npgx9UAy4z&F1`!nl<DX;GbL20deF3
z7s$Z_er;ik+{&-7y9xFPT$is(BEH1Y50X^(mXKr9tLu&?ANX`jUg=>?CN|0=K(KKz
zCur0z68uJaWP|Jw$Rj7t9zKykN*z@RCGqvaD+lZWAybyq$P|!AF644DM!D+yW{QC!
zjB0FC)I<!@Xmr*?@#!>)<WqrIEf_e$GDt=mq!5Xu!w+8u3X?7+k%hIPyGDFc`q$@v
ze@$@o#veGf<%#ARqEs@`4uL`Xyx{E|#$#&KJ_;O+V2UlK08>1&NN_KvlMj?y28yjy
z?Y!XfB}oAbuue<~q$H79;>dN2-@LGXV%)Bl<JB9JYvTSI{G7Vg0wS{B0+MG0?`y*~
zy-?pYl3b;?vD#gu>aBGG*@wYol@=6S<OZ1K)^#YfrliL=Rld1o@|*LwpBi2L-Xd>V
z)rXc6xlbDAkYj@Pi;)^`%$aJF$Ze^f8(Q+BQ4aaJuF^Vu!uRpTvmZl^+D-a5wFd%k
z6(8F&_`rll+h_0PO(qI4BeT2@L*E$A%c!~Un_13`V^qJXS#lJ!K(dg#AeiMU%Igy$
zh2%ib^JLR3UVK4P0na!|L0ah31zSkVrh{+qsH_N#8d0@tUjOEa)>5mYb=15B&9Xm_
z#}rzFkG_4>I-)#8FDDy$1*T5n^02-xSqeC$M3|OX;vJGNF2M$NE@*peN9LbWF8^`I
z+1Vqi=zPcTewf^FP|A@hM%7$6bGmgOXK2(gGTa$qXS^~)p$LYMW0`CrbLD}DB@-yO
zAX&4VoRKOqi#f1!4PtbmnjJ3+1!EpQd7*ttDDSdBg}kwvh(_fXh-jbSJvK?hSI@#E
zqBj-y@FGi@jD}}YY@WiM4Hmz76$b-ma3~U=U6?#!&DfXzvL&xG=iwUhm|$Ls!CZ9D
zk2DPR4SzQuxkN7;8SY^6;+P{(UIg2~Amiyvn?SD!-b)WKvay{MIXA$uDht%-U7A2%
zk=2K}flF4OP3|)#;+6swoRsCubYoE6p!}>{a`Nwm)ywmW$8~f$gsQk}WUF?9YGfst
z9308#VKpLv!bfIE?5K$fRiaV_(<@YQk}pWU1n`rMSQB|q4FuvP2k~XBJM_xT_g=66
z%Zh~~1ekO+mO0IH4<yK)Q#JbQn@6J65spGru)d5jg)}I~^b>(x4rg+fAX8z^mF|St
zrtwu1-=GYbZ4GC>5A08<T2y^<D;uE#d7UXuqshIp^P>|HSg8lJpT>zAHIGEs>Jt%$
zegLe5JT=p;7X{KB<_X`1y?+hnT?0^Oh;<{!bqvW}G{Wlp+veP!4}6KwWsgzktw!mn
zP4KqN(CDpi^t;(8T^xFcHcEU*u@R_lxCn4uV1v@NhZxt`E;3d;TTz_q6qoanM+&$>
z>nzFV%VsVDKT)4A6D0p+npj#M*EY%LS^C04_vya6%A>kfD8t$f6hB$0QC#1=;oyln
z+kq!`k~ozoq@(ckl~^S(ix1Y&ppvo1)z{gaWYn-K0WwgXrQ)gK>9}GMoEY4eWMUpn
zM-7i~hDHq=yBx<Aml8^idBEl2#YTCh(U&Hrv56vhD3Fpoh~@Sv2ZPTR5Bu=`godV;
ztr<SwpI?3MXmV+*Ss90^y>||0YScb5jZB;vLj9saxDM7QPFSkIQq$-A8j|Z(n4Eoy
zgW1h|HhhjhICkvFVZk?QHanV&GJD8$NxKXbWhcy=Z_m|8uP5=nv(t|GjB9K=ZI2j0
zjdDUbvi}swec`0UmeL?2F(diFS~L)FXTp*J`%p+r#iD2ApI=|S`tirkj-UIB&iL)Z
z=3q8ev%R6GE(zYt3wU6SIAA}BoDX3?NP-@VeaE7j>I0?5W9tP;z}3zsL&R>ia$6PD
zhHMR6S_>^x{x7-q>`X_PrFJkio)5K_Xw=s?jU<0%G}Nan933jhchor>FYa*B1*Vv(
zsp(MRr6!R#hg5HWd8ci5Pe(`fOkrbiSZP}BMT@$IA640b^}ApN`Uy=S^-Uwk$XZ19
zgh4)nT11u|vJuSJ!i#@kt-y7#6l(Allz7FTED=isKd*Xjd1bk~)YC@%nj+N5OHhiO
zDdS9y+W+2pY5x+&_0GM7eR41;kB_IBDrvmLmZqV&g9QaoW)8_3%u0MnQ|xmWc&SC2
z@>SJ~e?Of!_ekxM7Vr9>Y|B_&UPW4zio~eK11|+Dcwmh<U>As(F5!g^W$?GE-Bg_}
zL1tpfiDfAV@U+?O#0v8daeq(uf}XjrCf8ajmJJ)XG!9-iJ+96_VoaBMmT4I38%Bna
zsU9h+1+!)`T`lfdWe2$lOdm*f^V*qarF(dGDy*&a{^k$;$+gakj3pC&HqLW&$fkrc
z%1v0@HLlPQ)i;kspN|+s3IA9X;lEVpb*B`|v%Cb^hz0W&kp={bNRXr04Egw{Lj30T
zy;&<OU{w0Ss=$PjrK3(;QWIpJMpil}c>7mznnsO(H!Hctk#|_-Im_fK&tiLY|9)az
z=(P*<s^ASh#puR%Q{)hZS%JApPBieO%BvK4m5!yP9EoH>@AOE%Y??V3<H2T2qaF+e
z->^F$T@?rvyiR@`8Zvppy!eD*k{nmVv;<>R!&)9#BLb*@<a7x;DqAZ<<Ab&`RVbvw
zH7N+smg!)1&IYIL%u|6Bs{lQ9`$y^N>khg5juvfvYGu<6`;&j!;%oMd9->Y|q8x>F
zRR4NSN7XlvL?d4MAUm~q)yzQEmp;BssGVVSVrx1Sk~|A-HVpQ)#9t+(ywg3?QS!Hj
zr1?2<7sfW5SP2pnyc%tNhBGv37#Z%0X(iP0hH53zLe_@skyGsh`4CS6R?3IDDXimI
zqycc3Am5Zkrp)TK2yrjWtX`hdJMr{=RoU)@js>|Xvi3%vf;oN7vz#x&Fhk3LgnTzo
zeVdt+#ceS*6waJ3|B7+Fb3>sBcf&(rf7Q&KAm=OPel{*V42Jxal>><iCQ><f-3{Gz
z)PlgJg)@((R!<XCmmJzr9{5FP+-S=Z>+KD7WRsxj26_hA4OZ)Gs7OOG$ArEAYvt7o
zdnFWzw?TasexMZC0O3n3eDsC)x{LO08a*YU>9lQ<X1tGb5%kgHf_K;R$j+!yWEWY6
zss$_NZB3$wGPD<G^Y{unL&}0UyV=_Vn(6kys8mp1S65k)SMu^3*EUbAIu`s@MM}Ki
za+|CK>+)UINYkisq-mJpEy}0_g`6Bj$naskS&7Wu1TMds&+LRB2&EK1Ha8e5w9bpa
zXgRQx1WMGM5K-=dNW$`6<8QQ))VKV-v(KfioNNsLG+D$37@CGcWaYg%YCb^KVUaEz
z&s}3I_{w~!z|#<4eK^%Vn7FfH{Vr2VC&jtV1*J!=AtIITOP-hh`jZoi|6Ejes_^JL
z3;RBL_=AugY7;wA-pNH~rlE9m9PNjNyqC0nfaOM&3NQuIZ04*bjpy041$9-+yj?JJ
z?q*$er~qpa@E@!|urkV*1X;|M=tv_{$o)VlIO62Wdk)sRCT|T|bH~3-hW`;(PEu;(
zetY!A*zb}!Us&AP_gmy^!~*$71{qE0R-R0?77H@2JOW|L3TQXHqJ#GgFgbO(_?8cs
z%OwpW(y?tvY!!!gKyAA3+1$m&@32{Z!{ph`b#DnzWp*pO0V3D-<RI7V)b?UM6(oH!
zBFJ)R<&jlg*WdLykdDJBLwb_<`kn>y6+|0SSO5$cU>0P_j1c&eeFpL~$dtHXV}np&
z3kK_i+@I3T!N#4WIL*DkWz=Tj-pLc#{uODuIxC1UB!84fi?@FlJ_=yNhvhSjFC+Lb
zFyzoXSx}c(0*4Bgp@#WvC=9}EC9tNRlQlA=C(YuKi>owAITR(4cqk!w8ar76JB^2f
zbf+=4*YcIqU*~J@#_V{QoyNnn+SB13P9C?)PJ1)9$C=?_rmkrWzg;DsW^nrI9WX<n
z-fzGRpE{j2jW<(+ra3dbGF5jPALc8@3_H#(ZPVdEeao0(lzGgIm|=SsMjQ`Qbj&nr
z-i8@I!$mFe+o*}mGSrym=$h6r!z+^wPG46$Qikp{-f}`cee)YI!$7$hGYmkM4jWGs
zb{d;WG&t>Kr!gpcW_UO6h6q0JjhRsuJ6qQ@wim}vV>5Q$Y5a~e!|1vUPRBCHZPY~U
zm@(6+i6oPBP2*_?BaT5y*PYhH3?u$c)Wkb@Kf`7Q8(>>m(=rR~m?OtgIj!#t2>vDO
zXLzMm3x1oJ;iGaj@lekUu?3Zx-kX|;<CXfpfTwSKJ7yRO&J3~d);?Ca+uLF>le@j5
zrcumpGG^G0MGJnLm?3ti+Eer~lSKe*qkg7ghE2P*O^505K}81A^o7{Wa;RPG6d*+s
zPY!Ljh#m6rOE&emD?!{Ve}BOtt`1)wa;mph4-$LjOU|XKx9)a{A^H2ZOy$<S9uc<q
zVynH`u+Kew+&EZ0zL2Z7+JCQdYv(ZWynOj<!^Jc5@fRau13?&O?-=oY`M7<&a_jy)
z_10VWspOAM68q%K_xwoxzURl{`Y_^=eD(QuPE}zpO%r?MTU%y`@ZujIs&A&aHhg%p
zP`#w#5f#06jtcYkT$Mr1W6G_U7l@Z+#MTn^SzmfW?2^AfQ>HR&ELUy~R;ai3EK^%;
zT%oe*UnQQCVGgYpH--r}tW^>F*Q+F3pHX$Kc~-n4-+KPvl~!RQuigvuW)Uhk3<%6l
zRsJv!R2lVb6V&@(62yM_7ML7pqOwzS2Fg|uW|WA0t3G_cXz{H0sdaT_<yz4}M8`0T
z{!be4EA1RVe35pQH+&I(#kbDGnk68~yVHk}!W?LpHd#FFo)T}jpzHyPoRZ!C&nIZ-
zVL_y{w?PnpLcH}>1xm|N{zDA&KXjm%0Lo2wzsjL{;uWaKISglI{wE3a7RTyAtQ&R(
zD`F18p<pAZ$sZKZUGFkl^}9K((@8nT%T>4A?N1K?45i%w^8lf(4(j&W=|2)&Fld!{
zx2GrsV@Y7LOP{uKhCBa*GrW|lU`1&FE2^;cYqE`=_Hd{!gxZ&(K*iF)WWzhX#<8|H
za$4s_1<Oc-km(L}y6rutK?g#;>o(OO_V#i58o*Gx?e|=R<4l8)lkT;<EKG#HEQO(=
z2<E%L>ARd_8(@PGn(k7tqC^1Gn{U*nT^y=<4~M#Pw*qA-g880m+Q+e45vyj1f@LHE
z7{O#mG~N6@6Jhgza$27aH5DQDc4b-*FqDP>MmEA;5A92ES(yk2exNXn76D%fm~1en
z&79%@QvAmV1uIGffHL0(OxrlryRY%5Z62*a8H!-OZI||PtQ&}RVXT5>B!c-)TY7^-
z9eSP9Ix@jjgxK3)>Gltp2-g4xL}+79=p07t?XGwxLd|^&!&oAiY+R*#0ozTR_i~CI
z6BVo|5pY4(WQQp|!lAxEsHPt(P|+fQpys<j=^2jI^k1A-{}ctwNCfk3n)C|})rU}B
zKZzm2?RHSo>LW~qefziw?M#IIdGxg1WoIJn_A3kxMSy|aWXmIM0Bkq?46tE1>Mu~R
zqC`MbnQTg=Z*!<UZ*Zt%(-kN~5zO}u(sqv3i&(*j6f7eV%s2Vb&p6cX{hZeMSusSw
zWw+QH@aO=*P}&19-~=whk$l>2cR82{dmmO98j4`PXO8apUnasUZ*q!PW-C}xBA^pY
zHn-9J9O?{09Vu3z3`KxlrY76Z=rNA9vx(E%Jx{@k5&=zRvNepJ<4|1)HSn0J2(h<q
z(SCrTbo(Dv5l$A;^LCe$iO{i7VQ44<Yz#5kWJPO2OoVp8W)<o$6s#x_FejL7g`$ld
zYRdr*)my4SMT-D}nr~U6EgY*2u?{U(u#7|iubb>CqMaP7`oB4?odHu3V(<5%mjQ;-
zR)CS6aCrgkw!2c82$z;B3}cC4vO9;~;uM3;oMO{*1uIGfbb`q?8oKK!)AI;I4Xjk4
zqD25f!JsBPUFZRhHGo*{RSK4o2<E#{=y4A9?*H&7Y+e&XgxhTsq2~dH(i;E+B6P8w
z5Uij*c2_DBVPKuY&`<>ECnozk=rvC9?YB6^&J7Ayln9s;Om<$-ZO51h{Rp+=-xR25
z5kOG$JreXaj@5jS)4KLc1<ObT^X&`teGc^nLUjdUW^1x5JNDiJdKz#jZF*a^Lc>~m
z!R|_DD(rsFY{m?+W2xC>HvsJe^elQA&|yUC`L&7`B?JWTHZSk#^B*#>z3*_aeKjgr
zv=lhNnZ0ADyE)oth<0_0iWVgX8rAG=IBnrzdk$fLdlA4))G&EfO}hZdO?v?dOmUK#
z0^&;#*<CKCNAq@-W~43T_nl@hm1!U6+3-isbH~dnT7({V^2V1w-^%njhhYDxRly?l
zkg1uyilw_b+AA$ctxiRY(Bn>ChSC-eb_T)ve+OU&dfdUAO4<cLZn_iJSU``CWIB|+
zw5Q#PE(%MqZl}Mf=#lw9TC;d*`N}ePw{;x*Oj<tG@Ap3l|MO2?55I4i!w%th{?#nM
ze~rJu?}vZq_(#o#^G`p`&OhY$SA?&76#kw1Ae{D(W8W0AA5$y&Pxj-HC*h_aa?m37
zJAlDw0smP97|{Ck3TSNV9RKt=4EJel87?qvZ}uX<o6H-BOJ*=0Q~lE>!<XX#3P>-8
zucyvkJY(LIW%K84^gptZT`!Y{Q~cJc&<wk{m^~N#3)J}cKjXao(<U-J_zLhBFpB)C
z9-T37{DS#UubY>*V)>F875+&q9vk@`{ARcqJ`YZVA9MUuA7b~xzYm5VnN#rlXRl^_
zAPRuXo|_SYhkyLBqNR%p?*G~2^Y359u-WHdgTIF1&wc!{sSN+IviU{h%ly;Uu;=3s
ztJPx;e8Yms^?-l59)3T2Aodx>`p47oTs$sw@cZcxuUI@|`kKP!8&)h@xdgENKPK+@
c*_A8GR<H4tz<Q?kKQaUU-#=!<|NqSY1E5ZB761SM


From 761f2f6b1560388f8484ffd666f136ab2b63c16c Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:42:27 +0100
Subject: [PATCH 45/71] missing to_string()

---
 src/rust/src/conversion/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index e2d60dc6..b7af79a0 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -611,7 +611,7 @@ impl TryFrom<&str> for Wrap<Roll> {
             "raise" => Roll::Raise,
             "forward" => Roll::Forward,
             "backward" => Roll::Backward,
-            _ => return Err(format!("unreachable",)),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }

From 24c4a2147873377a445d20787ee865dc74478c2b Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:43:14 +0100
Subject: [PATCH 46/71] add back TODO

---
 R/lazyframe-frame.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 3490a638..091e473d 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -45,6 +45,7 @@ wrap.PlRLazyFrame <- function(x, ...) {
   self
 }
 
+# TODO: link to pl__select
 #' Select and modify columns of a LazyFrame
 #'
 #' @description

From 89119f5bed5e65d3ada6d8e9e65363bbba323353 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:44:57 +0100
Subject: [PATCH 47/71] PlRLazyFrame -> Self

---
 src/rust/src/lazyframe/general.rs | 108 ++++++++++++------------------
 1 file changed, 44 insertions(+), 64 deletions(-)

diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index d31d1576..e41d81cc 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -70,12 +70,12 @@ impl PlRLazyFrame {
         Ok(ldf.into())
     }
 
-    fn filter(&mut self, predicate: &PlRExpr) -> Result<PlRLazyFrame> {
+    fn filter(&mut self, predicate: &PlRExpr) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.filter(predicate.inner.clone()).into())
     }
 
-    fn select(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
+    fn select(&mut self, exprs: ListSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
         Ok(ldf.select(exprs).into())
@@ -133,20 +133,20 @@ impl PlRLazyFrame {
         Ok(df.into())
     }
 
-    fn slice(&self, offset: NumericScalar, len: Option<NumericScalar>) -> Result<PlRLazyFrame> {
+    fn slice(&self, offset: NumericScalar, len: Option<NumericScalar>) -> Result<Self> {
         let ldf = self.ldf.clone();
         let offset = <Wrap<i64>>::try_from(offset)?.0;
         let len = len.map(<Wrap<u32>>::try_from).transpose()?.map(|l| l.0);
         Ok(ldf.slice(offset, len.unwrap_or(u32::MAX)).into())
     }
 
-    fn tail(&self, n: NumericScalar) -> Result<PlRLazyFrame> {
+    fn tail(&self, n: NumericScalar) -> Result<Self> {
         let ldf = self.ldf.clone();
         let n = <Wrap<u32>>::try_from(n)?.0;
         Ok(ldf.tail(n).into())
     }
 
-    fn drop(&self, columns: ListSexp, strict: bool) -> Result<PlRLazyFrame> {
+    fn drop(&self, columns: ListSexp, strict: bool) -> Result<Self> {
         let ldf = self.ldf.clone();
         let columns = <Wrap<Vec<Expr>>>::from(columns).0;
         if strict {
@@ -156,14 +156,14 @@ impl PlRLazyFrame {
         }
     }
 
-    fn cast(&self, dtypes: ListSexp, strict: bool) -> Result<PlRLazyFrame> {
+    fn cast(&self, dtypes: ListSexp, strict: bool) -> Result<Self> {
         let dtypes = <Wrap<Vec<Field>>>::try_from(dtypes)?.0;
         let mut cast_map = PlHashMap::with_capacity(dtypes.len());
         cast_map.extend(dtypes.iter().map(|f| (f.name.as_ref(), f.dtype.clone())));
         Ok(self.ldf.clone().cast(cast_map, strict).into())
     }
 
-    fn cast_all(&self, dtype: &PlRDataType, strict: bool) -> Result<PlRLazyFrame> {
+    fn cast_all(&self, dtype: &PlRDataType, strict: bool) -> Result<Self> {
         Ok(self.ldf.clone().cast_all(dtype.dt.clone(), strict).into())
     }
 
@@ -184,7 +184,7 @@ impl PlRLazyFrame {
         nulls_last: LogicalSexp,
         maintain_order: bool,
         multithreaded: bool,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let ldf = self.ldf.clone();
         let by = <Wrap<Vec<Expr>>>::from(by).0;
         Ok(ldf
@@ -201,7 +201,7 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn with_columns(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
+    fn with_columns(&mut self, exprs: ListSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
         Ok(ldf.with_columns(exprs).into())
@@ -219,7 +219,7 @@ impl PlRLazyFrame {
         nulls_last: bool,
         maintain_order: bool,
         multithreaded: bool,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf
             .sort(
@@ -235,7 +235,7 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn top_k(&self, k: NumericScalar, by: ListSexp, reverse: LogicalSexp) -> Result<PlRLazyFrame> {
+    fn top_k(&self, k: NumericScalar, by: ListSexp, reverse: LogicalSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let k = <Wrap<u32>>::try_from(k)?.0;
         let exprs = <Wrap<Vec<Expr>>>::from(by).0;
@@ -249,12 +249,7 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn bottom_k(
-        &self,
-        k: NumericScalar,
-        by: ListSexp,
-        reverse: LogicalSexp,
-    ) -> Result<PlRLazyFrame> {
+    fn bottom_k(&self, k: NumericScalar, by: ListSexp, reverse: LogicalSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let k = <Wrap<u32>>::try_from(k)?.0;
         let exprs = <Wrap<Vec<Expr>>>::from(by).0;
@@ -268,7 +263,7 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn cache(&self) -> Result<PlRLazyFrame> {
+    fn cache(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.cache().into())
     }
@@ -472,7 +467,7 @@ impl PlRLazyFrame {
         dump.try_into()
     }
 
-    fn select_seq(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
+    fn select_seq(&mut self, exprs: ListSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
         Ok(ldf.select_seq(exprs).into())
@@ -538,7 +533,7 @@ impl PlRLazyFrame {
         Ok(PlRLazyGroupBy { lgb: Some(lazy_gb) })
     }
 
-    fn with_context(&self, contexts: ListSexp) -> Result<PlRLazyFrame> {
+    fn with_context(&self, contexts: ListSexp) -> Result<Self> {
         let contexts = <Wrap<Vec<LazyFrame>>>::try_from(contexts)?.0;
         Ok(self.ldf.clone().with_context(contexts).into())
     }
@@ -557,7 +552,7 @@ impl PlRLazyFrame {
         right_by: Option<StringSexp>,
         tolerance: Option<Sexp>,
         tolerance_str: Option<&str>,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let coalesce = if coalesce {
             JoinCoalesce::CoalesceColumns
         } else {
@@ -606,7 +601,7 @@ impl PlRLazyFrame {
         suffix: &str,
         validate: &str,
         coalesce: Option<bool>,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let coalesce = match coalesce {
             None => JoinCoalesce::JoinSpecific,
             Some(true) => JoinCoalesce::CoalesceColumns,
@@ -635,12 +630,7 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn join_where(
-        &self,
-        other: &PlRLazyFrame,
-        predicates: ListSexp,
-        suffix: &str,
-    ) -> Result<PlRLazyFrame> {
+    fn join_where(&self, other: &PlRLazyFrame, predicates: ListSexp, suffix: &str) -> Result<Self> {
         let ldf = self.ldf.clone();
         let other = other.ldf.clone();
 
@@ -654,28 +644,23 @@ impl PlRLazyFrame {
             .into())
     }
 
-    fn with_columns_seq(&mut self, exprs: ListSexp) -> Result<PlRLazyFrame> {
+    fn with_columns_seq(&mut self, exprs: ListSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let exprs = <Wrap<Vec<Expr>>>::from(exprs).0;
         Ok(ldf.with_columns_seq(exprs).into())
     }
 
-    fn rename(
-        &mut self,
-        existing: StringSexp,
-        new: StringSexp,
-        strict: bool,
-    ) -> Result<PlRLazyFrame> {
+    fn rename(&mut self, existing: StringSexp, new: StringSexp, strict: bool) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.rename(existing.to_vec(), new.to_vec(), strict).into())
     }
 
-    fn reverse(&self) -> Result<PlRLazyFrame> {
+    fn reverse(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.reverse().into())
     }
 
-    fn shift(&self, n: &PlRExpr, fill_value: Option<&PlRExpr>) -> Result<PlRLazyFrame> {
+    fn shift(&self, n: &PlRExpr, fill_value: Option<&PlRExpr>) -> Result<Self> {
         let lf = self.ldf.clone();
         let out = match fill_value {
             Some(v) => lf.shift_and_fill(n.inner.clone(), v.inner.clone()),
@@ -684,84 +669,79 @@ impl PlRLazyFrame {
         Ok(out.into())
     }
 
-    fn fill_nan(&self, fill_value: &PlRExpr) -> Result<PlRLazyFrame> {
+    fn fill_nan(&self, fill_value: &PlRExpr) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.fill_nan(fill_value.inner.clone()).into())
     }
 
-    fn fill_null(&self, fill_value: &PlRExpr) -> Result<PlRLazyFrame> {
+    fn fill_null(&self, fill_value: &PlRExpr) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.fill_null(fill_value.inner.clone()).into())
     }
 
-    fn min(&self) -> Result<PlRLazyFrame> {
+    fn min(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         let out = ldf.min();
         Ok(out.into())
     }
 
-    fn max(&self) -> Result<PlRLazyFrame> {
+    fn max(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         let out = ldf.max();
         Ok(out.into())
     }
 
-    fn sum(&self) -> Result<PlRLazyFrame> {
+    fn sum(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         let out = ldf.sum();
         Ok(out.into())
     }
 
-    fn mean(&self) -> Result<PlRLazyFrame> {
+    fn mean(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         let out = ldf.mean();
         Ok(out.into())
     }
 
-    fn std(&self, ddof: NumericScalar) -> Result<PlRLazyFrame> {
+    fn std(&self, ddof: NumericScalar) -> Result<Self> {
         let ddof = <Wrap<u8>>::try_from(ddof)?.0;
         let ldf = self.ldf.clone();
         let out = ldf.std(ddof);
         Ok(out.into())
     }
 
-    fn var(&self, ddof: NumericScalar) -> Result<PlRLazyFrame> {
+    fn var(&self, ddof: NumericScalar) -> Result<Self> {
         let ddof = <Wrap<u8>>::try_from(ddof)?.0;
         let ldf = self.ldf.clone();
         let out = ldf.var(ddof);
         Ok(out.into())
     }
 
-    fn median(&self) -> Result<PlRLazyFrame> {
+    fn median(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         let out = ldf.median();
         Ok(out.into())
     }
 
-    fn quantile(&self, quantile: &PlRExpr, interpolation: &str) -> Result<PlRLazyFrame> {
+    fn quantile(&self, quantile: &PlRExpr, interpolation: &str) -> Result<Self> {
         let ldf = self.ldf.clone();
         let interpolation = <Wrap<QuantileMethod>>::try_from(interpolation)?.0;
         let out = ldf.quantile(quantile.inner.clone(), interpolation);
         Ok(out.into())
     }
 
-    fn explode(&self, column: ListSexp) -> Result<PlRLazyFrame> {
+    fn explode(&self, column: ListSexp) -> Result<Self> {
         let ldf = self.ldf.clone();
         let column = <Wrap<Vec<Expr>>>::from(column).0;
         Ok(ldf.explode(column).into())
     }
 
-    fn null_count(&self) -> Result<PlRLazyFrame> {
+    fn null_count(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.null_count().into())
     }
 
-    fn unique(
-        &self,
-        maintain_order: bool,
-        keep: &str,
-        subset: Option<ListSexp>,
-    ) -> Result<PlRLazyFrame> {
+    fn unique(&self, maintain_order: bool, keep: &str, subset: Option<ListSexp>) -> Result<Self> {
         let ldf = self.ldf.clone();
         let keep = <Wrap<UniqueKeepStrategy>>::try_from(keep)?.0;
         let subset = subset.map(|e| <Wrap<Vec<Expr>>>::from(e).0);
@@ -772,7 +752,7 @@ impl PlRLazyFrame {
         Ok(out.into())
     }
 
-    fn drop_nulls(&self, subset: Option<ListSexp>) -> Result<PlRLazyFrame> {
+    fn drop_nulls(&self, subset: Option<ListSexp>) -> Result<Self> {
         let ldf = self.ldf.clone();
         let subset = subset.map(|e| <Wrap<Vec<Expr>>>::from(e).0);
         Ok(ldf.drop_nulls(subset).into())
@@ -784,7 +764,7 @@ impl PlRLazyFrame {
         index: ListSexp,
         value_name: Option<&str>,
         variable_name: Option<&str>,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let on = <Wrap<Vec<Expr>>>::from(on).0;
         let index = <Wrap<Vec<Expr>>>::from(index).0;
         let args = UnpivotArgsDSL {
@@ -798,7 +778,7 @@ impl PlRLazyFrame {
         Ok(ldf.unpivot(args).into())
     }
 
-    fn with_row_index(&self, name: &str, offset: Option<NumericScalar>) -> Result<PlRLazyFrame> {
+    fn with_row_index(&self, name: &str, offset: Option<NumericScalar>) -> Result<Self> {
         let ldf = self.ldf.clone();
         let offset: Option<u32> = match offset {
             Some(x) => Some(<Wrap<u32>>::try_from(x)?.0),
@@ -816,7 +796,7 @@ impl PlRLazyFrame {
     //     streamable: bool,
     //     schema: Option<Wrap<Schema>>,
     //     validate_output: bool,
-    // ) -> Result<PlRLazyFrame> {
+    // ) -> Result<Self> {
     //     let mut opt = OptFlags::default();
     //     opt.set(OptFlags::PREDICATE_PUSHDOWN, predicate_pushdown);
     //     opt.set(OptFlags::PROJECTION_PUSHDOWN, projection_pushdown);
@@ -834,21 +814,21 @@ impl PlRLazyFrame {
     //         .into()
     // }
 
-    fn clone(&self) -> Result<PlRLazyFrame> {
+    fn clone(&self) -> Result<Self> {
         Ok(self.ldf.clone().into())
     }
 
-    fn unnest(&self, columns: ListSexp) -> Result<PlRLazyFrame> {
+    fn unnest(&self, columns: ListSexp) -> Result<Self> {
         let columns = <Wrap<Vec<Expr>>>::from(columns).0;
         Ok(self.ldf.clone().unnest(columns).into())
     }
 
-    fn count(&self) -> Result<PlRLazyFrame> {
+    fn count(&self) -> Result<Self> {
         let ldf = self.ldf.clone();
         Ok(ldf.count().into())
     }
 
-    fn merge_sorted(&self, other: &PlRLazyFrame, key: &str) -> Result<PlRLazyFrame> {
+    fn merge_sorted(&self, other: &PlRLazyFrame, key: &str) -> Result<Self> {
         let out = self
             .ldf
             .clone()
@@ -968,7 +948,7 @@ impl PlRLazyFrame {
         storage_options: Option<StringSexp>,
         file_cache_ttl: Option<NumericScalar>,
         include_file_paths: Option<&str>,
-    ) -> Result<PlRLazyFrame> {
+    ) -> Result<Self> {
         let source = source
             .to_vec()
             .iter()

From 24ad77607e4cc295fa0b18cc370aa22e01153211 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:52:17 +0100
Subject: [PATCH 48/71] remove `@rdname`

---
 R/io-csv-functions.R | 134 +++++++++++++++++++++----------------------
 R/lazyframe-frame.R  |   7 ---
 2 files changed, 67 insertions(+), 74 deletions(-)

diff --git a/R/io-csv-functions.R b/R/io-csv-functions.R
index 5f177f05..fc9715d8 100644
--- a/R/io-csv-functions.R
+++ b/R/io-csv-functions.R
@@ -3,8 +3,6 @@
 #' This allows the query optimizer to push down predicates and projections to
 #' the scan level, thereby potentially reducing memory overhead.
 #'
-#' @rdname IO_scan_csv
-#'
 #' @inheritParams rlang::check_dots_empty0
 #' @param source Path to a file or URL. It is possible to provide multiple paths
 #' provided that all CSV files have the same schema. It is not possible to
@@ -112,38 +110,39 @@
 #' lazy_frame$collect()
 #' unlink(my_file)
 pl__scan_csv <- function(
-    source,
-    ...,
-    has_header = TRUE,
-    separator = ",",
-    comment_prefix = NULL,
-    quote_char = '"',
-    skip_rows = 0,
-    schema = NULL,
-    schema_overrides = NULL,
-    null_values = NULL,
-    missing_utf8_is_empty_string = FALSE,
-    ignore_errors = FALSE,
-    cache = FALSE,
-    infer_schema = TRUE,
-    infer_schema_length = 100,
-    n_rows = NULL,
-    encoding = c("utf8", "utf8-lossy"),
-    low_memory = FALSE,
-    rechunk = FALSE,
-    skip_rows_after_header = 0,
-    row_index_name = NULL,
-    row_index_offset = 0,
-    try_parse_dates = FALSE,
-    eol_char = "\n",
-    raise_if_empty = TRUE,
-    truncate_ragged_lines = FALSE,
-    decimal_comma = FALSE,
-    glob = TRUE,
-    storage_options = NULL,
-    retries = 2,
-    file_cache_ttl = NULL,
-    include_file_paths = NULL) {
+  source,
+  ...,
+  has_header = TRUE,
+  separator = ",",
+  comment_prefix = NULL,
+  quote_char = '"',
+  skip_rows = 0,
+  schema = NULL,
+  schema_overrides = NULL,
+  null_values = NULL,
+  missing_utf8_is_empty_string = FALSE,
+  ignore_errors = FALSE,
+  cache = FALSE,
+  infer_schema = TRUE,
+  infer_schema_length = 100,
+  n_rows = NULL,
+  encoding = c("utf8", "utf8-lossy"),
+  low_memory = FALSE,
+  rechunk = FALSE,
+  skip_rows_after_header = 0,
+  row_index_name = NULL,
+  row_index_offset = 0,
+  try_parse_dates = FALSE,
+  eol_char = "\n",
+  raise_if_empty = TRUE,
+  truncate_ragged_lines = FALSE,
+  decimal_comma = FALSE,
+  glob = TRUE,
+  storage_options = NULL,
+  retries = 2,
+  file_cache_ttl = NULL,
+  include_file_paths = NULL
+) {
   check_dots_empty0(...)
   check_character(source, allow_na = FALSE)
   if (length(source) == 0) {
@@ -201,7 +200,7 @@ pl__scan_csv <- function(
 }
 
 #' New DataFrame from CSV
-#' @rdname IO_read_csv
+#'
 #' @inheritParams pl__scan_csv
 #' @inherit as_polars_df return
 #' @examples
@@ -210,38 +209,39 @@ pl__scan_csv <- function(
 #' pl$read_csv(my_file)
 #' unlink(my_file)
 pl__read_csv <- function(
-    source,
-    ...,
-    has_header = TRUE,
-    separator = ",",
-    comment_prefix = NULL,
-    quote_char = '"',
-    skip_rows = 0,
-    schema = NULL,
-    schema_overrides = NULL,
-    null_values = NULL,
-    missing_utf8_is_empty_string = FALSE,
-    ignore_errors = FALSE,
-    cache = FALSE,
-    infer_schema = TRUE,
-    infer_schema_length = 100,
-    n_rows = NULL,
-    encoding = c("utf8", "utf8-lossy"),
-    low_memory = FALSE,
-    rechunk = FALSE,
-    skip_rows_after_header = 0,
-    row_index_name = NULL,
-    row_index_offset = 0,
-    try_parse_dates = FALSE,
-    eol_char = "\n",
-    raise_if_empty = TRUE,
-    truncate_ragged_lines = FALSE,
-    decimal_comma = FALSE,
-    glob = TRUE,
-    storage_options = NULL,
-    retries = 2,
-    file_cache_ttl = NULL,
-    include_file_paths = NULL) {
+  source,
+  ...,
+  has_header = TRUE,
+  separator = ",",
+  comment_prefix = NULL,
+  quote_char = '"',
+  skip_rows = 0,
+  schema = NULL,
+  schema_overrides = NULL,
+  null_values = NULL,
+  missing_utf8_is_empty_string = FALSE,
+  ignore_errors = FALSE,
+  cache = FALSE,
+  infer_schema = TRUE,
+  infer_schema_length = 100,
+  n_rows = NULL,
+  encoding = c("utf8", "utf8-lossy"),
+  low_memory = FALSE,
+  rechunk = FALSE,
+  skip_rows_after_header = 0,
+  row_index_name = NULL,
+  row_index_offset = 0,
+  try_parse_dates = FALSE,
+  eol_char = "\n",
+  raise_if_empty = TRUE,
+  truncate_ragged_lines = FALSE,
+  decimal_comma = FALSE,
+  glob = TRUE,
+  storage_options = NULL,
+  retries = 2,
+  file_cache_ttl = NULL,
+  include_file_paths = NULL
+) {
   check_dots_empty0(...)
   .args <- as.list(environment())
   do.call(pl$scan_csv, .args)$collect() |>
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 091e473d..571b9495 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -2350,7 +2350,6 @@ lazyframe__with_row_index <- function(name = "index", offset = 0) {
 #' this to `FALSE` will be slightly faster.
 #' @inheritParams lazyframe__collect
 #'
-#' @rdname IO_sink_parquet
 #' @return Invisibly returns the input LazyFrame
 #'
 #' @examples
@@ -2430,8 +2429,6 @@ lazyframe__sink_parquet <- function(
 #' * `"lz4"`: fast compression/decompression.
 #' * `"zstd"`: good compression performance.
 #'
-#' @rdname IO_sink_ipc
-#'
 #' @examples
 #' # sink table 'mtcars' from mem to ipc
 #' tmpf <- tempfile()
@@ -2529,8 +2526,6 @@ lazyframe__sink_ipc <- function(
 #'   Namely, when writing a field that does not parse as a valid float or
 #'   integer, then quotes will be used even if they aren`t strictly necessary.
 #'
-#' @rdname IO_sink_csv
-#'
 #' @examples
 #' # sink table 'mtcars' from mem to CSV
 #' tmpf <- tempfile()
@@ -2616,8 +2611,6 @@ lazyframe__sink_csv <- function(
 #' @inherit lazyframe__sink_parquet description params return
 #' @inheritParams rlang::check_dots_empty0
 #'
-#' @rdname IO_sink_ndjson
-#'
 #' @examples
 #' # sink table 'mtcars' from mem to NDJSON
 #' tmpf <- tempfile(fileext = ".ndjson")

From 4ed27f052b7586ecc89f56e52d99f924f79624bb Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:52:36 +0100
Subject: [PATCH 49/71] remove unreachable!()

---
 src/rust/src/conversion/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index b7af79a0..b8fc05c5 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -749,7 +749,7 @@ pub(crate) fn parse_parquet_compression(
                 })
                 .transpose()?,
         ),
-        _ => unreachable!(),
+        _ => return Err(RPolarsErr::Other("unreachable".to_string()).into()),
     };
     Ok(parsed)
 }

From b621355c9c743523e8b18dfe95df59081d20d596 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:54:17 +0100
Subject: [PATCH 50/71] remove more format!()

---
 src/rust/src/conversion/mod.rs | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/rust/src/conversion/mod.rs b/src/rust/src/conversion/mod.rs
index b8fc05c5..dce3a5d1 100644
--- a/src/rust/src/conversion/mod.rs
+++ b/src/rust/src/conversion/mod.rs
@@ -46,7 +46,7 @@ impl TryFrom<Sexp> for Wrap<AnyValue<'_>> {
                 AnyValue::StringOwned((*val.first().unwrap()).into())
             }
             TypedSexp::Null(_) => AnyValue::Null,
-            _ => return Err(format!("Cannot cast to AnyValue")),
+            _ => return Err("Cannot cast to AnyValue".to_string()),
         };
         Ok(Wrap(out))
     }
@@ -643,7 +643,7 @@ impl TryFrom<&str> for Wrap<UniqueKeepStrategy> {
             "last" => UniqueKeepStrategy::Last,
             "none" => UniqueKeepStrategy::None,
             "any" => UniqueKeepStrategy::Any,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -661,7 +661,7 @@ impl TryFrom<&str> for Wrap<JoinType> {
             "full" => JoinType::Full,
             "semi" => JoinType::Semi,
             "anti" => JoinType::Anti,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -676,7 +676,7 @@ impl TryFrom<&str> for Wrap<JoinValidation> {
             "1:m" => JoinValidation::OneToMany,
             "1:1" => JoinValidation::OneToOne,
             "m:1" => JoinValidation::ManyToOne,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -690,7 +690,7 @@ impl TryFrom<&str> for Wrap<Label> {
             "left" => Label::Left,
             "right" => Label::Right,
             "datapoint" => Label::DataPoint,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -710,7 +710,7 @@ impl TryFrom<&str> for Wrap<StartBy> {
             "friday" => StartBy::Friday,
             "saturday" => StartBy::Saturday,
             "sunday" => StartBy::Sunday,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -789,7 +789,7 @@ impl TryFrom<&str> for Wrap<IpcCompression> {
         let parsed = match compression {
             "lz4" => IpcCompression::LZ4,
             "zstd" => IpcCompression::ZSTD,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -804,7 +804,7 @@ impl TryFrom<&str> for Wrap<QuoteStyle> {
             "necessary" => QuoteStyle::Necessary,
             "non_numeric" => QuoteStyle::NonNumeric,
             "never" => QuoteStyle::Never,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }
@@ -818,7 +818,7 @@ impl TryFrom<&str> for Wrap<AsofStrategy> {
             "forward" => AsofStrategy::Forward,
             "backward" => AsofStrategy::Backward,
             "nearest" => AsofStrategy::Nearest,
-            _ => return Err(format!("unreachable")),
+            _ => return Err("unreachable".to_string()),
         };
         Ok(Wrap(parsed))
     }

From 385aabf2a343a3ce337db950922f66f66f580bfc Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 2 Dec 2024 15:57:48 +0100
Subject: [PATCH 51/71] redoc

---
 man/{IO_sink_csv.Rd => lazyframe__sink_csv.Rd}         | 0
 man/{IO_sink_ipc.Rd => lazyframe__sink_ipc.Rd}         | 0
 man/{IO_sink_ndjson.Rd => lazyframe__sink_ndjson.Rd}   | 0
 man/{IO_sink_parquet.Rd => lazyframe__sink_parquet.Rd} | 0
 man/{IO_read_csv.Rd => pl__read_csv.Rd}                | 0
 man/{IO_scan_csv.Rd => pl__scan_csv.Rd}                | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename man/{IO_sink_csv.Rd => lazyframe__sink_csv.Rd} (100%)
 rename man/{IO_sink_ipc.Rd => lazyframe__sink_ipc.Rd} (100%)
 rename man/{IO_sink_ndjson.Rd => lazyframe__sink_ndjson.Rd} (100%)
 rename man/{IO_sink_parquet.Rd => lazyframe__sink_parquet.Rd} (100%)
 rename man/{IO_read_csv.Rd => pl__read_csv.Rd} (100%)
 rename man/{IO_scan_csv.Rd => pl__scan_csv.Rd} (100%)

diff --git a/man/IO_sink_csv.Rd b/man/lazyframe__sink_csv.Rd
similarity index 100%
rename from man/IO_sink_csv.Rd
rename to man/lazyframe__sink_csv.Rd
diff --git a/man/IO_sink_ipc.Rd b/man/lazyframe__sink_ipc.Rd
similarity index 100%
rename from man/IO_sink_ipc.Rd
rename to man/lazyframe__sink_ipc.Rd
diff --git a/man/IO_sink_ndjson.Rd b/man/lazyframe__sink_ndjson.Rd
similarity index 100%
rename from man/IO_sink_ndjson.Rd
rename to man/lazyframe__sink_ndjson.Rd
diff --git a/man/IO_sink_parquet.Rd b/man/lazyframe__sink_parquet.Rd
similarity index 100%
rename from man/IO_sink_parquet.Rd
rename to man/lazyframe__sink_parquet.Rd
diff --git a/man/IO_read_csv.Rd b/man/pl__read_csv.Rd
similarity index 100%
rename from man/IO_read_csv.Rd
rename to man/pl__read_csv.Rd
diff --git a/man/IO_scan_csv.Rd b/man/pl__scan_csv.Rd
similarity index 100%
rename from man/IO_scan_csv.Rd
rename to man/pl__scan_csv.Rd

From b39dae648a7931a53f110029bfeaafa5a099cb94 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 3 Dec 2024 11:04:41 +0100
Subject: [PATCH 52/71] formatting csv

---
 R/io-csv-functions.R | 130 +++++++++++++++++++++----------------------
 1 file changed, 64 insertions(+), 66 deletions(-)

diff --git a/R/io-csv-functions.R b/R/io-csv-functions.R
index fc9715d8..0eece9f6 100644
--- a/R/io-csv-functions.R
+++ b/R/io-csv-functions.R
@@ -110,39 +110,38 @@
 #' lazy_frame$collect()
 #' unlink(my_file)
 pl__scan_csv <- function(
-  source,
-  ...,
-  has_header = TRUE,
-  separator = ",",
-  comment_prefix = NULL,
-  quote_char = '"',
-  skip_rows = 0,
-  schema = NULL,
-  schema_overrides = NULL,
-  null_values = NULL,
-  missing_utf8_is_empty_string = FALSE,
-  ignore_errors = FALSE,
-  cache = FALSE,
-  infer_schema = TRUE,
-  infer_schema_length = 100,
-  n_rows = NULL,
-  encoding = c("utf8", "utf8-lossy"),
-  low_memory = FALSE,
-  rechunk = FALSE,
-  skip_rows_after_header = 0,
-  row_index_name = NULL,
-  row_index_offset = 0,
-  try_parse_dates = FALSE,
-  eol_char = "\n",
-  raise_if_empty = TRUE,
-  truncate_ragged_lines = FALSE,
-  decimal_comma = FALSE,
-  glob = TRUE,
-  storage_options = NULL,
-  retries = 2,
-  file_cache_ttl = NULL,
-  include_file_paths = NULL
-) {
+    source,
+    ...,
+    has_header = TRUE,
+    separator = ",",
+    comment_prefix = NULL,
+    quote_char = '"',
+    skip_rows = 0,
+    schema = NULL,
+    schema_overrides = NULL,
+    null_values = NULL,
+    missing_utf8_is_empty_string = FALSE,
+    ignore_errors = FALSE,
+    cache = FALSE,
+    infer_schema = TRUE,
+    infer_schema_length = 100,
+    n_rows = NULL,
+    encoding = c("utf8", "utf8-lossy"),
+    low_memory = FALSE,
+    rechunk = FALSE,
+    skip_rows_after_header = 0,
+    row_index_name = NULL,
+    row_index_offset = 0,
+    try_parse_dates = FALSE,
+    eol_char = "\n",
+    raise_if_empty = TRUE,
+    truncate_ragged_lines = FALSE,
+    decimal_comma = FALSE,
+    glob = TRUE,
+    storage_options = NULL,
+    retries = 2,
+    file_cache_ttl = NULL,
+    include_file_paths = NULL) {
   check_dots_empty0(...)
   check_character(source, allow_na = FALSE)
   if (length(source) == 0) {
@@ -209,39 +208,38 @@ pl__scan_csv <- function(
 #' pl$read_csv(my_file)
 #' unlink(my_file)
 pl__read_csv <- function(
-  source,
-  ...,
-  has_header = TRUE,
-  separator = ",",
-  comment_prefix = NULL,
-  quote_char = '"',
-  skip_rows = 0,
-  schema = NULL,
-  schema_overrides = NULL,
-  null_values = NULL,
-  missing_utf8_is_empty_string = FALSE,
-  ignore_errors = FALSE,
-  cache = FALSE,
-  infer_schema = TRUE,
-  infer_schema_length = 100,
-  n_rows = NULL,
-  encoding = c("utf8", "utf8-lossy"),
-  low_memory = FALSE,
-  rechunk = FALSE,
-  skip_rows_after_header = 0,
-  row_index_name = NULL,
-  row_index_offset = 0,
-  try_parse_dates = FALSE,
-  eol_char = "\n",
-  raise_if_empty = TRUE,
-  truncate_ragged_lines = FALSE,
-  decimal_comma = FALSE,
-  glob = TRUE,
-  storage_options = NULL,
-  retries = 2,
-  file_cache_ttl = NULL,
-  include_file_paths = NULL
-) {
+    source,
+    ...,
+    has_header = TRUE,
+    separator = ",",
+    comment_prefix = NULL,
+    quote_char = '"',
+    skip_rows = 0,
+    schema = NULL,
+    schema_overrides = NULL,
+    null_values = NULL,
+    missing_utf8_is_empty_string = FALSE,
+    ignore_errors = FALSE,
+    cache = FALSE,
+    infer_schema = TRUE,
+    infer_schema_length = 100,
+    n_rows = NULL,
+    encoding = c("utf8", "utf8-lossy"),
+    low_memory = FALSE,
+    rechunk = FALSE,
+    skip_rows_after_header = 0,
+    row_index_name = NULL,
+    row_index_offset = 0,
+    try_parse_dates = FALSE,
+    eol_char = "\n",
+    raise_if_empty = TRUE,
+    truncate_ragged_lines = FALSE,
+    decimal_comma = FALSE,
+    glob = TRUE,
+    storage_options = NULL,
+    retries = 2,
+    file_cache_ttl = NULL,
+    include_file_paths = NULL) {
   check_dots_empty0(...)
   .args <- as.list(environment())
   do.call(pl$scan_csv, .args)$collect() |>

From c8b08ea6e3724c44ca925027776d1a49bacc165e Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 3 Dec 2024 11:41:16 +0100
Subject: [PATCH 53/71] rework unnest()

---
 R/dataframe-frame.R                   | 12 +++++++++---
 R/lazyframe-frame.R                   |  2 ++
 man/lazyframe__unnest.Rd              |  1 +
 tests/testthat/test-lazyframe-frame.R | 19 ++++++++++++++++++-
 4 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R
index 3994cfcf..7b799ff6 100644
--- a/R/dataframe-frame.R
+++ b/R/dataframe-frame.R
@@ -557,8 +557,14 @@ dataframe__rechunk <- function() {
 #' df$unnest("a_and_c")
 dataframe__unnest <- function(...) {
   wrap({
-    columns <- unlist(list2(...))
-    check_character(columns, allow_na = FALSE)
-    self$`_df`$unnest(columns)
+    check_dots_unnamed()
+    dots <- list2(...)
+
+    if (is_list_of_string(dots)) {
+      dots <- unlist(dots)
+    } else {
+      abort("All elements of `...` must be strings.")
+    }
+    self$`_df`$unnest(dots)
   })
 }
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 571b9495..e68477ac 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1676,8 +1676,10 @@ lazyframe__clone <- function() {
 #' lf$collect()
 #'
 #' lf$unnest("a_and_c")$collect()
+#' lf$unnest(pl$col("a_and_c"))$collect()
 lazyframe__unnest <- function(...) {
   wrap({
+    check_dots_unnamed()
     columns <- parse_into_list_of_expressions(...)
     self$`_ldf`$unnest(columns)
   })
diff --git a/man/lazyframe__unnest.Rd b/man/lazyframe__unnest.Rd
index a316732d..844d86f1 100644
--- a/man/lazyframe__unnest.Rd
+++ b/man/lazyframe__unnest.Rd
@@ -30,4 +30,5 @@ lf <- pl$LazyFrame(
 lf$collect()
 
 lf$unnest("a_and_c")$collect()
+lf$unnest(pl$col("a_and_c"))$collect()
 }
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index de0f7b75..11f568f6 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -1367,7 +1367,16 @@ test_that("unnest", {
     .input = df2,
     df
   )
-
+  # TODO: different accepted input in eager and lazy API, might be a bug
+  # https://github.com/pola-rs/polars/issues/20128
+  expect_equal(
+    df2$lazy()$unnest(pl$col("first_struct", "second_struct"))$collect(),
+    df
+  )
+  expect_error(
+    df2$unnest(pl$col("first_struct", "second_struct")),
+    "must be strings"
+  )
   expect_query_equal(
     .input$unnest("first_struct"),
     .input = df2,
@@ -1377,4 +1386,12 @@ test_that("unnest", {
       pl$struct(c("d", "e", "f"))$alias("second_struct")
     )
   )
+  expect_error(
+    df$unnest(a = "first_struct"),
+    "must be passed by position"
+  )
+  expect_error(
+    df$lazy()$unnest(a = "first_struct"),
+    "must be passed by position"
+  )
 })

From 4adf200a4b22b4db5727f6016ec4046cde9235f2 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Tue, 3 Dec 2024 11:41:24 +0100
Subject: [PATCH 54/71] snapshots

---
 tests/testthat/_snaps/lazyframe-frame.md | 87 +++++++++++++++++++-----
 1 file changed, 69 insertions(+), 18 deletions(-)

diff --git a/tests/testthat/_snaps/lazyframe-frame.md b/tests/testthat/_snaps/lazyframe-frame.md
index 681c5d46..b0e277fb 100644
--- a/tests/testthat/_snaps/lazyframe-frame.md
+++ b/tests/testthat/_snaps/lazyframe-frame.md
@@ -1,28 +1,79 @@
-# unnest works correctly
+# $explain() works
 
     Code
-      current$collect()
-    Condition
-      Error in `current$collect()`:
-      ! Evaluation failed in `$collect()`.
-      Caused by error:
-      ! Invalid operation: invalid selector expression: dyn float: 1.0
-      
-      Resolved plan until failure:
-      
-      	---> FAILED HERE RESOLVING THIS_NODE <---
-       SELECT [dyn float: 1.0.alias("foo"), col("b").as_struct(), col("a").as_struct([col("c")]).alias("a_and_c")] FROM
-        DF ["a", "b", "c"]; PROJECT */3 COLUMNS; SELECTION: None
+      cat(lazy_query$explain(optimized = FALSE))
+    Output
+      FILTER [(col("Species")) != (String(setosa))] FROM
+        SORT BY [col("Species")]
+          DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]; PROJECT */5 COLUMNS; SELECTION: None
+
+---
+
+    Code
+      cat(lazy_query$explain())
+    Output
+      SORT BY [col("Species")]
+        DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"]; PROJECT */5 COLUMNS; SELECTION: [(col("Species")) != (String(setosa))]
 
 ---
 
     Code
-      current$collect()
+      cat(lazy_query$explain(format = "tree", optimized = FALSE))
     Output
-      polars: closing concurrent R handler
+                                0                               1                                             2
+         ┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
+         │
+         │                  ╭────────╮
+       0 │                  │ FILTER │
+         │                  ╰───┬┬───╯
+         │                      ││
+         │                      │╰──────────────────────────────╮
+         │                      │                               │
+         │  ╭───────────────────┴────────────────────╮     ╭────┴────╮
+         │  │ predicate:                             │     │ FROM:   │
+       1 │  │ [(col("Species")) != (String(setosa))] │     │ SORT BY │
+         │  ╰────────────────────────────────────────╯     ╰────┬┬───╯
+         │                                                      ││
+         │                                                      │╰────────────────────────────────────────────╮
+         │                                                      │                                             │
+         │                                              ╭───────┴────────╮  ╭─────────────────────────────────┴─────────────────────────────────╮
+         │                                              │ expression:    │  │ DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"] │
+       2 │                                              │ col("Species") │  │ PROJECT */5 COLUMNS                                               │
+         │                                              ╰────────────────╯  ╰───────────────────────────────────────────────────────────────────╯
+
+---
+
+    Code
+      cat(lazy_query$explain(format = "tree", ))
+    Output
+                    0                                             1
+         ┌───────────────────────────────────────────────────────────────────────────────────────────
+         │
+         │     ╭─────────╮
+       0 │     │ SORT BY │
+         │     ╰────┬┬───╯
+         │          ││
+         │          │╰────────────────────────────────────────────╮
+         │          │                                             │
+         │  ╭───────┴────────╮  ╭─────────────────────────────────┴─────────────────────────────────╮
+         │  │ expression:    │  │ DF ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"] │
+       1 │  │ col("Species") │  │ PROJECT */5 COLUMNS                                               │
+         │  ╰────────────────╯  ╰─────────────────────────────────┬─────────────────────────────────╯
+         │                                                        │
+         │                                                        │
+         │                                                        │
+         │                                    ╭───────────────────┴────────────────────╮
+         │                                    │ SELECTION:                             │
+       2 │                                    │ [(col("Species")) != (String(setosa))] │
+         │                                    ╰────────────────────────────────────────╯
+
+# join_asof
+
+    Code
+      l_gdp$lazy()$join_asof(l_pop$lazy(), on = "date", strategy = "fruitcake")
     Condition
-      Error in `current$collect()`:
-      ! Evaluation failed in `$collect()`.
+      Error:
+      ! Evaluation failed in `$join_asof()`.
       Caused by error:
-      ! A polars sub-thread panicked. See panic msg, which is likely more informative than this error: Any { .. }
+      ! `strategy` must be one of "backward", "forward", or "nearest", not "fruitcake".
 

From b0de8b74c45ef5da6f6ea276a17e796e7a8f6a41 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 16:16:21 +0100
Subject: [PATCH 55/71] remove fetch

---
 R/lazyframe-frame.R       | 80 -----------------------------------
 man/lazyframe__collect.Rd |  1 -
 man/lazyframe__fetch.Rd   | 89 ---------------------------------------
 man/lazyframe__profile.Rd |  2 -
 4 files changed, 172 deletions(-)
 delete mode 100644 man/lazyframe__fetch.Rd

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index e68477ac..3624745c 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -193,7 +193,6 @@ lazyframe__group_by <- function(..., .maintain_order = FALSE) {
 #' @inherit as_polars_lf return
 #'
 #' @seealso
-#'  - [`$fetch()`][lazyframe__fetch] - fast limited query check
 #'  - [`$profile()`][lazyframe__profile] - same as `$collect()` but also returns
 #'    a table with each operation profiled.
 #'  - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
@@ -305,7 +304,6 @@ lazyframe__collect_schema <- function() {
 #' also stored in the list.
 #' @seealso
 #'  - [`$collect()`][LazyFrame_collect] - regular collect.
-#'  - [`$fetch()`][LazyFrame_fetch] - fast limited query check
 #'  - [`$collect_in_background()`][LazyFrame_collect_in_background] - non-blocking
 #'    collect returns a future handle. Can also just be used via
 #'    `$collect(collect_in_background = TRUE)`.
@@ -1385,83 +1383,6 @@ lazyframe__rename <- function(mapping, ..., strict = TRUE) {
   })
 }
 
-#' Fetch `n` rows of a LazyFrame
-#'
-#' This is similar to `$collect()` but limit the number of rows to collect. It
-#' is mostly useful to check that a query works as expected.
-#'
-#'
-#' @details
-#' `$fetch()` does not guarantee the final number of rows in the DataFrame output.
-#' It only guarantees that `n` rows are used at the beginning of the query.
-#' Filters, join operations and a lower number of rows available in the scanned
-#' file influence the final number of rows.
-#'
-#' @param n_rows Integer. Maximum number of rows to fetch.
-#' @inheritParams lazyframe__collect
-#' @return A DataFrame of maximum n_rows
-#' @seealso
-#'  - [`$collect()`][lazyframe__collect] - regular collect.
-#'  - [`$profile()`][lazyframe__profile] - same as `$collect()` but also returns
-#'    a table with each operation profiled.
-#'  - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
-#'    collect returns a future handle. Can also just be used via
-#'    `$collect(collect_in_background = TRUE)`.
-#'  - [`$sink_parquet()`][lazyframe__sink_parquet()] streams query to a parquet file.
-#'  - [`$sink_ipc()`][lazyframe__sink_ipc()] streams query to a arrow file.
-#'
-#' @examples
-#' # fetch 3 rows
-#' as_polars_lf(iris)$fetch(3)
-#'
-#' # this fetch-query returns 4 rows, because we started with 3 and appended one
-#' # row in the query (see section 'Details')
-#' as_polars_lf(iris)$
-#'   select(pl$col("Species")$append("flora gigantica, alien"))$
-#'   fetch(3)
-lazyframe__fetch <- function(
-    n_rows = 500,
-    ...,
-    type_coercion = TRUE,
-    predicate_pushdown = TRUE,
-    projection_pushdown = TRUE,
-    simplify_expression = TRUE,
-    slice_pushdown = TRUE,
-    comm_subplan_elim = TRUE,
-    comm_subexpr_elim = TRUE,
-    cluster_with_columns = TRUE,
-    streaming = FALSE,
-    no_optimization = FALSE) {
-  if (isTRUE(no_optimization)) {
-    predicate_pushdown <- FALSE
-    projection_pushdown <- FALSE
-    slice_pushdown <- FALSE
-    comm_subplan_elim <- FALSE
-    comm_subexpr_elim <- FALSE
-    cluster_with_columns <- FALSE
-  }
-
-  if (isTRUE(streaming)) {
-    comm_subplan_elim <- FALSE
-  }
-
-  lf <- self |>
-    self$`_ldf`$optimization_toggle(
-      pe_coercion = type_coercion,
-      predicate_pushdown = predicate_pushdown,
-      projection_pushdown = projection_pushdown,
-      simplify_expression = simplify_expression,
-      slice_pushdown = slice_pushdown,
-      comm_subplan_elim = comm_subplan_elim,
-      comm_subexpr_elim = comm_subexpr_elim,
-      cluster_with_columns = cluster_with_columns,
-      streaming = streaming,
-      eager = FALSE
-    )
-
-  self$`_ldf`$fetch(n_rows)
-}
-
 #' Collect and profile a lazy query
 #'
 #' @description
@@ -1481,7 +1402,6 @@ lazyframe__fetch <- function(
 #' also stored in the list.
 #' @seealso
 #'  - [`$collect()`][lazyframe__collect] - regular collect.
-#'  - [`$fetch()`][lazyframe__fetch] - fast limited query check
 #'  - [`$collect_in_background()`][lazyframe__collect_in_background] - non-blocking
 #'    collect returns a future handle. Can also just be used via
 #'    `$collect(collect_in_background = TRUE)`.
diff --git a/man/lazyframe__collect.Rd b/man/lazyframe__collect.Rd
index 21b4a51b..b8224afa 100644
--- a/man/lazyframe__collect.Rd
+++ b/man/lazyframe__collect.Rd
@@ -74,7 +74,6 @@ lf$group_by("a")$agg(pl$all()$sum())$collect(
 }
 \seealso{
 \itemize{
-\item \code{\link[=lazyframe__fetch]{$fetch()}} - fast limited query check
 \item \code{\link[=lazyframe__profile]{$profile()}} - same as \verb{$collect()} but also returns
 a table with each operation profiled.
 \item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking
diff --git a/man/lazyframe__fetch.Rd b/man/lazyframe__fetch.Rd
deleted file mode 100644
index 21345e1b..00000000
--- a/man/lazyframe__fetch.Rd
+++ /dev/null
@@ -1,89 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/lazyframe-frame.R
-\name{lazyframe__fetch}
-\alias{lazyframe__fetch}
-\title{Fetch \code{n} rows of a LazyFrame}
-\usage{
-lazyframe__fetch(
-  n_rows = 500,
-  ...,
-  type_coercion = TRUE,
-  predicate_pushdown = TRUE,
-  projection_pushdown = TRUE,
-  simplify_expression = TRUE,
-  slice_pushdown = TRUE,
-  comm_subplan_elim = TRUE,
-  comm_subexpr_elim = TRUE,
-  cluster_with_columns = TRUE,
-  streaming = FALSE,
-  no_optimization = FALSE
-)
-}
-\arguments{
-\item{n_rows}{Integer. Maximum number of rows to fetch.}
-
-\item{...}{Dots which should be empty.}
-
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
-
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
-
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
-
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
-
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
-
-\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
-occur on self-joins or unions.}
-
-\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
-reused.}
-
-\item{cluster_with_columns}{Combine sequential independent calls to
-\code{\link[=lazyframe__with_columns]{with_columns()}}.}
-
-\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
-query in batches to handle larger-than-memory data. If \code{FALSE} (default),
-the entire query is processed in a single batch.}
-}
-\value{
-A DataFrame of maximum n_rows
-}
-\description{
-This is similar to \verb{$collect()} but limit the number of rows to collect. It
-is mostly useful to check that a query works as expected.
-}
-\details{
-\verb{$fetch()} does not guarantee the final number of rows in the DataFrame output.
-It only guarantees that \code{n} rows are used at the beginning of the query.
-Filters, join operations and a lower number of rows available in the scanned
-file influence the final number of rows.
-}
-\examples{
-# fetch 3 rows
-as_polars_lf(iris)$fetch(3)
-
-# this fetch-query returns 4 rows, because we started with 3 and appended one
-# row in the query (see section 'Details')
-as_polars_lf(iris)$
-  select(pl$col("Species")$append("flora gigantica, alien"))$
-  fetch(3)
-}
-\seealso{
-\itemize{
-\item \code{\link[=lazyframe__collect]{$collect()}} - regular collect.
-\item \code{\link[=lazyframe__profile]{$profile()}} - same as \verb{$collect()} but also returns
-a table with each operation profiled.
-\item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking
-collect returns a future handle. Can also just be used via
-\verb{$collect(collect_in_background = TRUE)}.
-\item \code{\link[=lazyframe__sink_parquet]{$sink_parquet()}} streams query to a parquet file.
-\item \code{\link[=lazyframe__sink_ipc]{$sink_ipc()}} streams query to a arrow file.
-}
-}
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
index 8223fe3a..d114e28f 100644
--- a/man/lazyframe__profile.Rd
+++ b/man/lazyframe__profile.Rd
@@ -148,7 +148,6 @@ as_polars_lf(iris)$
 \seealso{
 \itemize{
 \item \code{\link[=LazyFrame_collect]{$collect()}} - regular collect.
-\item \code{\link[=LazyFrame_fetch]{$fetch()}} - fast limited query check
 \item \code{\link[=LazyFrame_collect_in_background]{$collect_in_background()}} - non-blocking
 collect returns a future handle. Can also just be used via
 \verb{$collect(collect_in_background = TRUE)}.
@@ -158,7 +157,6 @@ collect returns a future handle. Can also just be used via
 
 \itemize{
 \item \code{\link[=lazyframe__collect]{$collect()}} - regular collect.
-\item \code{\link[=lazyframe__fetch]{$fetch()}} - fast limited query check
 \item \code{\link[=lazyframe__collect_in_background]{$collect_in_background()}} - non-blocking
 collect returns a future handle. Can also just be used via
 \verb{$collect(collect_in_background = TRUE)}.

From 80fd3a1426cf7e45cad7b6c1e711e2c749d4dac9 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 16:20:05 +0100
Subject: [PATCH 56/71] add new args in sink_ functions

---
 R/000-wrappers.R                  | 16 ++---
 R/lazyframe-frame.R               | 33 ++++++++---
 man/lazyframe__sink_csv.Rd        | 22 ++++++-
 man/lazyframe__sink_ipc.Rd        | 22 ++++++-
 man/lazyframe__sink_ndjson.Rd     | 22 ++++++-
 man/lazyframe__sink_parquet.Rd    | 22 ++++++-
 src/init.c                        | 24 ++++----
 src/rust/api.h                    |  8 +--
 src/rust/src/lazyframe/general.rs | 99 ++++++++++++++++++++++++++++---
 9 files changed, 225 insertions(+), 43 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index fef3a547..a838d0bf 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -3410,26 +3410,26 @@ class(`PlRExpr`) <- c("PlRExpr__bundle", "savvy_neopolars__sealed")
 }
 
 `PlRLazyFrame_sink_parquet` <- function(self) {
-  function(`path`, `compression`, `maintain_order`, `statistics`, `compression_level` = NULL, `row_group_size` = NULL, `data_page_size` = NULL) {
-    invisible(.Call(savvy_PlRLazyFrame_sink_parquet__impl, `self`, `path`, `compression`, `maintain_order`, `statistics`, `compression_level`, `row_group_size`, `data_page_size`))
+  function(`path`, `compression`, `maintain_order`, `statistics`, `retries`, `compression_level` = NULL, `row_group_size` = NULL, `data_page_size` = NULL, `storage_options` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_parquet__impl, `self`, `path`, `compression`, `maintain_order`, `statistics`, `retries`, `compression_level`, `row_group_size`, `data_page_size`, `storage_options`))
   }
 }
 
 `PlRLazyFrame_sink_ipc` <- function(self) {
-  function(`path`, `maintain_order`, `compression` = NULL) {
-    invisible(.Call(savvy_PlRLazyFrame_sink_ipc__impl, `self`, `path`, `maintain_order`, `compression`))
+  function(`path`, `maintain_order`, `retries`, `compression` = NULL, `storage_options` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_ipc__impl, `self`, `path`, `maintain_order`, `retries`, `compression`, `storage_options`))
   }
 }
 
 `PlRLazyFrame_sink_csv` <- function(self) {
-  function(`path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `datetime_format` = NULL, `date_format` = NULL, `time_format` = NULL, `float_scientific` = NULL, `float_precision` = NULL, `null_value` = NULL, `quote_style` = NULL) {
-    invisible(.Call(savvy_PlRLazyFrame_sink_csv__impl, `self`, `path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `datetime_format`, `date_format`, `time_format`, `float_scientific`, `float_precision`, `null_value`, `quote_style`))
+  function(`path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `retries`, `datetime_format` = NULL, `date_format` = NULL, `time_format` = NULL, `float_scientific` = NULL, `float_precision` = NULL, `null_value` = NULL, `quote_style` = NULL, `storage_options` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_csv__impl, `self`, `path`, `include_bom`, `include_header`, `separator`, `line_terminator`, `quote_char`, `maintain_order`, `batch_size`, `retries`, `datetime_format`, `date_format`, `time_format`, `float_scientific`, `float_precision`, `null_value`, `quote_style`, `storage_options`))
   }
 }
 
 `PlRLazyFrame_sink_json` <- function(self) {
-  function(`path`, `maintain_order`) {
-    invisible(.Call(savvy_PlRLazyFrame_sink_json__impl, `self`, `path`, `maintain_order`))
+  function(`path`, `maintain_order`, `retries`, `storage_options` = NULL) {
+    invisible(.Call(savvy_PlRLazyFrame_sink_json__impl, `self`, `path`, `maintain_order`, `retries`, `storage_options`))
   }
 }
 
diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 3624745c..f0dbf095 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -2271,6 +2271,7 @@ lazyframe__with_row_index <- function(name = "index", offset = 0) {
 #' @param maintain_order Maintain the order in which data is processed. Setting
 #' this to `FALSE` will be slightly faster.
 #' @inheritParams lazyframe__collect
+#' @inheritParams pl__scan_parquet
 #'
 #' @return Invisibly returns the input LazyFrame
 #'
@@ -2299,7 +2300,9 @@ lazyframe__sink_parquet <- function(
     projection_pushdown = TRUE,
     simplify_expression = TRUE,
     slice_pushdown = TRUE,
-    no_optimization = FALSE) {
+    no_optimization = FALSE,
+    storage_options = NULL,
+    retries = 2) {
   wrap({
     check_dots_empty0(...)
     compression <- arg_match0(
@@ -2335,7 +2338,9 @@ lazyframe__sink_parquet <- function(
       statistics = statistics,
       row_group_size = row_group_size,
       data_page_size = data_page_size,
-      maintain_order = maintain_order
+      maintain_order = maintain_order,
+      storage_options = storage_options,
+      retries = retries
     )
 
     invisible(self)
@@ -2372,7 +2377,9 @@ lazyframe__sink_ipc <- function(
     projection_pushdown = TRUE,
     simplify_expression = TRUE,
     slice_pushdown = TRUE,
-    no_optimization = FALSE) {
+    no_optimization = FALSE,
+    storage_options = NULL,
+    retries = 2) {
   wrap({
     check_dots_empty0(...)
     compression <- compression %||% "uncompressed"
@@ -2403,7 +2410,9 @@ lazyframe__sink_ipc <- function(
     lf$sink_ipc(
       path = path,
       compression = compression,
-      maintain_order = maintain_order
+      maintain_order = maintain_order,
+      storage_options = storage_options,
+      retries = retries
     )
 
     invisible(self)
@@ -2480,7 +2489,9 @@ lazyframe__sink_csv <- function(
     projection_pushdown = TRUE,
     simplify_expression = TRUE,
     slice_pushdown = TRUE,
-    no_optimization = FALSE) {
+    no_optimization = FALSE,
+    storage_options = NULL,
+    retries = 2) {
   wrap({
     check_dots_empty0(...)
     quote_style <- arg_match0(
@@ -2521,7 +2532,9 @@ lazyframe__sink_csv <- function(
       float_precision = float_precision,
       null_value = null_value,
       quote_style = quote_style,
-      maintain_order = maintain_order
+      maintain_order = maintain_order,
+      storage_options = storage_options,
+      retries = retries
     )
 
     invisible(self)
@@ -2549,7 +2562,9 @@ lazyframe__sink_ndjson <- function(
     projection_pushdown = TRUE,
     simplify_expression = TRUE,
     slice_pushdown = TRUE,
-    no_optimization = FALSE) {
+    no_optimization = FALSE,
+    storage_options = NULL,
+    retries = 2) {
   wrap({
     check_dots_empty0(...)
     if (isTRUE(no_optimization)) {
@@ -2573,7 +2588,9 @@ lazyframe__sink_ndjson <- function(
 
     lf$sink_json(
       path = path,
-      maintain_order = maintain_order
+      maintain_order = maintain_order,
+      storage_options = storage_options,
+      retries = retries
     )
 
     invisible(self)
diff --git a/man/lazyframe__sink_csv.Rd b/man/lazyframe__sink_csv.Rd
index 0ca3a8f5..0c8cac5b 100644
--- a/man/lazyframe__sink_csv.Rd
+++ b/man/lazyframe__sink_csv.Rd
@@ -25,7 +25,9 @@ lazyframe__sink_csv(
   projection_pushdown = TRUE,
   simplify_expression = TRUE,
   slice_pushdown = TRUE,
-  no_optimization = FALSE
+  no_optimization = FALSE,
+  storage_options = NULL,
+  retries = 2
 )
 }
 \arguments{
@@ -97,6 +99,24 @@ folding and replacing expensive operations with faster alternatives.}
 
 \item{slice_pushdown}{Logical. Only load the required slice from the scan
 level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{storage_options}{Named vector containing options that indicate how to
+connect to a cloud provider. The cloud providers currently supported are
+AWS, GCP, and Azure.
+See supported keys here:
+\itemize{
+\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws}
+\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp}
+\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure}
+\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter
+\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment
+variable.
+}
+
+If \code{storage_options} is not provided, Polars will try to infer the
+information from environment variables.}
+
+\item{retries}{Number of retries if accessing a cloud instance fails.}
 }
 \value{
 Invisibly returns the input LazyFrame
diff --git a/man/lazyframe__sink_ipc.Rd b/man/lazyframe__sink_ipc.Rd
index 3d8dfb1d..0eb2d65e 100644
--- a/man/lazyframe__sink_ipc.Rd
+++ b/man/lazyframe__sink_ipc.Rd
@@ -14,7 +14,9 @@ lazyframe__sink_ipc(
   projection_pushdown = TRUE,
   simplify_expression = TRUE,
   slice_pushdown = TRUE,
-  no_optimization = FALSE
+  no_optimization = FALSE,
+  storage_options = NULL,
+  retries = 2
 )
 }
 \arguments{
@@ -46,6 +48,24 @@ folding and replacing expensive operations with faster alternatives.}
 
 \item{slice_pushdown}{Logical. Only load the required slice from the scan
 level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{storage_options}{Named vector containing options that indicate how to
+connect to a cloud provider. The cloud providers currently supported are
+AWS, GCP, and Azure.
+See supported keys here:
+\itemize{
+\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws}
+\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp}
+\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure}
+\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter
+\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment
+variable.
+}
+
+If \code{storage_options} is not provided, Polars will try to infer the
+information from environment variables.}
+
+\item{retries}{Number of retries if accessing a cloud instance fails.}
 }
 \value{
 Invisibly returns the input LazyFrame
diff --git a/man/lazyframe__sink_ndjson.Rd b/man/lazyframe__sink_ndjson.Rd
index 8885e03c..97b0e50e 100644
--- a/man/lazyframe__sink_ndjson.Rd
+++ b/man/lazyframe__sink_ndjson.Rd
@@ -13,7 +13,9 @@ lazyframe__sink_ndjson(
   projection_pushdown = TRUE,
   simplify_expression = TRUE,
   slice_pushdown = TRUE,
-  no_optimization = FALSE
+  no_optimization = FALSE,
+  storage_options = NULL,
+  retries = 2
 )
 }
 \arguments{
@@ -38,6 +40,24 @@ folding and replacing expensive operations with faster alternatives.}
 
 \item{slice_pushdown}{Logical. Only load the required slice from the scan
 level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{storage_options}{Named vector containing options that indicate how to
+connect to a cloud provider. The cloud providers currently supported are
+AWS, GCP, and Azure.
+See supported keys here:
+\itemize{
+\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws}
+\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp}
+\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure}
+\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter
+\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment
+variable.
+}
+
+If \code{storage_options} is not provided, Polars will try to infer the
+information from environment variables.}
+
+\item{retries}{Number of retries if accessing a cloud instance fails.}
 }
 \value{
 Invisibly returns the input LazyFrame
diff --git a/man/lazyframe__sink_parquet.Rd b/man/lazyframe__sink_parquet.Rd
index e74481ff..1a194f12 100644
--- a/man/lazyframe__sink_parquet.Rd
+++ b/man/lazyframe__sink_parquet.Rd
@@ -18,7 +18,9 @@ lazyframe__sink_parquet(
   projection_pushdown = TRUE,
   simplify_expression = TRUE,
   slice_pushdown = TRUE,
-  no_optimization = FALSE
+  no_optimization = FALSE,
+  storage_options = NULL,
+  retries = 2
 )
 }
 \arguments{
@@ -82,6 +84,24 @@ folding and replacing expensive operations with faster alternatives.}
 
 \item{slice_pushdown}{Logical. Only load the required slice from the scan
 level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+
+\item{storage_options}{Named vector containing options that indicate how to
+connect to a cloud provider. The cloud providers currently supported are
+AWS, GCP, and Azure.
+See supported keys here:
+\itemize{
+\item \href{https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html}{aws}
+\item \href{https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html}{gcp}
+\item \href{https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html}{azure}
+\item Hugging Face (\verb{hf://}): Accepts an API key under the token parameter
+\code{c(token = YOUR_TOKEN)} or by setting the \code{HF_TOKEN} environment
+variable.
+}
+
+If \code{storage_options} is not provided, Polars will try to infer the
+information from environment variables.}
+
+\item{retries}{Number of retries if accessing a cloud instance fails.}
 }
 \value{
 Invisibly returns the input LazyFrame
diff --git a/src/init.c b/src/init.c
index b68ee975..7a76ab20 100644
--- a/src/init.c
+++ b/src/init.c
@@ -2334,23 +2334,23 @@ SEXP savvy_PlRLazyFrame_profile__impl(SEXP self__) {
     return handle_result(res);
 }
 
-SEXP savvy_PlRLazyFrame_sink_parquet__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size) {
-    SEXP res = savvy_PlRLazyFrame_sink_parquet__ffi(self__, c_arg__path, c_arg__compression, c_arg__maintain_order, c_arg__statistics, c_arg__compression_level, c_arg__row_group_size, c_arg__data_page_size);
+SEXP savvy_PlRLazyFrame_sink_parquet__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__retries, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size, SEXP c_arg__storage_options) {
+    SEXP res = savvy_PlRLazyFrame_sink_parquet__ffi(self__, c_arg__path, c_arg__compression, c_arg__maintain_order, c_arg__statistics, c_arg__retries, c_arg__compression_level, c_arg__row_group_size, c_arg__data_page_size, c_arg__storage_options);
     return handle_result(res);
 }
 
-SEXP savvy_PlRLazyFrame_sink_ipc__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__compression) {
-    SEXP res = savvy_PlRLazyFrame_sink_ipc__ffi(self__, c_arg__path, c_arg__maintain_order, c_arg__compression);
+SEXP savvy_PlRLazyFrame_sink_ipc__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__compression, SEXP c_arg__storage_options) {
+    SEXP res = savvy_PlRLazyFrame_sink_ipc__ffi(self__, c_arg__path, c_arg__maintain_order, c_arg__retries, c_arg__compression, c_arg__storage_options);
     return handle_result(res);
 }
 
-SEXP savvy_PlRLazyFrame_sink_csv__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style) {
-    SEXP res = savvy_PlRLazyFrame_sink_csv__ffi(self__, c_arg__path, c_arg__include_bom, c_arg__include_header, c_arg__separator, c_arg__line_terminator, c_arg__quote_char, c_arg__maintain_order, c_arg__batch_size, c_arg__datetime_format, c_arg__date_format, c_arg__time_format, c_arg__float_scientific, c_arg__float_precision, c_arg__null_value, c_arg__quote_style);
+SEXP savvy_PlRLazyFrame_sink_csv__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__retries, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style, SEXP c_arg__storage_options) {
+    SEXP res = savvy_PlRLazyFrame_sink_csv__ffi(self__, c_arg__path, c_arg__include_bom, c_arg__include_header, c_arg__separator, c_arg__line_terminator, c_arg__quote_char, c_arg__maintain_order, c_arg__batch_size, c_arg__retries, c_arg__datetime_format, c_arg__date_format, c_arg__time_format, c_arg__float_scientific, c_arg__float_precision, c_arg__null_value, c_arg__quote_style, c_arg__storage_options);
     return handle_result(res);
 }
 
-SEXP savvy_PlRLazyFrame_sink_json__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order) {
-    SEXP res = savvy_PlRLazyFrame_sink_json__ffi(self__, c_arg__path, c_arg__maintain_order);
+SEXP savvy_PlRLazyFrame_sink_json__impl(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__storage_options) {
+    SEXP res = savvy_PlRLazyFrame_sink_json__ffi(self__, c_arg__path, c_arg__maintain_order, c_arg__retries, c_arg__storage_options);
     return handle_result(res);
 }
 
@@ -3196,10 +3196,10 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRLazyFrame_bottom_k__impl", (DL_FUNC) &savvy_PlRLazyFrame_bottom_k__impl, 4},
     {"savvy_PlRLazyFrame_cache__impl", (DL_FUNC) &savvy_PlRLazyFrame_cache__impl, 1},
     {"savvy_PlRLazyFrame_profile__impl", (DL_FUNC) &savvy_PlRLazyFrame_profile__impl, 1},
-    {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 8},
-    {"savvy_PlRLazyFrame_sink_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_ipc__impl, 4},
-    {"savvy_PlRLazyFrame_sink_csv__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_csv__impl, 16},
-    {"savvy_PlRLazyFrame_sink_json__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_json__impl, 3},
+    {"savvy_PlRLazyFrame_sink_parquet__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_parquet__impl, 10},
+    {"savvy_PlRLazyFrame_sink_ipc__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_ipc__impl, 6},
+    {"savvy_PlRLazyFrame_sink_csv__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_csv__impl, 18},
+    {"savvy_PlRLazyFrame_sink_json__impl", (DL_FUNC) &savvy_PlRLazyFrame_sink_json__impl, 5},
     {"savvy_PlRLazyFrame_serialize__impl", (DL_FUNC) &savvy_PlRLazyFrame_serialize__impl, 1},
     {"savvy_PlRLazyFrame_select_seq__impl", (DL_FUNC) &savvy_PlRLazyFrame_select_seq__impl, 2},
     {"savvy_PlRLazyFrame_rolling__impl", (DL_FUNC) &savvy_PlRLazyFrame_rolling__impl, 6},
diff --git a/src/rust/api.h b/src/rust/api.h
index 41c28d8a..0b7a45ae 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -470,10 +470,10 @@ SEXP savvy_PlRLazyFrame_top_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, S
 SEXP savvy_PlRLazyFrame_bottom_k__ffi(SEXP self__, SEXP c_arg__k, SEXP c_arg__by, SEXP c_arg__reverse);
 SEXP savvy_PlRLazyFrame_cache__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_profile__ffi(SEXP self__);
-SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size);
-SEXP savvy_PlRLazyFrame_sink_ipc__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__compression);
-SEXP savvy_PlRLazyFrame_sink_csv__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style);
-SEXP savvy_PlRLazyFrame_sink_json__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order);
+SEXP savvy_PlRLazyFrame_sink_parquet__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__compression, SEXP c_arg__maintain_order, SEXP c_arg__statistics, SEXP c_arg__retries, SEXP c_arg__compression_level, SEXP c_arg__row_group_size, SEXP c_arg__data_page_size, SEXP c_arg__storage_options);
+SEXP savvy_PlRLazyFrame_sink_ipc__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__compression, SEXP c_arg__storage_options);
+SEXP savvy_PlRLazyFrame_sink_csv__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__include_bom, SEXP c_arg__include_header, SEXP c_arg__separator, SEXP c_arg__line_terminator, SEXP c_arg__quote_char, SEXP c_arg__maintain_order, SEXP c_arg__batch_size, SEXP c_arg__retries, SEXP c_arg__datetime_format, SEXP c_arg__date_format, SEXP c_arg__time_format, SEXP c_arg__float_scientific, SEXP c_arg__float_precision, SEXP c_arg__null_value, SEXP c_arg__quote_style, SEXP c_arg__storage_options);
+SEXP savvy_PlRLazyFrame_sink_json__ffi(SEXP self__, SEXP c_arg__path, SEXP c_arg__maintain_order, SEXP c_arg__retries, SEXP c_arg__storage_options);
 SEXP savvy_PlRLazyFrame_serialize__ffi(SEXP self__);
 SEXP savvy_PlRLazyFrame_select_seq__ffi(SEXP self__, SEXP c_arg__exprs);
 SEXP savvy_PlRLazyFrame_rolling__ffi(SEXP self__, SEXP c_arg__index_column, SEXP c_arg__period, SEXP c_arg__offset, SEXP c_arg__closed, SEXP c_arg__by);
diff --git a/src/rust/src/lazyframe/general.rs b/src/rust/src/lazyframe/general.rs
index e41d81cc..fed0dd1d 100644
--- a/src/rust/src/lazyframe/general.rs
+++ b/src/rust/src/lazyframe/general.rs
@@ -323,9 +323,11 @@ impl PlRLazyFrame {
         compression: &str,
         maintain_order: bool,
         statistics: ListSexp,
+        retries: NumericScalar,
         compression_level: Option<NumericScalar>,
         row_group_size: Option<NumericScalar>,
         data_page_size: Option<NumericScalar>,
+        storage_options: Option<StringSexp>,
     ) -> Result<()> {
         let path: PathBuf = path.into();
         let statistics = <Wrap<StatisticsOptions>>::try_from(statistics)?.0;
@@ -342,6 +344,7 @@ impl PlRLazyFrame {
             Some(x) => Some(<Wrap<usize>>::try_from(x)?.0),
             None => None,
         };
+        let retries = <Wrap<usize>>::try_from(retries)?.0;
 
         let options = ParquetWriteOptions {
             compression,
@@ -350,18 +353,41 @@ impl PlRLazyFrame {
             data_page_size,
             maintain_order,
         };
-
+        let cloud_options = match storage_options {
+            Some(x) => {
+                let out = <Wrap<Vec<(String, String)>>>::try_from(x).map_err(|_| {
+                    RPolarsErr::Other(
+                        "`storage_options` must be a named character vector".to_string(),
+                    )
+                })?;
+                Some(out.0)
+            }
+            None => None,
+        };
+        let cloud_options = {
+            let cloud_options =
+                parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
+            Some(cloud_options.with_max_retries(retries))
+        };
         let _ = self
             .ldf
             .clone()
-            .sink_parquet(path, options)
+            .sink_parquet(&path, options, cloud_options)
             .map_err(RPolarsErr::from);
         Ok(())
     }
 
-    fn sink_ipc(&self, path: &str, maintain_order: bool, compression: Option<&str>) -> Result<()> {
+    fn sink_ipc(
+        &self,
+        path: &str,
+        maintain_order: bool,
+        retries: NumericScalar,
+        compression: Option<&str>,
+        storage_options: Option<StringSexp>,
+    ) -> Result<()> {
         let path: PathBuf = path.into();
 
+        let retries = <Wrap<usize>>::try_from(retries)?.0;
         let compression: Option<IpcCompression> = match compression {
             Some(x) => {
                 if x == "uncompressed" {
@@ -378,10 +404,27 @@ impl PlRLazyFrame {
             maintain_order,
         };
 
+        let cloud_options = match storage_options {
+            Some(x) => {
+                let out = <Wrap<Vec<(String, String)>>>::try_from(x).map_err(|_| {
+                    RPolarsErr::Other(
+                        "`storage_options` must be a named character vector".to_string(),
+                    )
+                })?;
+                Some(out.0)
+            }
+            None => None,
+        };
+        let cloud_options = {
+            let cloud_options =
+                parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
+            Some(cloud_options.with_max_retries(retries))
+        };
+
         let _ = self
             .ldf
             .clone()
-            .sink_ipc(path, options)
+            .sink_ipc(&path, options, cloud_options)
             .map_err(RPolarsErr::from);
         Ok(())
     }
@@ -396,6 +439,7 @@ impl PlRLazyFrame {
         quote_char: &str,
         maintain_order: bool,
         batch_size: NumericScalar,
+        retries: NumericScalar,
         datetime_format: Option<&str>,
         date_format: Option<&str>,
         time_format: Option<&str>,
@@ -403,12 +447,14 @@ impl PlRLazyFrame {
         float_precision: Option<NumericScalar>,
         null_value: Option<&str>,
         quote_style: Option<&str>,
+        storage_options: Option<StringSexp>,
     ) -> Result<()> {
         let path: PathBuf = path.into();
         let quote_style = match quote_style {
             Some(x) => <Wrap<QuoteStyle>>::try_from(x)?.0,
             None => QuoteStyle::default(),
         };
+        let retries = <Wrap<usize>>::try_from(retries)?.0;
         let null_value = null_value
             .map(|x| x.to_string())
             .unwrap_or(SerializeOptions::default().null);
@@ -440,23 +486,62 @@ impl PlRLazyFrame {
             batch_size,
             serialize_options,
         };
+        let cloud_options = match storage_options {
+            Some(x) => {
+                let out = <Wrap<Vec<(String, String)>>>::try_from(x).map_err(|_| {
+                    RPolarsErr::Other(
+                        "`storage_options` must be a named character vector".to_string(),
+                    )
+                })?;
+                Some(out.0)
+            }
+            None => None,
+        };
+        let cloud_options = {
+            let cloud_options =
+                parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
+            Some(cloud_options.with_max_retries(retries))
+        };
 
         let _ = self
             .ldf
             .clone()
-            .sink_csv(path, options)
+            .sink_csv(&path, options, cloud_options)
             .map_err(RPolarsErr::from);
         Ok(())
     }
 
-    fn sink_json(&self, path: &str, maintain_order: bool) -> Result<()> {
+    fn sink_json(
+        &self,
+        path: &str,
+        maintain_order: bool,
+        retries: NumericScalar,
+        storage_options: Option<StringSexp>,
+    ) -> Result<()> {
         let path: PathBuf = path.into();
+        let retries = <Wrap<usize>>::try_from(retries)?.0;
         let options = JsonWriterOptions { maintain_order };
+        let cloud_options = match storage_options {
+            Some(x) => {
+                let out = <Wrap<Vec<(String, String)>>>::try_from(x).map_err(|_| {
+                    RPolarsErr::Other(
+                        "`storage_options` must be a named character vector".to_string(),
+                    )
+                })?;
+                Some(out.0)
+            }
+            None => None,
+        };
+        let cloud_options = {
+            let cloud_options =
+                parse_cloud_options(path.to_str().unwrap(), cloud_options.unwrap_or_default())?;
+            Some(cloud_options.with_max_retries(retries))
+        };
 
         let _ = self
             .ldf
             .clone()
-            .sink_json(path, options)
+            .sink_json(&path, options, cloud_options)
             .map_err(RPolarsErr::from);
         Ok(())
     }

From f966da4dc3265383dd9c85a7965450f3332281e1 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 16:48:48 +0100
Subject: [PATCH 57/71] redoc

---
 man/as_polars_df.Rd                | 33 ++++++++++-------------
 man/dataframe__select.Rd           | 10 +++----
 man/dataframe__with_columns.Rd     | 10 +++----
 man/lazyframe__collect.Rd          | 42 ++++++++++++++----------------
 man/lazyframe__explain.Rd          | 31 +++++++++-------------
 man/lazyframe__profile.Rd          | 33 ++++++++++-------------
 man/lazyframe__select.Rd           | 10 +++----
 man/lazyframe__select_seq.Rd       | 10 +++----
 man/lazyframe__sink_csv.Rd         | 17 +++++-------
 man/lazyframe__sink_ipc.Rd         | 17 +++++-------
 man/lazyframe__sink_ndjson.Rd      | 17 +++++-------
 man/lazyframe__sink_parquet.Rd     | 17 +++++-------
 man/lazyframe__to_dot.Rd           | 31 +++++++++-------------
 man/lazyframe__with_columns.Rd     | 10 +++----
 man/lazyframe__with_columns_seq.Rd | 10 +++----
 man/pl__struct.Rd                  | 10 +++----
 16 files changed, 134 insertions(+), 174 deletions(-)

diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd
index d94243f5..4a27ac42 100644
--- a/man/as_polars_df.Rd
+++ b/man/as_polars_df.Rd
@@ -56,33 +56,28 @@ If \code{NULL}, the column name is taken from the \link{Series} name.}
 the \code{\link[=series_struct_unnest]{<Series>$struct$unnest()}} method is used to create a \link{DataFrame}
 from the struct \link{Series}. In this case, the \code{column_name} argument is ignored.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
 
-\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
-occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
-reused.}
+\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{Combine sequential independent calls to
-\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
 
-\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
-query in batches to handle larger-than-memory data. If \code{FALSE} (default),
-the entire query is processed in a single batch.}
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
+
+\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
+If \code{FALSE} (default), the entire query is processed in a single batch.
+Note that streaming mode is considered unstable.
+It may be changed at any point without it being considered a breaking change.}
 }
 \value{
 A polars \link{DataFrame}
diff --git a/man/dataframe__select.Rd b/man/dataframe__select.Rd
index 0fbbc140..9d570adc 100644
--- a/man/dataframe__select.Rd
+++ b/man/dataframe__select.Rd
@@ -7,11 +7,11 @@
 dataframe__select(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{DataFrame}
diff --git a/man/dataframe__with_columns.Rd b/man/dataframe__with_columns.Rd
index 53e6ae05..677dc3c0 100644
--- a/man/dataframe__with_columns.Rd
+++ b/man/dataframe__with_columns.Rd
@@ -7,11 +7,11 @@
 dataframe__with_columns(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{DataFrame}
diff --git a/man/lazyframe__collect.Rd b/man/lazyframe__collect.Rd
index b8224afa..73030a47 100644
--- a/man/lazyframe__collect.Rd
+++ b/man/lazyframe__collect.Rd
@@ -20,44 +20,40 @@ lazyframe__collect(
 )
 }
 \arguments{
-\item{...}{Dots which should be empty.}
+\item{...}{These dots are for future extensions and must be empty.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
 
-\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
-occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
-reused.}
+\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{Combine sequential independent calls to
-\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
 
-\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
-query in batches to handle larger-than-memory data. If \code{FALSE} (default),
-the entire query is processed in a single batch.}
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
-\item{_eager}{A logical, indicates to turn off multi-node optimizations and
-the other optimizations. This option is intended for internal use only.}
+\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
+If \code{FALSE} (default), the entire query is processed in a single batch.
+Note that streaming mode is considered unstable.
+It may be changed at any point without it being considered a breaking change.}
+
+\item{_eager}{A logical, indicates to turn off multi-node optimizations and the other optimizations.
+This option is intended for internal use only.}
 }
 \value{
-A polars \link{LazyFrame}
+A polars \link{DataFrame}
 }
 \description{
 By default, all query optimizations are enabled.
+Individual optimizations may be disabled by setting the corresponding parameter to \code{FALSE}.
 }
 \examples{
 lf <- pl$LazyFrame(
diff --git a/man/lazyframe__explain.Rd b/man/lazyframe__explain.Rd
index f902d783..0a3bab93 100644
--- a/man/lazyframe__explain.Rd
+++ b/man/lazyframe__explain.Rd
@@ -28,33 +28,26 @@ either \code{"plain"} (default) or \code{"tree"}.}
 \item{optimized}{Return an optimized query plan. If \code{TRUE} (default), the
 subsequent optimization flags control which optimizations run.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
 
-\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
-occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
-reused.}
+\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{Combine sequential independent calls to
-\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
 
-\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
-query in batches to handle larger-than-memory data. If \code{FALSE} (default),
-the entire query is processed in a single batch.}
+\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
+If \code{FALSE} (default), the entire query is processed in a single batch.
+Note that streaming mode is considered unstable.
+It may be changed at any point without it being considered a breaking change.}
 }
 \value{
 A character value containing the query plan.
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
index d114e28f..aa913dd7 100644
--- a/man/lazyframe__profile.Rd
+++ b/man/lazyframe__profile.Rd
@@ -37,33 +37,28 @@ lazyframe__profile(
 )
 }
 \arguments{
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
 
-\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
-occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
-reused.}
+\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{Combine sequential independent calls to
-\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
 
-\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
-query in batches to handle larger-than-memory data. If \code{FALSE} (default),
-the entire query is processed in a single batch.}
+\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
+If \code{FALSE} (default), the entire query is processed in a single batch.
+Note that streaming mode is considered unstable.
+It may be changed at any point without it being considered a breaking change.}
+
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
 \item{show_plot}{Show a Gantt chart of the profiling result}
 
diff --git a/man/lazyframe__select.Rd b/man/lazyframe__select.Rd
index e6586188..3ef41b08 100644
--- a/man/lazyframe__select.Rd
+++ b/man/lazyframe__select.Rd
@@ -7,11 +7,11 @@
 lazyframe__select(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{LazyFrame}
diff --git a/man/lazyframe__select_seq.Rd b/man/lazyframe__select_seq.Rd
index 7fd3a7fc..eec4c8fc 100644
--- a/man/lazyframe__select_seq.Rd
+++ b/man/lazyframe__select_seq.Rd
@@ -7,11 +7,11 @@
 lazyframe__select_seq(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{LazyFrame}
diff --git a/man/lazyframe__sink_csv.Rd b/man/lazyframe__sink_csv.Rd
index 0c8cac5b..628dfd28 100644
--- a/man/lazyframe__sink_csv.Rd
+++ b/man/lazyframe__sink_csv.Rd
@@ -85,20 +85,17 @@ integer, then quotes will be used even if they aren`t strictly necessary.
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
 \item{storage_options}{Named vector containing options that indicate how to
 connect to a cloud provider. The cloud providers currently supported are
diff --git a/man/lazyframe__sink_ipc.Rd b/man/lazyframe__sink_ipc.Rd
index 0eb2d65e..3b043054 100644
--- a/man/lazyframe__sink_ipc.Rd
+++ b/man/lazyframe__sink_ipc.Rd
@@ -34,20 +34,17 @@ lazyframe__sink_ipc(
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
 \item{storage_options}{Named vector containing options that indicate how to
 connect to a cloud provider. The cloud providers currently supported are
diff --git a/man/lazyframe__sink_ndjson.Rd b/man/lazyframe__sink_ndjson.Rd
index 97b0e50e..f0ef9a10 100644
--- a/man/lazyframe__sink_ndjson.Rd
+++ b/man/lazyframe__sink_ndjson.Rd
@@ -26,20 +26,17 @@ lazyframe__sink_ndjson(
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
 \item{storage_options}{Named vector containing options that indicate how to
 connect to a cloud provider. The cloud providers currently supported are
diff --git a/man/lazyframe__sink_parquet.Rd b/man/lazyframe__sink_parquet.Rd
index 1a194f12..4583f132 100644
--- a/man/lazyframe__sink_parquet.Rd
+++ b/man/lazyframe__sink_parquet.Rd
@@ -70,20 +70,17 @@ is set to 1024^2 bytes.}
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+
+\item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
 \item{storage_options}{Named vector containing options that indicate how to
 connect to a cloud provider. The cloud providers currently supported are
diff --git a/man/lazyframe__to_dot.Rd b/man/lazyframe__to_dot.Rd
index 3b650103..a83c4dfd 100644
--- a/man/lazyframe__to_dot.Rd
+++ b/man/lazyframe__to_dot.Rd
@@ -23,33 +23,26 @@ lazyframe__to_dot(
 
 \item{optimized}{Optimize the query plan.}
 
-\item{type_coercion}{Logical. Coerce types such that operations succeed and
-run on minimal required memory.}
+\item{type_coercion}{A logical, indicates type coercion optimization.}
 
-\item{predicate_pushdown}{Logical. Applies filters as early as possible at
-scan level.}
+\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
 
-\item{projection_pushdown}{Logical. Select only the columns that are needed
-at the scan level.}
+\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
 
-\item{simplify_expression}{Logical. Various optimizations, such as constant
-folding and replacing expensive operations with faster alternatives.}
+\item{simplify_expression}{A logical, indicates simplify expression optimization.}
 
-\item{slice_pushdown}{Logical. Only load the required slice from the scan
-level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).}
+\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
 
-\item{comm_subplan_elim}{Logical. Will try to cache branching subplans that
-occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{Logical. Common subexpressions will be cached and
-reused.}
+\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{Combine sequential independent calls to
-\code{\link[=lazyframe__with_columns]{with_columns()}}.}
+\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
 
-\item{streaming}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. Process the
-query in batches to handle larger-than-memory data. If \code{FALSE} (default),
-the entire query is processed in a single batch.}
+\item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
+If \code{FALSE} (default), the entire query is processed in a single batch.
+Note that streaming mode is considered unstable.
+It may be changed at any point without it being considered a breaking change.}
 }
 \value{
 A character vector
diff --git a/man/lazyframe__with_columns.Rd b/man/lazyframe__with_columns.Rd
index 77d29306..60262f97 100644
--- a/man/lazyframe__with_columns.Rd
+++ b/man/lazyframe__with_columns.Rd
@@ -7,11 +7,11 @@
 lazyframe__with_columns(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{LazyFrame}
diff --git a/man/lazyframe__with_columns_seq.Rd b/man/lazyframe__with_columns_seq.Rd
index f2928535..59dfc908 100644
--- a/man/lazyframe__with_columns_seq.Rd
+++ b/man/lazyframe__with_columns_seq.Rd
@@ -7,11 +7,11 @@
 lazyframe__with_columns_seq(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{LazyFrame}
diff --git a/man/pl__struct.Rd b/man/pl__struct.Rd
index c6424779..1f2daa36 100644
--- a/man/pl__struct.Rd
+++ b/man/pl__struct.Rd
@@ -7,11 +7,11 @@
 pl__struct(...)
 }
 \arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name-value pairs of objects
-to be converted to polars \link[=Expr]{expressions} by the \code{\link[=as_polars_expr]{as_polars_expr()}}
-function. Characters are parsed as column names, other non-expression inputs
-are parsed as \link[=pl__lit]{literals}. Each name will be used as the expression
-name.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>
+Name-value pairs of objects to be converted to polars \link[=Expr]{expressions}
+by the \code{\link[=as_polars_expr]{as_polars_expr()}} function.
+Characters are parsed as column names, other non-expression inputs are parsed as \link[=pl__lit]{literals}.
+Each name will be used as the expression name.}
 }
 \value{
 A polars \link{expression}

From 512910eda13f7b7eb921bb41bee5738a001ba744 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 16:49:02 +0100
Subject: [PATCH 58/71] answer some whys

---
 R/lazyframe-frame.R                   | 60 +++++++++++++--------------
 tests/testthat/helper-expections.R    |  1 +
 tests/testthat/test-lazyframe-frame.R |  4 +-
 3 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index f0dbf095..1de88f0d 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -57,12 +57,12 @@ wrap.PlRLazyFrame <- function(x, ...) {
 #' `$select()` call. For instance, if you create a variable `x`, you will only
 #' be able to use it in another `$select()` or `$with_columns()` call.
 #'
-#' @inherit as_polars_lf return
-#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Name-value pairs of objects
-#' to be converted to polars [expressions][Expr] by the [as_polars_expr()]
-#' function. Characters are parsed as column names, other non-expression inputs
-#' are parsed as [literals][pl__lit]. Each name will be used as the expression
-#' name.
+#' @inherit pl__LazyFrame return
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]>
+#' Name-value pairs of objects to be converted to polars [expressions][Expr]
+#' by the [as_polars_expr()] function.
+#' Characters are parsed as column names, other non-expression inputs are parsed as [literals][pl__lit].
+#' Each name will be used as the expression name.
 #' @examples
 #' # Pass the name of a column to select that column.
 #' lf <- pl$LazyFrame(
@@ -96,6 +96,7 @@ wrap.PlRLazyFrame <- function(x, ...) {
 lazyframe__select <- function(...) {
   wrap({
     structify <- parse_env_auto_structify()
+
     parse_into_list_of_expressions(..., `__structify` = structify) |>
       self$`_ldf`$select()
   })
@@ -163,32 +164,28 @@ lazyframe__group_by <- function(..., .maintain_order = FALSE) {
   })
 }
 
+# TODO: see also section
 #' Materialize this LazyFrame into a DataFrame
 #'
 #' By default, all query optimizations are enabled.
-#'
-#' @inheritParams rlang::check_dots_empty0
-#' @param type_coercion Logical. Coerce types such that operations succeed and
-#' run on minimal required memory.
-#' @param predicate_pushdown Logical. Applies filters as early as possible at
-#' scan level.
-#' @param projection_pushdown Logical. Select only the columns that are needed
-#' at the scan level.
-#' @param simplify_expression Logical. Various optimizations, such as constant
-#' folding and replacing expensive operations with faster alternatives.
-#' @param slice_pushdown Logical. Only load the required slice from the scan
-#' level. Don't materialize sliced outputs (e.g. `join$head(10)`).
-#' @param comm_subplan_elim Logical. Will try to cache branching subplans that
-#'  occur on self-joins or unions.
-#' @param comm_subexpr_elim Logical. Common subexpressions will be cached and
-#' reused.
-#' @param cluster_with_columns Combine sequential independent calls to
-#' [`with_columns()`][lazyframe__with_columns].
-#' @param streaming `r lifecycle::badge("experimental")` Logical. Process the
-#' query in batches to handle larger-than-memory data. If `FALSE` (default),
-#' the entire query is processed in a single batch.
-#' @param _eager A logical, indicates to turn off multi-node optimizations and
-#' the other optimizations. This option is intended for internal use only.
+#' Individual optimizations may be disabled by setting the corresponding parameter to `FALSE`.
+#' @inherit pl__DataFrame return
+#' @inheritParams rlang::args_dots_empty
+#' @param type_coercion A logical, indicates type coercion optimization.
+#' @param predicate_pushdown A logical, indicates predicate pushdown optimization.
+#' @param projection_pushdown A logical, indicates projection pushdown optimization.
+#' @param simplify_expression A logical, indicates simplify expression optimization.
+#' @param slice_pushdown A logical, indicates slice pushdown optimization.
+#' @param comm_subplan_elim A logical, indicates trying to cache branching subplans that occur on self-joins or unions.
+#' @param comm_subexpr_elim A logical, indicates trying to cache common subexpressions.
+#' @param cluster_with_columns A logical, indicates to combine sequential independent calls to with_columns.
+#' @param no_optimization A logical. If `TRUE`, turn off (certain) optimizations.
+#' @param streaming A logical. If `TRUE`, process the query in batches to handle larger-than-memory data.
+#' If `FALSE` (default), the entire query is processed in a single batch.
+#' Note that streaming mode is considered unstable.
+#' It may be changed at any point without it being considered a breaking change.
+#' @param _eager A logical, indicates to turn off multi-node optimizations and the other optimizations.
+#' This option is intended for internal use only.
 #'
 #' @inherit as_polars_lf return
 #'
@@ -439,6 +436,7 @@ lazyframe__explain <- function(
     streaming = FALSE) {
   wrap({
     check_dots_empty0(...)
+
     format <- arg_match0(format, c("plain", "tree"))
 
     if (isTRUE(optimized)) {
@@ -587,6 +585,7 @@ lazyframe__sort <- function(
     maintain_order = FALSE) {
   wrap({
     check_dots_unnamed()
+
     by <- parse_into_list_of_expressions(...)
     if (length(by) == 0) {
       abort("`...` must contain at least one element.")
@@ -615,7 +614,7 @@ lazyframe__sort <- function(
 #' variable `x`, you will only be able to use it in another `$with_columns()`
 #' or `$select()` call.
 #'
-#' @inherit as_polars_lf return
+#' @inherit pl__LazyFrame return
 #' @inheritParams lazyframe__select
 #' @examples
 #' # Pass an expression to add it as a new column.
@@ -743,6 +742,7 @@ lazyframe__with_columns_seq <- function(...) {
 lazyframe__drop <- function(..., strict = TRUE) {
   wrap({
     check_dots_unnamed()
+
     parse_into_list_of_expressions(...) |>
       self$`_ldf`$drop(strict)
   })
diff --git a/tests/testthat/helper-expections.R b/tests/testthat/helper-expections.R
index 55d55833..2cfa75d1 100644
--- a/tests/testthat/helper-expections.R
+++ b/tests/testthat/helper-expections.R
@@ -90,5 +90,6 @@ expect_eager_equal_lazy_error <- function(object, input, expected, regexp = NULL
     class = class,
     ...
   )
+
   invisible(NULL)
 }
diff --git a/tests/testthat/test-lazyframe-frame.R b/tests/testthat/test-lazyframe-frame.R
index 11f568f6..bbf08460 100644
--- a/tests/testthat/test-lazyframe-frame.R
+++ b/tests/testthat/test-lazyframe-frame.R
@@ -111,7 +111,7 @@ test_that("slice/head/tail", {
     .input$head(-4),
     .data,
     pl$DataFrame(foo = 1L, bar = 6L),
-    r"(-4.0 is out of range that can be safely converted to u32)"
+    r"(-4\.0 is out of range that can be safely converted to u32)"
   )
 
   # tail
@@ -129,7 +129,7 @@ test_that("slice/head/tail", {
     .input$tail(-4),
     .data,
     pl$DataFrame(foo = 5L, bar = 10L),
-    r"(-4.0 is out of range that can be safely converted to u32)"
+    r"(-4\.0 is out of range that can be safely converted to u32)"
   )
 })
 

From 7dfc8fb0e1fd6e1325df8f675f45687b3cca8cab Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 16:52:54 +0100
Subject: [PATCH 59/71] remove clone_in_rust

---
 R/lazyframe-frame.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 1de88f0d..54a561e8 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1570,7 +1570,7 @@ lazyframe__explode <- function(...) {
 #' # now, the original LazyFrame doesn't get this attribute
 #' attributes(df1)
 lazyframe__clone <- function() {
-  self$`_ldf`$clone_in_rust()
+  self$`_ldf`$clone()
 }
 
 
From abf7e8c782980f55ce1e758ed015c515cab60e81 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 16:57:09 +0100
Subject: [PATCH 60/71] use lazy API in eager unnest()

---
 R/000-wrappers.R                  |  7 -------
 R/dataframe-frame.R               | 13 ++-----------
 src/init.c                        |  6 ------
 src/rust/api.h                    |  1 -
 src/rust/src/dataframe/general.rs | 10 ----------
 5 files changed, 2 insertions(+), 35 deletions(-)

diff --git a/R/000-wrappers.R b/R/000-wrappers.R
index a838d0bf..72bfead1 100644
--- a/R/000-wrappers.R
+++ b/R/000-wrappers.R
@@ -475,12 +475,6 @@ class(`PlRChainedWhen`) <- c("PlRChainedWhen__bundle", "savvy_neopolars__sealed"
   }
 }
 
-`PlRDataFrame_unnest` <- function(self) {
-  function(`columns`) {
-    .savvy_wrap_PlRDataFrame(.Call(savvy_PlRDataFrame_unnest__impl, `self`, `columns`))
-  }
-}
-
 `.savvy_wrap_PlRDataFrame` <- function(ptr) {
   e <- new.env(parent = emptyenv())
   e$.ptr <- ptr
@@ -502,7 +496,6 @@ class(`PlRChainedWhen`) <- c("PlRChainedWhen__bundle", "savvy_neopolars__sealed"
   e$`to_struct` <- `PlRDataFrame_to_struct`(ptr)
   e$`n_chunks` <- `PlRDataFrame_n_chunks`(ptr)
   e$`rechunk` <- `PlRDataFrame_rechunk`(ptr)
-  e$`unnest` <- `PlRDataFrame_unnest`(ptr)
 
   class(e) <- c("PlRDataFrame", "savvy_neopolars__sealed")
   e
diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R
index 7b799ff6..6bcf66b7 100644
--- a/R/dataframe-frame.R
+++ b/R/dataframe-frame.R
@@ -556,15 +556,6 @@ dataframe__rechunk <- function() {
 #'
 #' df$unnest("a_and_c")
 dataframe__unnest <- function(...) {
-  wrap({
-    check_dots_unnamed()
-    dots <- list2(...)
-
-    if (is_list_of_string(dots)) {
-      dots <- unlist(dots)
-    } else {
-      abort("All elements of `...` must be strings.")
-    }
-    self$`_df`$unnest(dots)
-  })
+  self$lazy()$unnest(...)$collect(`_eager` = TRUE) |>
+    wrap()
 }
diff --git a/src/init.c b/src/init.c
index 7a76ab20..d05a21b1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -359,11 +359,6 @@ SEXP savvy_PlRDataFrame_rechunk__impl(SEXP self__) {
     return handle_result(res);
 }
 
-SEXP savvy_PlRDataFrame_unnest__impl(SEXP self__, SEXP c_arg__columns) {
-    SEXP res = savvy_PlRDataFrame_unnest__ffi(self__, c_arg__columns);
-    return handle_result(res);
-}
-
 SEXP savvy_PlRDataFrame_read_ipc_stream__impl(SEXP c_arg__source, SEXP c_arg__row_index_offset, SEXP c_arg__rechunk, SEXP c_arg__columns, SEXP c_arg__projection, SEXP c_arg__n_rows, SEXP c_arg__row_index_name) {
     SEXP res = savvy_PlRDataFrame_read_ipc_stream__ffi(c_arg__source, c_arg__row_index_offset, c_arg__rechunk, c_arg__columns, c_arg__projection, c_arg__n_rows, c_arg__row_index_name);
     return handle_result(res);
@@ -2801,7 +2796,6 @@ static const R_CallMethodDef CallEntries[] = {
     {"savvy_PlRDataFrame_to_struct__impl", (DL_FUNC) &savvy_PlRDataFrame_to_struct__impl, 2},
     {"savvy_PlRDataFrame_n_chunks__impl", (DL_FUNC) &savvy_PlRDataFrame_n_chunks__impl, 1},
     {"savvy_PlRDataFrame_rechunk__impl", (DL_FUNC) &savvy_PlRDataFrame_rechunk__impl, 1},
-    {"savvy_PlRDataFrame_unnest__impl", (DL_FUNC) &savvy_PlRDataFrame_unnest__impl, 2},
     {"savvy_PlRDataFrame_read_ipc_stream__impl", (DL_FUNC) &savvy_PlRDataFrame_read_ipc_stream__impl, 7},
     {"savvy_PlRDataType_new_from_name__impl", (DL_FUNC) &savvy_PlRDataType_new_from_name__impl, 1},
     {"savvy_PlRDataType_new_decimal__impl", (DL_FUNC) &savvy_PlRDataType_new_decimal__impl, 2},
diff --git a/src/rust/api.h b/src/rust/api.h
index 0b7a45ae..f51c8bfe 100644
--- a/src/rust/api.h
+++ b/src/rust/api.h
@@ -69,7 +69,6 @@ SEXP savvy_PlRDataFrame_lazy__ffi(SEXP self__);
 SEXP savvy_PlRDataFrame_to_struct__ffi(SEXP self__, SEXP c_arg__name);
 SEXP savvy_PlRDataFrame_n_chunks__ffi(SEXP self__);
 SEXP savvy_PlRDataFrame_rechunk__ffi(SEXP self__);
-SEXP savvy_PlRDataFrame_unnest__ffi(SEXP self__, SEXP c_arg__columns);
 SEXP savvy_PlRDataFrame_read_ipc_stream__ffi(SEXP c_arg__source, SEXP c_arg__row_index_offset, SEXP c_arg__rechunk, SEXP c_arg__columns, SEXP c_arg__projection, SEXP c_arg__n_rows, SEXP c_arg__row_index_name);
 
 // methods and associated functions for PlRDataType
diff --git a/src/rust/src/dataframe/general.rs b/src/rust/src/dataframe/general.rs
index 78435aed..783d8f9f 100644
--- a/src/rust/src/dataframe/general.rs
+++ b/src/rust/src/dataframe/general.rs
@@ -160,14 +160,4 @@ impl PlRDataFrame {
         df.as_single_chunk_par();
         Ok(df.into())
     }
-
-    pub fn unnest(&self, columns: StringSexp) -> Result<PlRDataFrame> {
-        let columns = columns.to_vec();
-        Ok(self
-            .df
-            .clone()
-            .unnest(columns)
-            .map_err(RPolarsErr::from)?
-            .into())
-    }
 }

From adbdcbaf9d865c260bf4123b05df316d2d543611 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:00:33 +0100
Subject: [PATCH 61/71] just remove eager unnest

---
 R/dataframe-frame.R                      | 20 --------------
 man/dataframe__unnest.Rd                 | 33 ------------------------
 tests/testthat/_snaps/dataframe-frame.md | 30 ---------------------
 3 files changed, 83 deletions(-)
 delete mode 100644 man/dataframe__unnest.Rd
 delete mode 100644 tests/testthat/_snaps/dataframe-frame.md

diff --git a/R/dataframe-frame.R b/R/dataframe-frame.R
index 6bcf66b7..d76e6027 100644
--- a/R/dataframe-frame.R
+++ b/R/dataframe-frame.R
@@ -539,23 +539,3 @@ dataframe__rechunk <- function() {
     self$`_df`$rechunk()
   })
 }
-
-#' @inherit lazyframe__unnest title description params
-#' @inherit as_polars_df return
-#' @examples
-#' df <- pl$DataFrame(
-#'   a = 1:5,
-#'   b = c("one", "two", "three", "four", "five"),
-#'   c = 6:10
-#' )$
-#'   select(
-#'   pl$struct("b"),
-#'   a_and_c = pl$struct(c("a", "c"))
-#' )
-#' df
-#'
-#' df$unnest("a_and_c")
-dataframe__unnest <- function(...) {
-  self$lazy()$unnest(...)$collect(`_eager` = TRUE) |>
-    wrap()
-}
diff --git a/man/dataframe__unnest.Rd b/man/dataframe__unnest.Rd
deleted file mode 100644
index 3eb99644..00000000
--- a/man/dataframe__unnest.Rd
+++ /dev/null
@@ -1,33 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/dataframe-frame.R
-\name{dataframe__unnest}
-\alias{dataframe__unnest}
-\title{Decompose struct columns into separate columns for each of their fields}
-\usage{
-dataframe__unnest(...)
-}
-\arguments{
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Name of the struct column(s)
-that should be unnested.}
-}
-\value{
-A polars \link{DataFrame}
-}
-\description{
-The new columns will be inserted into the LazyFrame at the location of the
-struct column.
-}
-\examples{
-df <- pl$DataFrame(
-  a = 1:5,
-  b = c("one", "two", "three", "four", "five"),
-  c = 6:10
-)$
-  select(
-  pl$struct("b"),
-  a_and_c = pl$struct(c("a", "c"))
-)
-df
-
-df$unnest("a_and_c")
-}
diff --git a/tests/testthat/_snaps/dataframe-frame.md b/tests/testthat/_snaps/dataframe-frame.md
deleted file mode 100644
index 0393bb8a..00000000
--- a/tests/testthat/_snaps/dataframe-frame.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# unnest works correctly
-
-    Code
-      df$unnest("b", pl$col("a_and_c"))
-    Condition
-      Error in `df$unnest()`:
-      ! Evaluation failed in `$unnest()`.
-      Caused by error in `df$unnest()`:
-      ! `columns` must be a character vector, not a list.
-
----
-
-    Code
-      df$unnest(1)
-    Condition
-      Error in `df$unnest()`:
-      ! Evaluation failed in `$unnest()`.
-      Caused by error in `df$unnest()`:
-      ! `columns` must be a character vector, not the number 1.
-
----
-
-    Code
-      df$unnest("foo")
-    Condition
-      Error in `df$unnest()`:
-      ! Evaluation failed in `$unnest()`.
-      Caused by error:
-      ! invalid series dtype: expected `Struct`, got `f64`
-

From 71d5240f6f1b9d923b098cc0f810f67cb1cff316 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:02:32 +0100
Subject: [PATCH 62/71] other non-lazy changes

---
 R/io-csv-functions.R                    | 3 ++-
 man/{pl__scan_csv.Rd => IO_scan_csv.Rd} | 0
 2 files changed, 2 insertions(+), 1 deletion(-)
 rename man/{pl__scan_csv.Rd => IO_scan_csv.Rd} (100%)

diff --git a/R/io-csv-functions.R b/R/io-csv-functions.R
index 0eece9f6..32cfd0e9 100644
--- a/R/io-csv-functions.R
+++ b/R/io-csv-functions.R
@@ -3,6 +3,8 @@
 #' This allows the query optimizer to push down predicates and projections to
 #' the scan level, thereby potentially reducing memory overhead.
 #'
+#' @rdname IO_scan_csv
+#'
 #' @inheritParams rlang::check_dots_empty0
 #' @param source Path to a file or URL. It is possible to provide multiple paths
 #' provided that all CSV files have the same schema. It is not possible to
@@ -199,7 +201,6 @@ pl__scan_csv <- function(
 }
 
 #' New DataFrame from CSV
-#'
 #' @inheritParams pl__scan_csv
 #' @inherit as_polars_df return
 #' @examples
diff --git a/man/pl__scan_csv.Rd b/man/IO_scan_csv.Rd
similarity index 100%
rename from man/pl__scan_csv.Rd
rename to man/IO_scan_csv.Rd

From 08219cc238dc0f5c5b802ed32af05b43386d88f4 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:03:20 +0100
Subject: [PATCH 63/71] same

---
 R/io-csv-functions.R                    | 1 +
 man/{pl__read_csv.Rd => IO_read_csv.Rd} | 0
 2 files changed, 1 insertion(+)
 rename man/{pl__read_csv.Rd => IO_read_csv.Rd} (100%)

diff --git a/R/io-csv-functions.R b/R/io-csv-functions.R
index 32cfd0e9..5f177f05 100644
--- a/R/io-csv-functions.R
+++ b/R/io-csv-functions.R
@@ -201,6 +201,7 @@ pl__scan_csv <- function(
 }
 
 #' New DataFrame from CSV
+#' @rdname IO_read_csv
 #' @inheritParams pl__scan_csv
 #' @inherit as_polars_df return
 #' @examples
diff --git a/man/pl__read_csv.Rd b/man/IO_read_csv.Rd
similarity index 100%
rename from man/pl__read_csv.Rd
rename to man/IO_read_csv.Rd

From 9c28657701bf310286d9d87eefa82f307e75a481 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:05:14 +0100
Subject: [PATCH 64/71] more

---
 R/lazyframe-frame.R            | 74 +++++++++++++++++-----------------
 man/as_polars_df.Rd            | 16 ++++----
 man/lazyframe__collect.Rd      | 16 ++++----
 man/lazyframe__explain.Rd      | 16 ++++----
 man/lazyframe__profile.Rd      | 16 ++++----
 man/lazyframe__sink_csv.Rd     | 10 ++---
 man/lazyframe__sink_ipc.Rd     | 10 ++---
 man/lazyframe__sink_ndjson.Rd  | 10 ++---
 man/lazyframe__sink_parquet.Rd | 10 ++---
 man/lazyframe__to_dot.Rd       | 16 ++++----
 10 files changed, 97 insertions(+), 97 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 54a561e8..bc7fab3e 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -171,14 +171,14 @@ lazyframe__group_by <- function(..., .maintain_order = FALSE) {
 #' Individual optimizations may be disabled by setting the corresponding parameter to `FALSE`.
 #' @inherit pl__DataFrame return
 #' @inheritParams rlang::args_dots_empty
-#' @param type_coercion A logical, indicates type coercion optimization.
-#' @param predicate_pushdown A logical, indicates predicate pushdown optimization.
-#' @param projection_pushdown A logical, indicates projection pushdown optimization.
-#' @param simplify_expression A logical, indicates simplify expression optimization.
-#' @param slice_pushdown A logical, indicates slice pushdown optimization.
-#' @param comm_subplan_elim A logical, indicates trying to cache branching subplans that occur on self-joins or unions.
-#' @param comm_subexpr_elim A logical, indicates trying to cache common subexpressions.
-#' @param cluster_with_columns A logical, indicates to combine sequential independent calls to with_columns.
+#' @param type_coercion A logical, indicats type coercion optimization.
+#' @param predicate_pushdown A logical, indicats predicate pushdown optimization.
+#' @param projection_pushdown A logical, indicats projection pushdown optimization.
+#' @param simplify_expression A logical, indicats simplify expression optimization.
+#' @param slice_pushdown A logical, indicats slice pushdown optimization.
+#' @param comm_subplan_elim A logical, indicats trying to cache branching subplans that occur on self-joins or unions.
+#' @param comm_subexpr_elim A logical, indicats trying to cache common subexpressions.
+#' @param cluster_with_columns A logical, indicats to combine sequential independent calls to with_columns.
 #' @param no_optimization A logical. If `TRUE`, turn off (certain) optimizations.
 #' @param streaming A logical. If `TRUE`, process the query in batches to handle larger-than-memory data.
 #' If `FALSE` (default), the entire query is processed in a single batch.
@@ -252,35 +252,6 @@ lazyframe__collect <- function(
   })
 }
 
-#' Resolve the schema of this LazyFrame
-#'
-#' This resolves the query plan but does not trigger computations.
-#'
-#' @return A named list with names indicating column names and values indicating
-#' column data types.
-#'
-#' @examples
-#' lf <- pl$LazyFrame(
-#'   foo = 1:3,
-#'   bar = 6:8,
-#'   ham = c("a", "b", "c")
-#' )
-#'
-#' lf$collect_schema()
-#'
-#' lf$with_columns(
-#'   baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String),
-#'   pl$col("bar")$cast(pl$Int64)
-#' )$collect_schema()
-lazyframe__collect_schema <- function() {
-  self$`_ldf`$collect_schema() |>
-    lapply(function(x) {
-      .savvy_wrap_PlRDataType(x) |>
-        wrap()
-    }) |>
-    wrap()
-}
-
 #' Collect and profile a lazy query.
 #'
 #' This will run the query and return a list containing the materialized
@@ -468,6 +439,35 @@ lazyframe__explain <- function(
   })
 }
 
+#' Resolve the schema of this LazyFrame
+#'
+#' This resolves the query plan but does not trigger computations.
+#'
+#' @return A named list with names indicating column names and values indicating
+#' column data types.
+#'
+#' @examples
+#' lf <- pl$LazyFrame(
+#'   foo = 1:3,
+#'   bar = 6:8,
+#'   ham = c("a", "b", "c")
+#' )
+#'
+#' lf$collect_schema()
+#'
+#' lf$with_columns(
+#'   baz = (pl$col("foo") + pl$col("bar"))$cast(pl$String),
+#'   pl$col("bar")$cast(pl$Int64)
+#' )$collect_schema()
+lazyframe__collect_schema <- function() {
+  self$`_ldf`$collect_schema() |>
+    lapply(function(x) {
+      .savvy_wrap_PlRDataType(x) |>
+        wrap()
+    }) |>
+    wrap()
+}
+
 #' Cast LazyFrame column(s) to the specified dtype(s)
 #'
 #' This allows to convert all columns to a datatype or to convert only specific
diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd
index 4a27ac42..77091117 100644
--- a/man/as_polars_df.Rd
+++ b/man/as_polars_df.Rd
@@ -56,21 +56,21 @@ If \code{NULL}, the column name is taken from the \link{Series} name.}
 the \code{\link[=series_struct_unnest]{<Series>$struct$unnest()}} method is used to create a \link{DataFrame}
 from the struct \link{Series}. In this case, the \code{column_name} argument is ignored.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
 \item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
diff --git a/man/lazyframe__collect.Rd b/man/lazyframe__collect.Rd
index 73030a47..f47ef0b7 100644
--- a/man/lazyframe__collect.Rd
+++ b/man/lazyframe__collect.Rd
@@ -22,21 +22,21 @@ lazyframe__collect(
 \arguments{
 \item{...}{These dots are for future extensions and must be empty.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
 \item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
diff --git a/man/lazyframe__explain.Rd b/man/lazyframe__explain.Rd
index 0a3bab93..4f0f7b77 100644
--- a/man/lazyframe__explain.Rd
+++ b/man/lazyframe__explain.Rd
@@ -28,21 +28,21 @@ either \code{"plain"} (default) or \code{"tree"}.}
 \item{optimized}{Return an optimized query plan. If \code{TRUE} (default), the
 subsequent optimization flags control which optimizations run.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
 \item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
 If \code{FALSE} (default), the entire query is processed in a single batch.
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
index aa913dd7..bc0bedba 100644
--- a/man/lazyframe__profile.Rd
+++ b/man/lazyframe__profile.Rd
@@ -37,21 +37,21 @@ lazyframe__profile(
 )
 }
 \arguments{
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
 \item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
 If \code{FALSE} (default), the entire query is processed in a single batch.
diff --git a/man/lazyframe__sink_csv.Rd b/man/lazyframe__sink_csv.Rd
index 628dfd28..98202d20 100644
--- a/man/lazyframe__sink_csv.Rd
+++ b/man/lazyframe__sink_csv.Rd
@@ -85,15 +85,15 @@ integer, then quotes will be used even if they aren`t strictly necessary.
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
 \item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
diff --git a/man/lazyframe__sink_ipc.Rd b/man/lazyframe__sink_ipc.Rd
index 3b043054..c9c509a9 100644
--- a/man/lazyframe__sink_ipc.Rd
+++ b/man/lazyframe__sink_ipc.Rd
@@ -34,15 +34,15 @@ lazyframe__sink_ipc(
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
 \item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
diff --git a/man/lazyframe__sink_ndjson.Rd b/man/lazyframe__sink_ndjson.Rd
index f0ef9a10..218c8e2e 100644
--- a/man/lazyframe__sink_ndjson.Rd
+++ b/man/lazyframe__sink_ndjson.Rd
@@ -26,15 +26,15 @@ lazyframe__sink_ndjson(
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
 \item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
diff --git a/man/lazyframe__sink_parquet.Rd b/man/lazyframe__sink_parquet.Rd
index 4583f132..2e04bd4a 100644
--- a/man/lazyframe__sink_parquet.Rd
+++ b/man/lazyframe__sink_parquet.Rd
@@ -70,15 +70,15 @@ is set to 1024^2 bytes.}
 \item{maintain_order}{Maintain the order in which data is processed. Setting
 this to \code{FALSE} will be slightly faster.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
 \item{no_optimization}{A logical. If \code{TRUE}, turn off (certain) optimizations.}
 
diff --git a/man/lazyframe__to_dot.Rd b/man/lazyframe__to_dot.Rd
index a83c4dfd..de3e89d3 100644
--- a/man/lazyframe__to_dot.Rd
+++ b/man/lazyframe__to_dot.Rd
@@ -23,21 +23,21 @@ lazyframe__to_dot(
 
 \item{optimized}{Optimize the query plan.}
 
-\item{type_coercion}{A logical, indicates type coercion optimization.}
+\item{type_coercion}{A logical, indicats type coercion optimization.}
 
-\item{predicate_pushdown}{A logical, indicates predicate pushdown optimization.}
+\item{predicate_pushdown}{A logical, indicats predicate pushdown optimization.}
 
-\item{projection_pushdown}{A logical, indicates projection pushdown optimization.}
+\item{projection_pushdown}{A logical, indicats projection pushdown optimization.}
 
-\item{simplify_expression}{A logical, indicates simplify expression optimization.}
+\item{simplify_expression}{A logical, indicats simplify expression optimization.}
 
-\item{slice_pushdown}{A logical, indicates slice pushdown optimization.}
+\item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicates trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicates trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
 
-\item{cluster_with_columns}{A logical, indicates to combine sequential independent calls to with_columns.}
+\item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
 \item{streaming}{A logical. If \code{TRUE}, process the query in batches to handle larger-than-memory data.
 If \code{FALSE} (default), the entire query is processed in a single batch.

From 33d5f908985f3174b7233c9f26bdc14505180d82 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:06:08 +0100
Subject: [PATCH 65/71] more

---
 R/lazyframe-frame.R       | 4 ++--
 man/as_polars_df.Rd       | 4 ++--
 man/lazyframe__collect.Rd | 4 ++--
 man/lazyframe__explain.Rd | 4 ++--
 man/lazyframe__profile.Rd | 4 ++--
 man/lazyframe__to_dot.Rd  | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index bc7fab3e..e59e27d1 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -176,8 +176,8 @@ lazyframe__group_by <- function(..., .maintain_order = FALSE) {
 #' @param projection_pushdown A logical, indicats projection pushdown optimization.
 #' @param simplify_expression A logical, indicats simplify expression optimization.
 #' @param slice_pushdown A logical, indicats slice pushdown optimization.
-#' @param comm_subplan_elim A logical, indicats trying to cache branching subplans that occur on self-joins or unions.
-#' @param comm_subexpr_elim A logical, indicats trying to cache common subexpressions.
+#' @param comm_subplan_elim A logical, indicats tring to cache branching subplans that occur on self-joins or unions.
+#' @param comm_subexpr_elim A logical, indicats tring to cache common subexpressions.
 #' @param cluster_with_columns A logical, indicats to combine sequential independent calls to with_columns.
 #' @param no_optimization A logical. If `TRUE`, turn off (certain) optimizations.
 #' @param streaming A logical. If `TRUE`, process the query in batches to handle larger-than-memory data.
diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd
index 77091117..e3986253 100644
--- a/man/as_polars_df.Rd
+++ b/man/as_polars_df.Rd
@@ -66,9 +66,9 @@ from the struct \link{Series}. In this case, the \code{column_name} argument is
 
 \item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
 
 \item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
diff --git a/man/lazyframe__collect.Rd b/man/lazyframe__collect.Rd
index f47ef0b7..e6139b19 100644
--- a/man/lazyframe__collect.Rd
+++ b/man/lazyframe__collect.Rd
@@ -32,9 +32,9 @@ lazyframe__collect(
 
 \item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
 
 \item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
diff --git a/man/lazyframe__explain.Rd b/man/lazyframe__explain.Rd
index 4f0f7b77..582869d2 100644
--- a/man/lazyframe__explain.Rd
+++ b/man/lazyframe__explain.Rd
@@ -38,9 +38,9 @@ subsequent optimization flags control which optimizations run.}
 
 \item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
 
 \item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
index bc0bedba..1117f152 100644
--- a/man/lazyframe__profile.Rd
+++ b/man/lazyframe__profile.Rd
@@ -47,9 +47,9 @@ lazyframe__profile(
 
 \item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
 
 \item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 
diff --git a/man/lazyframe__to_dot.Rd b/man/lazyframe__to_dot.Rd
index de3e89d3..eacace78 100644
--- a/man/lazyframe__to_dot.Rd
+++ b/man/lazyframe__to_dot.Rd
@@ -33,9 +33,9 @@ lazyframe__to_dot(
 
 \item{slice_pushdown}{A logical, indicats slice pushdown optimization.}
 
-\item{comm_subplan_elim}{A logical, indicats trying to cache branching subplans that occur on self-joins or unions.}
+\item{comm_subplan_elim}{A logical, indicats tring to cache branching subplans that occur on self-joins or unions.}
 
-\item{comm_subexpr_elim}{A logical, indicats trying to cache common subexpressions.}
+\item{comm_subexpr_elim}{A logical, indicats tring to cache common subexpressions.}
 
 \item{cluster_with_columns}{A logical, indicats to combine sequential independent calls to with_columns.}
 

From a3e20eeab99372e38dcd1605b7da8a6f2db7937e Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:10:54 +0100
Subject: [PATCH 66/71] remove broken sql function

---
 R/lazyframe-frame.R   | 66 -------------------------------------------
 man/lazyframe__sql.Rd | 64 -----------------------------------------
 2 files changed, 130 deletions(-)
 delete mode 100644 man/lazyframe__sql.Rd

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index e59e27d1..667642ce 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1961,72 +1961,6 @@ lazyframe__clear <- function(n = 0) {
   pl$DataFrame(schema = self$schema)$clear(n)$lazy()
 }
 
-
-# TODO: we can't use % in the SQL query
-# <https://github.com/r-lib/roxygen2/issues/1616>
-#' Execute a SQL query against the LazyFrame
-#'
-#' The calling frame is automatically registered as a table in the SQL context
-#' under the name `"self"`. All [DataFrames][DataFrame_class] and
-#' [LazyFrames][lazyframe__class] found in the `envir` are also registered,
-#' using their variable name.
-#' More control over registration and execution behaviour is available by
-#' the [SQLContext][SQLContext_class] object.
-#'
-#' This functionality is considered **unstable**, although it is close to
-#' being considered stable. It may be changed at any point without it being
-#' considered a breaking change.
-#' @inherit pl_LazyFrame return
-#' @inheritParams SQLContext_execute
-#' @inheritParams SQLContext_register_globals
-#' @param table_name `NULL` (default) or a character of an explicit name for the table
-#' that represents the calling frame (the alias `"self"` will always be registered/available).
-#' @seealso
-#' - [SQLContext][SQLContext_class]
-#' @examplesIf polars_info()$features$sql
-#' lf1 <- pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x"))
-#' lf2 <- pl$LazyFrame(a = 3:1, d = c(125, -654, 888))
-#'
-#' # Query the LazyFrame using SQL:
-#' lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect()
-#'
-#' # Join two LazyFrames:
-#' lf1$sql(
-#'   "
-#' SELECT self.*, d
-#' FROM self
-#' INNER JOIN lf2 USING (a)
-#' WHERE a > 1 AND b < 8
-#' "
-#' )$collect()
-#'
-#' # Apply SQL transforms (aliasing "self" to "frame") and subsequently
-#' # filter natively (you can freely mix SQL and native operations):
-#' lf1$sql(
-#'   query = r"(
-#' SELECT
-#'  a,
-#' MOD(a, 2) == 0 AS a_is_even,
-#' (b::float / 2) AS 'b/2',
-#' CONCAT_WS(':', c, c, c) AS c_c_c
-#' FROM frame
-#' ORDER BY a
-#' )",
-#'   table_name = "frame"
-#' )$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect()
-lazyframe__sql <- function(query, ..., table_name = NULL, envir = parent.frame()) {
-  result({
-    ctx <- pl$SQLContext()$register_globals(envir = envir)$register("self", self)
-
-    if (!is.null(table_name)) {
-      ctx$register(table_name, self)
-    }
-
-    ctx$execute(query)
-  })
-}
-
-
 #' Take every nth row in the LazyFrame
 #'
 #' @param n Gather every `n`-th row.
diff --git a/man/lazyframe__sql.Rd b/man/lazyframe__sql.Rd
deleted file mode 100644
index 8daf4d75..00000000
--- a/man/lazyframe__sql.Rd
+++ /dev/null
@@ -1,64 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/lazyframe-frame.R
-\name{lazyframe__sql}
-\alias{lazyframe__sql}
-\title{Execute a SQL query against the LazyFrame}
-\usage{
-lazyframe__sql(query, ..., table_name = NULL, envir = parent.frame())
-}
-\arguments{
-\item{table_name}{\code{NULL} (default) or a character of an explicit name for the table
-that represents the calling frame (the alias \code{"self"} will always be registered/available).}
-}
-\description{
-The calling frame is automatically registered as a table in the SQL context
-under the name \code{"self"}. All \link[=DataFrame_class]{DataFrames} and
-\link[=lazyframe__class]{LazyFrames} found in the \code{envir} are also registered,
-using their variable name.
-More control over registration and execution behaviour is available by
-the \link[=SQLContext_class]{SQLContext} object.
-}
-\details{
-This functionality is considered \strong{unstable}, although it is close to
-being considered stable. It may be changed at any point without it being
-considered a breaking change.
-}
-\examples{
-\dontshow{if (polars_info()$features$sql) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
-lf1 <- pl$LazyFrame(a = 1:3, b = 6:8, c = c("z", "y", "x"))
-lf2 <- pl$LazyFrame(a = 3:1, d = c(125, -654, 888))
-
-# Query the LazyFrame using SQL:
-lf1$sql("SELECT c, b FROM self WHERE a > 1")$collect()
-
-# Join two LazyFrames:
-lf1$sql(
-  "
-SELECT self.*, d
-FROM self
-INNER JOIN lf2 USING (a)
-WHERE a > 1 AND b < 8
-"
-)$collect()
-
-# Apply SQL transforms (aliasing "self" to "frame") and subsequently
-# filter natively (you can freely mix SQL and native operations):
-lf1$sql(
-  query = r"(
-SELECT
- a,
-MOD(a, 2) == 0 AS a_is_even,
-(b::float / 2) AS 'b/2',
-CONCAT_WS(':', c, c, c) AS c_c_c
-FROM frame
-ORDER BY a
-)",
-  table_name = "frame"
-)$filter(!pl$col("c_c_c")$str$starts_with("x"))$collect()
-\dontshow{\}) # examplesIf}
-}
-\seealso{
-\itemize{
-\item \link[=SQLContext_class]{SQLContext}
-}
-}

From 78ff2ff842d5b1b1de61e9119bf3a608b90e336e Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Mon, 9 Dec 2024 17:13:02 +0100
Subject: [PATCH 67/71] remove last remnants of old r-polars

---
 R/lazyframe-frame.R       | 10 ++++------
 man/lazyframe__profile.Rd |  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 667642ce..607ee22f 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -285,7 +285,7 @@ lazyframe__collect <- function(
 #' ## Use $profile() to compare two queries
 #'
 #' # -1-  map each Species-group with native polars
-#' pl$LazyFrame(iris)$
+#' as_polars_lf(iris)$
 #'   sort("Sepal.Length")$
 #'   group_by("Species", maintain_order = TRUE)$
 #'   agg(pl$col(pl$Float64)$first() + 5)$
@@ -299,7 +299,7 @@ lazyframe__collect <- function(
 #'   s$to_r()[1] + 5
 #' }
 #'
-#' pl$LazyFrame(iris)$
+#' as_polars_lf(iris)$
 #'   sort("Sepal.Length")$
 #'   group_by("Species", maintain_order = TRUE)$
 #'   agg(pl$col(pl$Float64)$map_elements(r_func))$
@@ -348,13 +348,11 @@ lazyframe__profile <- function(
       eager = FALSE
     )
 
-    out <- lf |>
-      .pr$LazyFrame$profile()
+    out <- self$`_ldf`$profile()
 
     if (isTRUE(show_plot)) {
       out[["plot"]] <- make_profile_plot(out, truncate_nodes) |>
-        result() |>
-        unwrap("in $profile()")
+        wrap()
     }
     out
   })
diff --git a/man/lazyframe__profile.Rd b/man/lazyframe__profile.Rd
index 1117f152..6182fb6d 100644
--- a/man/lazyframe__profile.Rd
+++ b/man/lazyframe__profile.Rd
@@ -95,7 +95,7 @@ pl$LazyFrame()$select(pl$lit(2) + 2)$profile()
 ## Use $profile() to compare two queries
 
 # -1-  map each Species-group with native polars
-pl$LazyFrame(iris)$
+as_polars_lf(iris)$
   sort("Sepal.Length")$
   group_by("Species", maintain_order = TRUE)$
   agg(pl$col(pl$Float64)$first() + 5)$
@@ -109,7 +109,7 @@ r_func <- \(s) {
   s$to_r()[1] + 5
 }
 
-pl$LazyFrame(iris)$
+as_polars_lf(iris)$
   sort("Sepal.Length")$
   group_by("Species", maintain_order = TRUE)$
   agg(pl$col(pl$Float64)$map_elements(r_func))$

From 03d10d461e9a6dcb179ae5940ee6b0bcbd0bb100 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 11 Dec 2024 16:27:37 +0100
Subject: [PATCH 68/71] use missing(...) in sort()

---
 R/lazyframe-frame.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 607ee22f..82a6462b 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -584,10 +584,10 @@ lazyframe__sort <- function(
   wrap({
     check_dots_unnamed()
 
-    by <- parse_into_list_of_expressions(...)
-    if (length(by) == 0) {
+    if (missing(...)) {
       abort("`...` must contain at least one element.")
     }
+    by <- parse_into_list_of_expressions(...)    
     descending <- extend_bool(descending, length(by), "descending", "...")
     nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...")
 

From 0532cd51bb82cd56fbb234eb601f41bd4bd4dba6 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Wed, 11 Dec 2024 16:29:00 +0100
Subject: [PATCH 69/71] add TODO in $rename()

---
 R/lazyframe-frame.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 82a6462b..847dc0a4 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -587,7 +587,7 @@ lazyframe__sort <- function(
     if (missing(...)) {
       abort("`...` must contain at least one element.")
     }
-    by <- parse_into_list_of_expressions(...)    
+    by <- parse_into_list_of_expressions(...)
     descending <- extend_bool(descending, length(by), "descending", "...")
     nulls_last <- extend_bool(nulls_last, length(by), "nulls_last", "...")
 
@@ -1369,6 +1369,7 @@ lazyframe__rename <- function(mapping, ..., strict = TRUE) {
   wrap({
     if (!missing(mapping) && is_function(mapping)) {
       check_dots_empty0(...)
+      # TODO: this requires $name$map()
       self$select(pl$all()$name$map(mapping))
     } else {
       if (missing(mapping) || !is.list(mapping)) {

From 25dcda8db9289e19a970d4abf2722f35cb9d7c16 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 12 Dec 2024 16:24:47 +0100
Subject: [PATCH 70/71] rewrite $rename()

---
 R/lazyframe-frame.R      | 30 ++++++++++++------------------
 man/lazyframe__rename.Rd | 15 ++++++---------
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index 847dc0a4..e158eb22 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1339,14 +1339,12 @@ lazyframe__unpivot <- function(
 
 #' Rename column names
 #'
-#' @param mapping Either a function that takes a character vector as input and
-#' returns one as input, or a named list where names are old column names and
-#' values are the new ones.
-#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> If `mapping` is missing,
-#' those values are used.
-#' @param strict Validate that all column names exist in the current schema,
+#' @param ... <[`dynamic-dots`][rlang::dyn-dots]> Either a function that takes
+#' a character vector as input and returns a character vector as output, or
+#' named values where names are old column names and values are the new ones.
+#' @param .strict Validate that all column names exist in the current schema,
 #' and throw an error if any do not. (Note that this parameter is a no-op when
-#' passing a function to `mapping`).
+#' passing a function to `...`).
 #'
 #' @details
 #' If existing names are swapped (e.g. 'A' points to 'B' and 'B' points to
@@ -1365,20 +1363,16 @@ lazyframe__unpivot <- function(
 #' lf$rename(
 #'   \(column_name) paste0("c", substr(column_name, 2, 100))
 #' )$collect()
-lazyframe__rename <- function(mapping, ..., strict = TRUE) {
+lazyframe__rename <- function(..., .strict = TRUE) {
   wrap({
-    if (!missing(mapping) && is_function(mapping)) {
-      check_dots_empty0(...)
+    mapping <- list2(...)
+    if (length(mapping) == 1 && is_function(mapping[[1]])) {
       # TODO: this requires $name$map()
-      self$select(pl$all()$name$map(mapping))
-    } else {
-      if (missing(mapping) || !is.list(mapping)) {
-        mapping <- list2(...)
-      }
-      existing <- names(mapping)
-      new <- unlist(mapping)
-      self$`_ldf`$rename(existing, new, strict)
+      return(self$select(pl$all()$name$map(mapping[[1]])))
     }
+    existing <- names(mapping)
+    new <- unlist(mapping)
+    self$`_ldf`$rename(existing, new, .strict)
   })
 }
 
diff --git a/man/lazyframe__rename.Rd b/man/lazyframe__rename.Rd
index b34bef06..454f7e99 100644
--- a/man/lazyframe__rename.Rd
+++ b/man/lazyframe__rename.Rd
@@ -4,19 +4,16 @@
 \alias{lazyframe__rename}
 \title{Rename column names}
 \usage{
-lazyframe__rename(mapping, ..., strict = TRUE)
+lazyframe__rename(..., .strict = TRUE)
 }
 \arguments{
-\item{mapping}{Either a function that takes a character vector as input and
-returns one as input, or a named list where names are old column names and
-values are the new ones.}
+\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> Either a function that takes
+a character vector as input and returns a character vector as output, or
+named values where names are old column names and values are the new ones.}
 
-\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}> If \code{mapping} is missing,
-those values are used.}
-
-\item{strict}{Validate that all column names exist in the current schema,
+\item{.strict}{Validate that all column names exist in the current schema,
 and throw an error if any do not. (Note that this parameter is a no-op when
-passing a function to \code{mapping}).}
+passing a function to \code{...}).}
 }
 \value{
 A polars \link{LazyFrame}

From 100be732353cd95990258e53987b24274c7b25e1 Mon Sep 17 00:00:00 2001
From: etiennebacher <etienne.bacher@protonmail.com>
Date: Thu, 12 Dec 2024 17:36:03 +0100
Subject: [PATCH 71/71] rename: check that input are strings

---
 R/lazyframe-frame.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/lazyframe-frame.R b/R/lazyframe-frame.R
index e158eb22..fbf24f13 100644
--- a/R/lazyframe-frame.R
+++ b/R/lazyframe-frame.R
@@ -1370,6 +1370,9 @@ lazyframe__rename <- function(..., .strict = TRUE) {
       # TODO: this requires $name$map()
       return(self$select(pl$all()$name$map(mapping[[1]])))
     }
+    if (!is_list_of_string(mapping)) {
+      abort("`...` only accepts a function or named characters.")
+    }
     existing <- names(mapping)
     new <- unlist(mapping)
     self$`_ldf`$rename(existing, new, .strict)