From 8dc4719c92f4193ee910afab3e8fce33cb5b0b75 Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sun, 14 Apr 2024 08:59:57 +0000 Subject: [PATCH 01/14] feat!: bump polars to 0.39.0 [skip ci] --- DESCRIPTION | 2 +- src/rust/Cargo.lock | 111 +++++++++++++++++++++++++++++++------------- src/rust/Cargo.toml | 8 ++-- 3 files changed, 83 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f303145de..b86f2f53e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -119,4 +119,4 @@ Collate: Config/rextendr/version: 0.3.1 VignetteBuilder: knitr Config/polars/LibVersion: 0.38.2 -Config/polars/RustToolchainVersion: nightly-2024-02-23 +Config/polars/RustToolchainVersion: nightly-2024-03-28 diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 06301abf6..ccc29ea1c 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -1354,8 +1354,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "getrandom", "polars-arrow", @@ -1374,8 +1374,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "atoi", @@ -1421,8 +1421,8 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = 
"git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "bytemuck", "either", @@ -1436,8 +1436,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1470,8 +1470,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "avro-schema", "polars-arrow-format", @@ -1482,8 +1482,8 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "async-trait", @@ -1523,8 +1523,8 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "chrono", @@ -1543,8 +1543,8 @@ dependencies = [ [[package]] name = "polars-lazy" -version = 
"0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1566,8 +1566,8 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "aho-corasick", @@ -1602,8 +1602,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "async-stream", @@ -1627,8 +1627,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1651,13 +1651,14 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = 
"git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "bytemuck", "chrono", "chrono-tz", + "hashbrown 0.14.3", "once_cell", "percent-encoding", "polars-arrow", @@ -1669,6 +1670,7 @@ dependencies = [ "polars-time", "polars-utils", "rayon", + "recursive", "regex", "serde", "smartstring", @@ -1678,8 +1680,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "bytemuck", "polars-arrow", @@ -1689,8 +1691,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "hex", "polars-arrow", @@ -1706,8 +1708,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = "git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "atoi", "chrono", @@ -1726,8 +1728,8 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.38.3" -source = "git+https://github.com/pola-rs/polars.git?rev=1237c03f5571918fb8890169066257b883b7f33c#1237c03f5571918fb8890169066257b883b7f33c" +version = "0.39.0" +source = 
"git+https://github.com/pola-rs/polars.git?rev=37c630320da0d0d3270d283cdafbf6b8402de069#37c630320da0d0d3270d283cdafbf6b8402de069" dependencies = [ "ahash", "bytemuck", @@ -1739,6 +1741,7 @@ dependencies = [ "raw-cpuid", "rayon", "smartstring", + "stacker", "sysinfo", "version_check", ] @@ -1758,6 +1761,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psm" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5787f7cda34e3033a72192c018bc5883100330f362ef279a8cbccfce8bb4e874" +dependencies = [ + "cc", +] + [[package]] name = "quote" version = "1.0.35" @@ -1769,7 +1781,7 @@ dependencies = [ [[package]] name = "r-polars" -version = "0.38.2" +version = "0.39.0" dependencies = [ "either", "extendr-api", @@ -1865,6 +1877,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.50", +] + [[package]] name = "redox_syscall" version = "0.4.1" @@ -2125,6 +2157,19 @@ dependencies = [ "log", ] +[[package]] +name = "stacker" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "winapi", +] + [[package]] name = "state" version = "0.6.0" diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index b24b6677b..b9bcd0d39 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "r-polars" -version = "0.38.2" +version = "0.39.0" 
edition = "2021" rust-version = "1.76.0" publish = false @@ -52,8 +52,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.58" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "1237c03f5571918fb8890169066257b883b7f33c", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "1237c03f5571918fb8890169066257b883b7f33c", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "37c630320da0d0d3270d283cdafbf6b8402de069", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "37c630320da0d0d3270d283cdafbf6b8402de069", default-features = false } either = "1" #features copied from node-polars @@ -154,4 +154,4 @@ features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "1237c03f5571918fb8890169066257b883b7f33c" +rev = "37c630320da0d0d3270d283cdafbf6b8402de069" From 33e73e1e0894d65df4cbe96eabc98af9ee99c1f5 Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sun, 14 Apr 2024 10:10:25 +0000 Subject: [PATCH 02/14] fix: fix rust side and tweak `$sort` --- R/expr__expr.R | 3 +- R/expr__string.R | 1 + R/extendr-wrappers.R | 18 ++++----- R/series__series.R | 11 +++--- src/rust/src/lazy/dataframe.rs | 21 +++++++--- src/rust/src/lazy/dsl.rs | 51 +++++++++++++++++++------ src/rust/src/rdataframe/read_ipc.rs | 5 ++- src/rust/src/rdataframe/read_parquet.rs | 5 ++- src/rust/src/rlib.rs | 20 +++++++++- src/rust/src/series.rs | 21 +++++++++- 10 files changed, 117 insertions(+), 39 deletions(-) diff --git a/R/expr__expr.R b/R/expr__expr.R index 84fde9bc5..6a8c11874 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -1377,6 +1377,7 @@ Expr_mode = use_extendr_wrapper #' #' Sort this column. If used in a groupby context, the groups are sorted. #' +#' @param ... Ignored #' @param descending Sort in descending order. 
When sorting by multiple columns, #' can be specified per column by passing a vector of booleans. #' @param nulls_last If `TRUE`, place nulls values last. @@ -1384,7 +1385,7 @@ Expr_mode = use_extendr_wrapper #' @examples #' pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$ #' with_columns(sorted = pl$col("a")$sort()) -Expr_sort = function(descending = FALSE, nulls_last = FALSE) { +Expr_sort = function(..., descending = FALSE, nulls_last = FALSE) { .pr$Expr$sort(self, descending, nulls_last) } diff --git a/R/expr__string.R b/R/expr__string.R index e823b3021..5f625079d 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -865,6 +865,7 @@ ExprStr_explode = function() { unwrap("in str$explode():") } +# TODO: rename to `to_integer` #' Parse integers with base radix from strings #' #' @description Parse integers with base 2 by default. diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index e588975ce..06bf14d2b 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -14,7 +14,7 @@ all_horizontal <- function(dotdotdot) .Call(wrap__all_horizontal, dotdotdot) any_horizontal <- function(dotdotdot) .Call(wrap__any_horizontal, dotdotdot) -arg_sort_by <- function(exprs, descending) .Call(wrap__arg_sort_by, exprs, descending) +arg_sort_by <- function(exprs, descending, nulls_last, multithreaded, maintain_order) .Call(wrap__arg_sort_by, exprs, descending, nulls_last, multithreaded, maintain_order) arg_where <- function(condition) .Call(wrap__arg_where, condition) @@ -98,7 +98,7 @@ concat_series <- function(l, rechunk, to_supertypes) .Call(wrap__concat_series, new_from_csv <- function(path, has_header, separator, comment_prefix, quote_char, skip_rows, dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) .Call(wrap__new_from_csv, path, has_header, separator, comment_prefix, quote_char, skip_rows, 
dtypes, null_values, ignore_errors, cache, infer_schema_length, n_rows, encoding, low_memory, rechunk, skip_rows_after_header, row_index_name, row_index_offset, try_parse_dates, eol_char, raise_if_empty, truncate_ragged_lines) -import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, memmap) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, memmap) +import_arrow_ipc <- function(path, n_rows, cache, rechunk, row_name, row_index, memory_map) .Call(wrap__import_arrow_ipc, path, n_rows, cache, rechunk, row_name, row_index, memory_map) new_from_ndjson <- function(path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) .Call(wrap__new_from_ndjson, path, infer_schema_length, batch_size, n_rows, low_memory, rechunk, row_index_name, row_index_offset, ignore_errors) @@ -500,7 +500,7 @@ RPolarsExpr$search_sorted <- function(element) .Call(wrap__RPolarsExpr__search_s RPolarsExpr$gather <- function(idx) .Call(wrap__RPolarsExpr__gather, self, idx) -RPolarsExpr$sort_by <- function(by, descending) .Call(wrap__RPolarsExpr__sort_by, self, by, descending) +RPolarsExpr$sort_by <- function(by, descending, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsExpr__sort_by, self, by, descending, nulls_last, maintain_order, multithreaded) RPolarsExpr$backward_fill <- function(limit) .Call(wrap__RPolarsExpr__backward_fill, self, limit) @@ -690,7 +690,7 @@ RPolarsExpr$list_gather <- function(index, null_on_oob) .Call(wrap__RPolarsExpr_ RPolarsExpr$list_gather_every <- function(n, offset) .Call(wrap__RPolarsExpr__list_gather_every, self, n, offset) -RPolarsExpr$list_get <- function(index) .Call(wrap__RPolarsExpr__list_get, self, index) +RPolarsExpr$list_get <- function(index, null_on_oob) .Call(wrap__RPolarsExpr__list_get, self, index, null_on_oob) RPolarsExpr$list_join <- function(separator, ignore_nulls) .Call(wrap__RPolarsExpr__list_join, self, separator, 
ignore_nulls) @@ -742,7 +742,7 @@ RPolarsExpr$arr_arg_min <- function() .Call(wrap__RPolarsExpr__arr_arg_min, self RPolarsExpr$arr_arg_max <- function() .Call(wrap__RPolarsExpr__arr_arg_max, self) -RPolarsExpr$arr_get <- function(index) .Call(wrap__RPolarsExpr__arr_get, self, index) +RPolarsExpr$arr_get <- function(index, null_on_oob) .Call(wrap__RPolarsExpr__arr_get, self, index, null_on_oob) RPolarsExpr$arr_join <- function(separator, ignore_nulls) .Call(wrap__RPolarsExpr__arr_join, self, separator, ignore_nulls) @@ -1024,7 +1024,7 @@ RPolarsExpr$str_slice <- function(offset, length) .Call(wrap__RPolarsExpr__str_s RPolarsExpr$str_explode <- function() .Call(wrap__RPolarsExpr__str_explode, self) -RPolarsExpr$str_parse_int <- function(radix, strict) .Call(wrap__RPolarsExpr__str_parse_int, self, radix, strict) +RPolarsExpr$str_parse_int <- function(base, strict) .Call(wrap__RPolarsExpr__str_parse_int, self, base, strict) RPolarsExpr$str_reverse <- function() .Call(wrap__RPolarsExpr__str_reverse, self) @@ -1176,7 +1176,7 @@ RPolarsLazyFrame$join_asof <- function(other, left_on, right_on, left_by, right_ RPolarsLazyFrame$join <- function(other, left_on, right_on, how, validate, join_nulls, suffix, allow_parallel, force_parallel) .Call(wrap__RPolarsLazyFrame__join, self, other, left_on, right_on, how, validate, join_nulls, suffix, allow_parallel, force_parallel) -RPolarsLazyFrame$sort_by_exprs <- function(by, dotdotdot, descending, nulls_last, maintain_order) .Call(wrap__RPolarsLazyFrame__sort_by_exprs, self, by, dotdotdot, descending, nulls_last, maintain_order) +RPolarsLazyFrame$sort_by_exprs <- function(by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsLazyFrame__sort_by_exprs, self, by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) RPolarsLazyFrame$melt <- function(id_vars, value_vars, value_name, variable_name, streamable) .Call(wrap__RPolarsLazyFrame__melt, self, id_vars, value_vars, value_name, 
variable_name, streamable) @@ -1198,7 +1198,7 @@ RPolarsLazyFrame$clone_in_rust <- function() .Call(wrap__RPolarsLazyFrame__clone RPolarsLazyFrame$with_context <- function(contexts) .Call(wrap__RPolarsLazyFrame__with_context, self, contexts) -RPolarsLazyFrame$rolling <- function(index_column, period, offset, closed, by, check_sorted) .Call(wrap__RPolarsLazyFrame__rolling, self, index_column, period, offset, closed, by, check_sorted) +RPolarsLazyFrame$rolling <- function(index_column, period, offset, closed, group_by, check_sorted) .Call(wrap__RPolarsLazyFrame__rolling, self, index_column, period, offset, closed, group_by, check_sorted) RPolarsLazyFrame$group_by_dynamic <- function(index_column, every, period, offset, label, include_boundaries, closed, by, start_by, check_sorted) .Call(wrap__RPolarsLazyFrame__group_by_dynamic, self, index_column, every, period, offset, label, include_boundaries, closed, by, start_by, check_sorted) @@ -1250,7 +1250,7 @@ RPolarsSeries$n_unique <- function() .Call(wrap__RPolarsSeries__n_unique, self) RPolarsSeries$name <- function() .Call(wrap__RPolarsSeries__name, self) -RPolarsSeries$sort_mut <- function(descending, nulls_last) .Call(wrap__RPolarsSeries__sort_mut, self, descending, nulls_last) +RPolarsSeries$sort <- function(descending, nulls_last, multithreaded) .Call(wrap__RPolarsSeries__sort, self, descending, nulls_last, multithreaded) RPolarsSeries$value_counts <- function(sort, parallel) .Call(wrap__RPolarsSeries__value_counts, self, sort, parallel) diff --git a/R/series__series.R b/R/series__series.R index 2c60d38ca..af2faa6d2 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -902,6 +902,7 @@ Series_set_sorted = function(descending = FALSE, in_place = FALSE) { if (in_place) invisible(NULL) else invisible(self) } + #' Sort a Series #' #' @param descending Sort in descending order. 
@@ -913,18 +914,18 @@ Series_set_sorted = function(descending = FALSE, in_place = FALSE) { #' @examples #' as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort() #' as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort(nulls_last = TRUE) -Series_sort = function(descending = FALSE, nulls_last = FALSE, in_place = FALSE) { - if (in_place && polars_options()$strictly_immutable) { +Series_sort = function(..., descending = FALSE, nulls_last = FALSE, in_place = FALSE) { + if (isTRUE(in_place) && polars_options()$strictly_immutable) { stop(paste( - "in_place sort breaks immutability, to enable mutable features run:\n", + "in place sort breaks immutability, to enable mutable features run:\n", "`options(polars.strictly_immutable = FALSE)`" )) } - if (!in_place) { + if (!isTRUE(in_place)) { self = self$clone() } - .pr$Series$sort_mut(self, descending, nulls_last) + .pr$Series$sort(self, descending, nulls_last) } #' Convert Series to DataFrame diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 4d7f55637..17b84b9c3 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -12,6 +12,7 @@ use crate::rpolarserr::{polars_to_rpolars_err, RPolarsErr, RResult}; use crate::utils::{r_result_list, try_f64_into_usize}; use extendr_api::prelude::*; use pl::{AsOfOptions, Duration, RollingGroupOptions}; +use polars::chunked_array::ops::SortMultipleOptions; use polars::frame::explode::MeltArgs; use polars::prelude as pl; @@ -462,6 +463,7 @@ impl RPolarsLazyFrame { descending: Robj, nulls_last: Robj, maintain_order: Robj, + multithreaded: Robj, ) -> RResult { let mut exprs = robj_to!(VecPLExprCol, by)?; let mut ddd = robj_to!(VecPLExprCol, dotdotdot)?; @@ -475,10 +477,19 @@ impl RPolarsLazyFrame { let nulls_last = robj_to!(bool, nulls_last)?; let maintain_order = robj_to!(bool, maintain_order)?; + let multithreaded = robj_to!(bool, multithreaded)?; Ok(self .0 .clone() - .sort_by_exprs(exprs, descending, nulls_last, maintain_order) + 
.sort_by_exprs( + exprs, + SortMultipleOptions { + descending, + nulls_last, + maintain_order, + multithreaded, + }, + ) .into()) } @@ -614,19 +625,19 @@ impl RPolarsLazyFrame { period: Robj, offset: Robj, closed: Robj, - by: Robj, + group_by: Robj, check_sorted: Robj, ) -> RResult { let index_column = robj_to!(PLExprCol, index_column)?; let period = Duration::parse(robj_to!(str, period)?); let offset = Duration::parse(robj_to!(str, offset)?); let closed_window = robj_to!(ClosedWindow, closed)?; - let by = robj_to!(VecPLExprCol, by)?; + let group_by = robj_to!(VecPLExprCol, group_by)?; let check_sorted = robj_to!(bool, check_sorted)?; - let lazy_gb = self.0.clone().group_by_rolling( + let lazy_gb = self.0.clone().rolling( index_column, - by, + group_by, RollingGroupOptions { index_column: "".into(), period, diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index c242144ff..772af4149 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -251,7 +251,7 @@ impl RPolarsExpr { pub fn sort(&self, descending: bool, nulls_last: bool) -> Self { self.clone() .0 - .sort_with(SortOptions { + .sort(SortOptions { descending, nulls_last, multithreaded: true, @@ -304,12 +304,28 @@ impl RPolarsExpr { .into()) } - pub fn sort_by(&self, by: Robj, descending: Robj) -> RResult { - let expr = RPolarsExpr(self.clone().0.sort_by( + pub fn sort_by( + &self, + by: Robj, + descending: Robj, + nulls_last: Robj, + maintain_order: Robj, + multithreaded: Robj, + ) -> RResult { + let descending = robj_to!(Vec, bool, descending)?; + let nulls_last = robj_to!(bool, nulls_last)?; + let maintain_order = robj_to!(bool, maintain_order)?; + let multithreaded = robj_to!(bool, multithreaded)?; + Ok((self.clone().0.sort_by( robj_to!(VecPLExprCol, by)?, - robj_to!(Vec, bool, descending)?, - )); - Ok(expr) + pl::SortMultipleOptions { + descending, + nulls_last, + maintain_order, + multithreaded, + }, + )) + .into()) } pub fn backward_fill(&self, limit: Nullable) -> Self { @@ 
-1082,8 +1098,13 @@ impl RPolarsExpr { .into()) } - fn list_get(&self, index: &RPolarsExpr) -> Self { - self.0.clone().list().get(index.clone().0).into() + fn list_get(&self, index: Robj, null_on_oob: Robj) -> RResult { + Ok(self + .0 + .clone() + .list() + .get(robj_to!(PLExprCol, index)?, robj_to!(bool, null_on_oob)?) + .into()) } fn list_join(&self, separator: Robj, ignore_nulls: Robj) -> RResult { @@ -1247,8 +1268,13 @@ impl RPolarsExpr { self.0.clone().arr().arg_max().into() } - fn arr_get(&self, index: Robj) -> RResult { - Ok(self.0.clone().arr().get(robj_to!(PLExprCol, index)?).into()) + fn arr_get(&self, index: Robj, null_on_oob: Robj) -> RResult { + Ok(self + .0 + .clone() + .arr() + .get(robj_to!(PLExprCol, index)?, robj_to!(bool, null_on_oob)?) + .into()) } fn arr_join(&self, separator: Robj, ignore_nulls: bool) -> RResult { @@ -2281,12 +2307,13 @@ impl RPolarsExpr { Ok(self.0.clone().str().explode().into()) } - pub fn str_parse_int(&self, radix: Robj, strict: Robj) -> RResult { + // TODO: rename to `str_to_integer` + pub fn str_parse_int(&self, base: Robj, strict: Robj) -> RResult { Ok(self .0 .clone() .str() - .to_integer(robj_to!(u32, radix)?, robj_to!(bool, strict)?) + .to_integer(robj_to!(PLExprCol, base)?, robj_to!(bool, strict)?) .with_fmt("str.parse_int") .into()) } diff --git a/src/rust/src/rdataframe/read_ipc.rs b/src/rust/src/rdataframe/read_ipc.rs index 88e119340..0ab4f601d 100644 --- a/src/rust/src/rdataframe/read_ipc.rs +++ b/src/rust/src/rdataframe/read_ipc.rs @@ -13,7 +13,7 @@ pub fn import_arrow_ipc( rechunk: Robj, row_name: Robj, row_index: Robj, - memmap: Robj, + memory_map: Robj, ) -> RResult { let args = ScanArgsIpc { n_rows: robj_to!(Option, usize, n_rows)?, @@ -22,7 +22,8 @@ pub fn import_arrow_ipc( row_index: robj_to!(Option, String, row_name)? 
.map(|name| robj_to!(u32, row_index).map(|offset| RowIndex { name, offset })) .transpose()?, - memmap: robj_to!(bool, memmap)?, + memory_map: robj_to!(bool, memory_map)?, + cloud_options: None, }; let lf = LazyFrame::scan_ipc(robj_to!(String, path)?, args) .map_err(crate::rpolarserr::polars_to_rpolars_err)?; diff --git a/src/rust/src/rdataframe/read_parquet.rs b/src/rust/src/rdataframe/read_parquet.rs index dbcba1332..392048b74 100644 --- a/src/rust/src/rdataframe/read_parquet.rs +++ b/src/rust/src/rdataframe/read_parquet.rs @@ -33,7 +33,10 @@ pub fn new_from_parquet( low_memory: robj_to!(bool, low_memory)?, cloud_options: None, use_statistics: robj_to!(bool, use_statistics)?, - hive_partitioning: robj_to!(bool, hive_partitioning)?, + hive_options: polars::io::HiveOptions { + enabled: robj_to!(bool, hive_partitioning)?, + schema: None, // TODO: implement a option to set this + }, }; pl::LazyFrame::scan_parquet(robj_to!(String, path)?, args) diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 72bb86e23..a3b9e71c6 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -7,6 +7,7 @@ use crate::utils::robj_to_rchoice; use crate::RFnSignature; use crate::CONFIG; use extendr_api::prelude::*; +use polars::chunked_array::ops::SortMultipleOptions; use polars::prelude as pl; use std::result::Result; @@ -385,10 +386,25 @@ fn arg_where(condition: Robj) -> RResult { } #[extendr] -fn arg_sort_by(exprs: Robj, descending: Robj) -> RResult { +fn arg_sort_by( + exprs: Robj, + descending: Robj, + nulls_last: Robj, + multithreaded: Robj, + maintain_order: Robj, +) -> RResult { + let descending = robj_to!(Vec, bool, descending)?; + let nulls_last = robj_to!(bool, nulls_last)?; + let multithreaded = robj_to!(bool, multithreaded)?; + let maintain_order = robj_to!(bool, maintain_order)?; Ok(pl::arg_sort_by( robj_to!(VecPLExprCol, exprs)?, - &robj_to!(Vec, bool, descending)?, + SortMultipleOptions { + descending, + nulls_last, + multithreaded, + maintain_order, + }, ) 
.into()) } diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index 212690912..a72750f21 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -105,8 +105,25 @@ impl RPolarsSeries { self.0.name() } - pub fn sort_mut(&mut self, descending: bool, nulls_last: bool) -> Self { - RPolarsSeries(self.0.sort(descending, nulls_last)) + pub fn sort( + &mut self, + descending: Robj, + nulls_last: Robj, + multithreaded: Robj, + ) -> RResult { + let descending = robj_to!(bool, descending)?; + let nulls_last = robj_to!(bool, nulls_last)?; + let multithreaded = robj_to!(bool, multithreaded)?; + Ok(self + .0 + .sort( + pl::SortOptions::default() + .with_order_descending(descending) + .with_nulls_last(nulls_last) + .with_multithreaded(multithreaded), + ) + .map_err(polars_to_rpolars_err)? + .into()) } pub fn value_counts( From bcf5e2e263f2749fa134714ef9fa2ba41f916fbb Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sun, 14 Apr 2024 11:44:54 +0000 Subject: [PATCH 03/14] fix: more fixes for R tests --- DESCRIPTION | 2 +- R/expr__array.R | 11 ++--- R/expr__expr.R | 28 +++++++----- R/expr__list.R | 16 ++++--- R/extendr-wrappers.R | 2 +- R/lazyframe__lazy.R | 11 ++--- R/series__series.R | 35 ++++++++------- man/DataFrame_sort.Rd | 4 +- man/ExprArr_get.Rd | 8 ++-- man/ExprArr_sort.Rd | 5 +-- man/ExprList_get.Rd | 16 ++++--- man/Expr_arg_sort.Rd | 5 +-- man/Expr_set_sorted.Rd | 4 +- man/Expr_sort.Rd | 9 ++-- man/Expr_sort_by.Rd | 21 +++++++-- man/LazyFrame_sort.Rd | 11 +++-- man/Series_set_sorted.Rd | 6 ++- man/Series_sort.Rd | 18 ++++++-- man/pl_arg_sort_by.Rd | 3 +- src/rust/src/lazy/dsl.rs | 4 +- tests/testthat/_snaps/after-wrappers.md | 6 +-- tests/testthat/test-as_polars.R | 4 +- tests/testthat/test-concat.R | 2 +- tests/testthat/test-dataframe.R | 24 +++++----- tests/testthat/test-expr_array.R | 59 +++++++++++++------------ tests/testthat/test-expr_expr.R | 14 +++--- 26 files changed, 189 insertions(+), 139 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 
b86f2f53e..178bda6cf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -118,5 +118,5 @@ Collate: 'zzz.R' Config/rextendr/version: 0.3.1 VignetteBuilder: knitr -Config/polars/LibVersion: 0.38.2 +Config/polars/LibVersion: 0.39.0 Config/polars/RustToolchainVersion: nightly-2024-03-28 diff --git a/R/expr__array.R b/R/expr__array.R index 92f11cb0e..6cc7f486a 100644 --- a/R/expr__array.R +++ b/R/expr__array.R @@ -136,14 +136,11 @@ ExprArr_unique = function(maintain_order = FALSE) .pr$Expr$arr_unique(self, main #' #' This allows to extract one value per array only. #' +#' @inherit ExprList_get return #' @param index An Expr or something coercible to an Expr, that must return a #' single index. Values are 0-indexed (so index 0 would return the first item #' of every sub-array) and negative values start from the end (index `-1` -#' returns the last item). If the index is out of bounds, it will return a -#' `null`. Strings are parsed as column names. -#' -#' @return Expr -#' @aliases arr_get +#' returns the last item). #' @examples #' df = pl$DataFrame( #' values = list(c(1, 2), c(3, 4), c(NA_real_, 6)), @@ -156,8 +153,8 @@ ExprArr_unique = function(maintain_order = FALSE) .pr$Expr$arr_unique(self, main #' val_minus_1 = pl$col("values")$arr$get(-1), #' val_oob = pl$col("values")$arr$get(10) #' ) -ExprArr_get = function(index) { - .pr$Expr$arr_get(self, index) |> +ExprArr_get = function(index, ..., null_on_oob = TRUE) { + .pr$Expr$arr_get(self, index, null_on_oob) |> unwrap("in $arr$get():") } diff --git a/R/expr__expr.R b/R/expr__expr.R index 6a8c11874..865aa78c0 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -1377,16 +1377,13 @@ Expr_mode = use_extendr_wrapper #' #' Sort this column. If used in a groupby context, the groups are sorted. #' -#' @param ... Ignored -#' @param descending Sort in descending order. When sorting by multiple columns, -#' can be specified per column by passing a vector of booleans. -#' @param nulls_last If `TRUE`, place nulls values last. 
+#' @inheritParams Series_sort #' @return Expr #' @examples #' pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$ #' with_columns(sorted = pl$col("a")$sort()) Expr_sort = function(..., descending = FALSE, nulls_last = FALSE) { - .pr$Expr$sort(self, descending, nulls_last) + .pr$Expr$sort_with(self, descending, nulls_last) } #' Top k values @@ -1478,6 +1475,7 @@ Expr_search_sorted = function(element) { .pr$Expr$search_sorted(self, wrap_e(element)) } +# TODO: rewrite `by` to `...` #' Sort Expr by order of others #' #' Sort this column by the ordering of another column, or multiple other columns. @@ -1485,7 +1483,9 @@ Expr_search_sorted = function(element) { #' #' @param by One expression or a list of expressions and/or strings (interpreted #' as column names). -#' @inheritParams Expr_sort +#' @param maintain_order A logical to indicate whether the order should be maintained +#' if elements are equal. +#' @inheritParams Series_sort #' @return Expr #' @examples #' df = pl$DataFrame( @@ -1511,12 +1511,19 @@ Expr_search_sorted = function(element) { #' df$with_columns( #' sorted = pl$col("group")$sort_by(pl$col("value1")$sort(descending = TRUE)) #' ) -Expr_sort_by = function(by, descending = FALSE) { +Expr_sort_by = function( + by, ..., descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + maintain_order = FALSE) { .pr$Expr$sort_by( self, wrap_elist_result(by, str_to_lit = FALSE), - result(descending) - ) |> unwrap("in $sort_by:") + descending, + nulls_last, + maintain_order, + multithreaded + ) |> unwrap("in $sort_by():") } #' Gather values by index @@ -3143,6 +3150,7 @@ Expr_cumulative_eval = function(expr, min_periods = 1L, parallel = FALSE) { #' This enables downstream code to use fast paths for sorted arrays. WARNING: #' this doesn't check whether the data is actually sorted, you have to ensure of #' that yourself. +#' @param ... Ignored. #' @param descending Sort the columns in descending order. 
#' @return Expr #' @examples @@ -3154,7 +3162,7 @@ Expr_cumulative_eval = function(expr, min_periods = 1L, parallel = FALSE) { #' s2 = pl$select(pl$lit(c(1, 3, 2, 4))$set_sorted()$alias("a"))$get_column("a") #' s2$sort() #' s2$flags # returns TRUE while it's not actually sorted -Expr_set_sorted = function(descending = FALSE) { +Expr_set_sorted = function(..., descending = FALSE) { self$map_batches(\(s) { .pr$Series$set_sorted_mut(s, descending) # use private to bypass mut protection s diff --git a/R/expr__list.R b/R/expr__list.R index f89345440..d6e2adb8b 100644 --- a/R/expr__list.R +++ b/R/expr__list.R @@ -112,11 +112,12 @@ ExprList_concat = function(other) { #' @param index An Expr or something coercible to an Expr, that must return a #' single index. Values are 0-indexed (so index 0 would return the first item #' of every sublist) and negative values start from the end (index `-1` -#' returns the last item). If the index is out of bounds, it will return a -#' `null`. Strings are parsed as column names. -#' -#' @return Expr -#' @aliases list_get +#' returns the last item). +#' @param ... Ignored. 
+#' @param null_on_oob A logical to determine the behavior if an index is out of bounds: +#' - `TRUE` (default): set as `null` +#' - `FALSE`: raise an error +#' @return [Expr][Expr_class] #' @examples #' df = pl$DataFrame( #' values = list(c(2, 2, NA), c(1, 2, 3), NA_real_, NULL), @@ -128,7 +129,10 @@ ExprList_concat = function(other) { #' val_minus_1 = pl$col("values")$list$get(-1), #' val_oob = pl$col("values")$list$get(10) #' ) -ExprList_get = function(index) .pr$Expr$list_get(self, wrap_e(index, str_to_lit = FALSE)) +ExprList_get = function(index, ..., null_on_oob = TRUE) { + .pr$Expr$list_get(self, index, null_on_oob) |> + unwrap("in $list$get():") +} #' Get several values by index in a list #' diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 06bf14d2b..80d99652a 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -484,7 +484,7 @@ RPolarsExpr$to_physical <- function() .Call(wrap__RPolarsExpr__to_physical, self RPolarsExpr$cast <- function(data_type, strict) .Call(wrap__RPolarsExpr__cast, self, data_type, strict) -RPolarsExpr$sort <- function(descending, nulls_last) .Call(wrap__RPolarsExpr__sort, self, descending, nulls_last) +RPolarsExpr$sort_with <- function(descending, nulls_last) .Call(wrap__RPolarsExpr__sort_with, self, descending, nulls_last) RPolarsExpr$arg_sort <- function(descending, nulls_last) .Call(wrap__RPolarsExpr__arg_sort, self, descending, nulls_last) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 91e47e8a3..3f16adacf 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1296,15 +1296,15 @@ LazyFrame_join = function( } -#' Sort a LazyFrame -#' @description Sort by one or more Expressions. +#' Sort the LazyFrame by the given columns +#' +#' @inheritParams Series_sort #' @param by Column(s) to sort by. Can be character vector of column names, #' a list of Expr(s) or a list with a mix of Expr(s) and column names. #' @param ... More columns to sort by as above but provided one Expr per argument. 
#' @param descending Logical. Sort in descending order (default is `FALSE`). This must be #' either of length 1 or a logical vector of the same length as the number of #' Expr(s) specified in `by` and `...`. -#' @param nulls_last Logical. Place `NULL`s at the end? Default is `FALSE`. #' @param maintain_order Whether the order should be maintained if elements are #' equal. If `TRUE`, streaming is not possible and performance might be worse #' since this requires a stable search. @@ -1326,10 +1326,11 @@ LazyFrame_sort = function( ..., descending = FALSE, nulls_last = FALSE, - maintain_order = FALSE) { + maintain_order = FALSE, + multithreaded = TRUE) { .pr$LazyFrame$sort_by_exprs( self, unpack_list(by, .context = "in $sort():"), err_on_named_args(...), - descending, nulls_last, maintain_order + descending, nulls_last, maintain_order, multithreaded ) |> unwrap("in $sort():") } diff --git a/R/series__series.R b/R/series__series.R index af2faa6d2..ebb496884 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -874,7 +874,7 @@ Series_is_sorted = function(descending = FALSE) { #' Set a sorted flag on a Series #' #' @inheritParams Expr_set_sorted -#' @param in_place If `TRUE`, this will set the flag mutably and return NULL. +#' @param in_place If `TRUE`, this will set the flag mutably and return `NULL`. #' Remember to use `options(polars.strictly_immutable = FALSE)` before using #' this parameter, otherwise an error will occur. If `FALSE` (default), it will #' return a cloned Series with the flag. 
@@ -886,46 +886,51 @@ Series_is_sorted = function(descending = FALSE) { #' @examples #' s = as_polars_series(1:4)$set_sorted() #' s$flags -Series_set_sorted = function(descending = FALSE, in_place = FALSE) { - if (in_place && polars_options()$strictly_immutable) { - stop(paste( +Series_set_sorted = function(..., descending = FALSE, in_place = FALSE) { + if (isTRUE(in_place) && polars_options()$strictly_immutable) { + Err_plain( "Using `in_place = TRUE` in `set_sorted()` breaks immutability. To enable mutable features run:\n", "`options(polars.strictly_immutable = FALSE)`" - )) + ) |> + unwrap("in $set_sorted():") } - if (!in_place) { + if (!isTRUE(in_place)) { self = self$clone() } .pr$Series$set_sorted_mut(self, descending) - if (in_place) invisible(NULL) else invisible(self) + if (isTRUE(in_place)) invisible(NULL) else invisible(self) } #' Sort a Series #' -#' @param descending Sort in descending order. -#' @inheritParams Expr_sort #' @inheritParams Series_set_sorted -#' +#' @param descending A logical. If `TRUE`, sort in descending order. +#' @param nulls_last A logical. If `TRUE`, place `null` values last insead of first. +#' @param multithreaded A logical. If `TRUE`, sort using multiple threads. 
#' @return [Series][Series_class] -#' #' @examples #' as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort() #' as_polars_series(c(1.5, NA, 1, NaN, Inf, -Inf))$sort(nulls_last = TRUE) -Series_sort = function(..., descending = FALSE, nulls_last = FALSE, in_place = FALSE) { +Series_sort = function( + ..., descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, + in_place = FALSE) { + uw = \(res) unwrap(res, "in $sort():") if (isTRUE(in_place) && polars_options()$strictly_immutable) { - stop(paste( + Err_plain( "in place sort breaks immutability, to enable mutable features run:\n", "`options(polars.strictly_immutable = FALSE)`" - )) + ) |> + uw() } if (!isTRUE(in_place)) { self = self$clone() } - .pr$Series$sort(self, descending, nulls_last) + .pr$Series$sort(self, descending, nulls_last, multithreaded) |> + uw() } #' Convert Series to DataFrame diff --git a/man/DataFrame_sort.Rd b/man/DataFrame_sort.Rd index a352a9640..7cf630a77 100644 --- a/man/DataFrame_sort.Rd +++ b/man/DataFrame_sort.Rd @@ -22,7 +22,7 @@ a list of Expr(s) or a list with a mix of Expr(s) and column names.} either of length 1 or a logical vector of the same length as the number of Expr(s) specified in \code{by} and \code{...}.} -\item{nulls_last}{Logical. Place \code{NULL}s at the end? Default is \code{FALSE}.} +\item{nulls_last}{A logical. If \code{TRUE}, place \code{null} values last insead of first.} \item{maintain_order}{Whether the order should be maintained if elements are equal. If \code{TRUE}, streaming is not possible and performance might be worse @@ -32,7 +32,7 @@ since this requires a stable search.} DataFrame } \description{ -Sort by one or more Expressions. 
+Sort a DataFrame } \examples{ df = mtcars diff --git a/man/ExprArr_get.Rd b/man/ExprArr_get.Rd index 91c8d787c..bc199b2d2 100644 --- a/man/ExprArr_get.Rd +++ b/man/ExprArr_get.Rd @@ -2,20 +2,18 @@ % Please edit documentation in R/expr__array.R \name{ExprArr_get} \alias{ExprArr_get} -\alias{arr_get} \title{Get the value by index in an array} \usage{ -ExprArr_get(index) +ExprArr_get(index, ..., null_on_oob = TRUE) } \arguments{ \item{index}{An Expr or something coercible to an Expr, that must return a single index. Values are 0-indexed (so index 0 would return the first item of every sub-array) and negative values start from the end (index \code{-1} -returns the last item). If the index is out of bounds, it will return a -\code{null}. Strings are parsed as column names.} +returns the last item).} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ This allows to extract one value per array only. diff --git a/man/ExprArr_sort.Rd b/man/ExprArr_sort.Rd index 2acaabfa2..ffdd3404e 100644 --- a/man/ExprArr_sort.Rd +++ b/man/ExprArr_sort.Rd @@ -8,10 +8,9 @@ ExprArr_sort(descending = FALSE, nulls_last = FALSE) } \arguments{ -\item{descending}{Sort in descending order. When sorting by multiple columns, -can be specified per column by passing a vector of booleans.} +\item{descending}{A logical. If \code{TRUE}, sort in descending order.} -\item{nulls_last}{If \code{TRUE}, place nulls values last.} +\item{nulls_last}{A logical. 
If \code{TRUE}, place \code{null} values last insead of first.} } \description{ Sort values in an array diff --git a/man/ExprList_get.Rd b/man/ExprList_get.Rd index 806ba00d5..1fcbcb831 100644 --- a/man/ExprList_get.Rd +++ b/man/ExprList_get.Rd @@ -2,20 +2,26 @@ % Please edit documentation in R/expr__list.R \name{ExprList_get} \alias{ExprList_get} -\alias{list_get} \title{Get the value by index in a list} \usage{ -ExprList_get(index) +ExprList_get(index, ..., null_on_oob = TRUE) } \arguments{ \item{index}{An Expr or something coercible to an Expr, that must return a single index. Values are 0-indexed (so index 0 would return the first item of every sublist) and negative values start from the end (index \code{-1} -returns the last item). If the index is out of bounds, it will return a -\code{null}. Strings are parsed as column names.} +returns the last item).} + +\item{...}{Ignored.} + +\item{null_on_oob}{A logical to determine the behavior if an index is out of bounds: +\itemize{ +\item \code{TRUE} (default): set as \code{null} +\item \code{FALSE}: raise an error +}} } \value{ -Expr +\link[=Expr_class]{Expr} } \description{ This allows to extract one value per list only. To extract several values by diff --git a/man/Expr_arg_sort.Rd b/man/Expr_arg_sort.Rd index f045d3dad..a4fec9d9c 100644 --- a/man/Expr_arg_sort.Rd +++ b/man/Expr_arg_sort.Rd @@ -7,10 +7,9 @@ Expr_arg_sort(descending = FALSE, nulls_last = FALSE) } \arguments{ -\item{descending}{Sort in descending order. When sorting by multiple columns, -can be specified per column by passing a vector of booleans.} +\item{descending}{A logical. If \code{TRUE}, sort in descending order.} -\item{nulls_last}{If \code{TRUE}, place nulls values last.} +\item{nulls_last}{A logical. 
If \code{TRUE}, place \code{null} values last insead of first.} } \value{ Expr diff --git a/man/Expr_set_sorted.Rd b/man/Expr_set_sorted.Rd index f89d42bc3..38e784f80 100644 --- a/man/Expr_set_sorted.Rd +++ b/man/Expr_set_sorted.Rd @@ -4,9 +4,11 @@ \alias{Expr_set_sorted} \title{Flag an Expr as "sorted"} \usage{ -Expr_set_sorted(descending = FALSE) +Expr_set_sorted(..., descending = FALSE) } \arguments{ +\item{...}{Ignored.} + \item{descending}{Sort the columns in descending order.} } \value{ diff --git a/man/Expr_sort.Rd b/man/Expr_sort.Rd index d9db04227..214501ec3 100644 --- a/man/Expr_sort.Rd +++ b/man/Expr_sort.Rd @@ -4,13 +4,14 @@ \alias{Expr_sort} \title{Sort an Expr} \usage{ -Expr_sort(descending = FALSE, nulls_last = FALSE) +Expr_sort(..., descending = FALSE, nulls_last = FALSE) } \arguments{ -\item{descending}{Sort in descending order. When sorting by multiple columns, -can be specified per column by passing a vector of booleans.} +\item{...}{Ignored.} -\item{nulls_last}{If \code{TRUE}, place nulls values last.} +\item{descending}{A logical. If \code{TRUE}, sort in descending order.} + +\item{nulls_last}{A logical. If \code{TRUE}, place \code{null} values last insead of first.} } \value{ Expr diff --git a/man/Expr_sort_by.Rd b/man/Expr_sort_by.Rd index 3cbf86644..945d5542c 100644 --- a/man/Expr_sort_by.Rd +++ b/man/Expr_sort_by.Rd @@ -4,14 +4,29 @@ \alias{Expr_sort_by} \title{Sort Expr by order of others} \usage{ -Expr_sort_by(by, descending = FALSE) +Expr_sort_by( + by, + ..., + descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + maintain_order = FALSE +) } \arguments{ \item{by}{One expression or a list of expressions and/or strings (interpreted as column names).} -\item{descending}{Sort in descending order. When sorting by multiple columns, -can be specified per column by passing a vector of booleans.} +\item{...}{Ignored.} + +\item{descending}{A logical. If \code{TRUE}, sort in descending order.} + +\item{nulls_last}{A logical. 
If \code{TRUE}, place \code{null} values last insead of first.} + +\item{multithreaded}{A logical. If \code{TRUE}, sort using multiple threads.} + +\item{maintain_order}{A logical to indicate whether the order should be maintained +if elements are equal.} } \value{ Expr diff --git a/man/LazyFrame_sort.Rd b/man/LazyFrame_sort.Rd index d20812782..29c4d96fc 100644 --- a/man/LazyFrame_sort.Rd +++ b/man/LazyFrame_sort.Rd @@ -2,14 +2,15 @@ % Please edit documentation in R/lazyframe__lazy.R \name{LazyFrame_sort} \alias{LazyFrame_sort} -\title{Sort a LazyFrame} +\title{Sort the LazyFrame by the given columns} \usage{ LazyFrame_sort( by, ..., descending = FALSE, nulls_last = FALSE, - maintain_order = FALSE + maintain_order = FALSE, + multithreaded = TRUE ) } \arguments{ @@ -22,17 +23,19 @@ a list of Expr(s) or a list with a mix of Expr(s) and column names.} either of length 1 or a logical vector of the same length as the number of Expr(s) specified in \code{by} and \code{...}.} -\item{nulls_last}{Logical. Place \code{NULL}s at the end? Default is \code{FALSE}.} +\item{nulls_last}{A logical. If \code{TRUE}, place \code{null} values last insead of first.} \item{maintain_order}{Whether the order should be maintained if elements are equal. If \code{TRUE}, streaming is not possible and performance might be worse since this requires a stable search.} + +\item{multithreaded}{A logical. If \code{TRUE}, sort using multiple threads.} } \value{ LazyFrame } \description{ -Sort by one or more Expressions. 
+Sort the LazyFrame by the given columns } \examples{ df = mtcars diff --git a/man/Series_set_sorted.Rd b/man/Series_set_sorted.Rd index cd8f8b86d..a88e82cfb 100644 --- a/man/Series_set_sorted.Rd +++ b/man/Series_set_sorted.Rd @@ -4,12 +4,14 @@ \alias{Series_set_sorted} \title{Set a sorted flag on a Series} \usage{ -Series_set_sorted(descending = FALSE, in_place = FALSE) +Series_set_sorted(..., descending = FALSE, in_place = FALSE) } \arguments{ +\item{...}{Ignored.} + \item{descending}{Sort the columns in descending order.} -\item{in_place}{If \code{TRUE}, this will set the flag mutably and return NULL. +\item{in_place}{If \code{TRUE}, this will set the flag mutably and return \code{NULL}. Remember to use \code{options(polars.strictly_immutable = FALSE)} before using this parameter, otherwise an error will occur. If \code{FALSE} (default), it will return a cloned Series with the flag.} diff --git a/man/Series_sort.Rd b/man/Series_sort.Rd index 829c541ee..6d531111b 100644 --- a/man/Series_sort.Rd +++ b/man/Series_sort.Rd @@ -4,14 +4,24 @@ \alias{Series_sort} \title{Sort a Series} \usage{ -Series_sort(descending = FALSE, nulls_last = FALSE, in_place = FALSE) +Series_sort( + ..., + descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + in_place = FALSE +) } \arguments{ -\item{descending}{Sort in descending order.} +\item{...}{Ignored.} -\item{nulls_last}{If \code{TRUE}, place nulls values last.} +\item{descending}{A logical. If \code{TRUE}, sort in descending order.} -\item{in_place}{If \code{TRUE}, this will set the flag mutably and return NULL. +\item{nulls_last}{A logical. If \code{TRUE}, place \code{null} values last insead of first.} + +\item{multithreaded}{A logical. If \code{TRUE}, sort using multiple threads.} + +\item{in_place}{If \code{TRUE}, this will set the flag mutably and return \code{NULL}. Remember to use \code{options(polars.strictly_immutable = FALSE)} before using this parameter, otherwise an error will occur. 
If \code{FALSE} (default), it will return a cloned Series with the flag.} diff --git a/man/pl_arg_sort_by.Rd b/man/pl_arg_sort_by.Rd index 46ad29459..86cd764c0 100644 --- a/man/pl_arg_sort_by.Rd +++ b/man/pl_arg_sort_by.Rd @@ -10,8 +10,7 @@ pl_arg_sort_by(..., descending = FALSE) \item{...}{Column(s) to arg sort by. Can be Expr(s) or something coercible to Expr(s). Strings are parsed as column names.} -\item{descending}{Sort in descending order. When sorting by multiple columns, -can be specified per column by passing a vector of booleans.} +\item{descending}{A logical. If \code{TRUE}, sort in descending order.} } \value{ Expr diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 772af4149..fee626b35 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -248,7 +248,7 @@ impl RPolarsExpr { .into() } - pub fn sort(&self, descending: bool, nulls_last: bool) -> Self { + pub fn sort_with(&self, descending: bool, nulls_last: bool) -> Self { self.clone() .0 .sort(SortOptions { @@ -1103,7 +1103,7 @@ impl RPolarsExpr { .0 .clone() .list() - .get(robj_to!(PLExprCol, index)?, robj_to!(bool, null_on_oob)?) + .get(robj_to!(PLExpr, index)?, robj_to!(bool, null_on_oob)?) 
.into()) } diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index d86dcf345..53f46bb1c 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -403,8 +403,8 @@ [253] "shrink_dtype" "shuffle" [255] "sign" "sin" [257] "sinh" "skew" - [259] "slice" "sort" - [261] "sort_by" "std" + [259] "slice" "sort_by" + [261] "sort_with" "std" [263] "str_base64_decode" "str_base64_encode" [265] "str_concat" "str_contains" [267] "str_contains_any" "str_count_matches" @@ -715,7 +715,7 @@ [35] "rem" "rename_mut" [37] "rep" "set_sorted_mut" [39] "shape" "sleep" - [41] "sort_mut" "std" + [41] "sort" "std" [43] "struct_fields" "sub" [45] "sum" "to_fmt_char" [47] "to_frame" "to_r" diff --git a/tests/testthat/test-as_polars.R b/tests/testthat/test-as_polars.R index 3427b2b8c..8e727d9b8 100644 --- a/tests/testthat/test-as_polars.R +++ b/tests/testthat/test-as_polars.R @@ -252,14 +252,14 @@ test_that("from arrow Table and ChunkedArray", { lapply(at$columns, \(x) x$num_chunks) ) - expect_grepl_error(expect_identical( + expect_identical( as_polars_df.ArrowTabular(at, rechunk = TRUE)$ select(pl$all()$map_batches(\(s) s$chunk_lengths()))$ to_list() |> lapply(length) |> unname(), lapply(at$columns, \(x) x$num_chunks) - )) + ) # #not supported yet diff --git a/tests/testthat/test-concat.R b/tests/testthat/test-concat.R index 339186c07..5852a6606 100644 --- a/tests/testthat/test-concat.R +++ b/tests/testthat/test-concat.R @@ -45,7 +45,7 @@ test_that("concat dataframe", { ) # type 'relaxed' vertical concatenation is not allowed by default - expect_grepl_error(pl$concat(l_ver[[1L]], pl$DataFrame(a = 2, b = 42L), how = "vertical"), "data types don't match") + expect_grepl_error(pl$concat(l_ver[[1L]], pl$DataFrame(a = 2, b = 42L), how = "vertical"), "cannot extend/append Int32 with Float64") # check lazy eager is identical l_ver_lazy = lapply(l_ver, \(df) df$lazy()) diff --git a/tests/testthat/test-dataframe.R 
b/tests/testthat/test-dataframe.R index 9f6ce63eb..b2e8d4ae7 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -798,34 +798,34 @@ test_that("as_data_frame (backward compatibility)", { test_that("sort", { df = pl$DataFrame(mtcars) - w = df$sort("mpg")$to_data_frame() - x = df$sort(pl$col("mpg"))$to_data_frame() + w = df$sort("mpg", maintain_order = TRUE)$to_data_frame() + x = df$sort(pl$col("mpg"), maintain_order = TRUE)$to_data_frame() y = mtcars[order(mtcars$mpg), ] expect_equal(x, y, ignore_attr = TRUE) - w = df$sort(pl$col("cyl"), pl$col("mpg"))$to_data_frame() - x = df$sort("cyl", "mpg")$to_data_frame() - y = df$sort(c("cyl", "mpg"))$to_data_frame() + w = df$sort(pl$col("cyl"), pl$col("mpg"), maintain_order = TRUE)$to_data_frame() + x = df$sort("cyl", "mpg", maintain_order = TRUE)$to_data_frame() + y = df$sort(c("cyl", "mpg"), maintain_order = TRUE)$to_data_frame() z = mtcars[order(mtcars$cyl, mtcars$mpg), ] expect_equal(w, x, ignore_attr = TRUE) expect_equal(w, y, ignore_attr = TRUE) expect_equal(w, z, ignore_attr = TRUE) # expr: one increasing and one decreasing - x = df$sort(-pl$col("cyl"), pl$col("hp"))$to_data_frame() + x = df$sort(-pl$col("cyl"), pl$col("hp"), maintain_order = TRUE)$to_data_frame() y = mtcars[order(-mtcars$cyl, mtcars$hp), ] expect_equal(x, y, ignore_attr = TRUE) # descending arg - w = df$sort("cyl", "mpg", descending = TRUE)$to_data_frame() - x = df$sort(c("cyl", "mpg"), descending = TRUE)$to_data_frame() + w = df$sort("cyl", "mpg", descending = TRUE, maintain_order = TRUE)$to_data_frame() + x = df$sort(c("cyl", "mpg"), descending = TRUE, maintain_order = TRUE)$to_data_frame() y = mtcars[order(-mtcars$cyl, -mtcars$mpg), ] expect_equal(w, x, ignore_attr = TRUE) expect_equal(w, y, ignore_attr = TRUE) # descending arg: vector of boolean - w = df$sort("cyl", "mpg", descending = c(TRUE, FALSE))$to_data_frame() - x = df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE))$to_data_frame() + w = df$sort("cyl", 
"mpg", descending = c(TRUE, FALSE), maintain_order = TRUE)$to_data_frame() + x = df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE), maintain_order = TRUE)$to_data_frame() y = mtcars[order(-mtcars$cyl, mtcars$mpg), ] expect_equal(w, x, ignore_attr = TRUE) expect_equal(w, y, ignore_attr = TRUE) @@ -834,8 +834,8 @@ test_that("sort", { df = mtcars df$mpg[1] = NA df = pl$DataFrame(df) - a = df$sort("mpg", nulls_last = TRUE)$to_data_frame() - b = df$sort("mpg", nulls_last = FALSE)$to_data_frame() + a = df$sort("mpg", nulls_last = TRUE, maintain_order = TRUE)$to_data_frame() + b = df$sort("mpg", nulls_last = FALSE, maintain_order = TRUE)$to_data_frame() expect_true(is.na(a$mpg[32])) expect_true(is.na(b$mpg[1])) diff --git a/tests/testthat/test-expr_array.R b/tests/testthat/test-expr_array.R index 7c4bab9e6..1600561a7 100644 --- a/tests/testthat/test-expr_array.R +++ b/tests/testthat/test-expr_array.R @@ -17,37 +17,38 @@ test_that("arr$sum", { ) }) -test_that("arr$max and arr$min", { - skip_if_not(polars_info()$features$nightly) +# TODO: reenable if the upstream issue is fixed +# test_that("arr$max and arr$min", { +# skip_if_not(polars_info()$features$nightly) - df = pl$DataFrame( - ints = list(1:2, c(1L, NA_integer_), c(NA_integer_, NA_integer_)), - floats = list(c(1, 2), c(1, NA_real_), c(NA_real_, NA_real_)), - schema = list( - ints = pl$Array(pl$Int32, 2), - floats = pl$Array(pl$Float32, 2) - ) - ) - # max --- - expect_identical( - df$select(pl$col("ints")$arr$max())$to_list(), - list(ints = c(2L, 1L, NA_integer_)) - ) - expect_identical( - df$select(pl$col("floats")$arr$max())$to_list(), - list(floats = c(2, 1, NA_real_)) - ) +# df = pl$DataFrame( +# ints = list(1:2, c(1L, NA_integer_), c(NA_integer_, NA_integer_)), +# floats = list(c(1, 2), c(1, NA_real_), c(NA_real_, NA_real_)), +# schema = list( +# ints = pl$Array(pl$Int32, 2), +# floats = pl$Array(pl$Float32, 2) +# ) +# ) +# # max --- +# expect_identical( +# df$select(pl$col("ints")$arr$max())$to_list(), +# 
list(ints = c(2L, 1L, NA_integer_)) +# ) +# expect_identical( +# df$select(pl$col("floats")$arr$max())$to_list(), +# list(floats = c(2, 1, NA_real_)) +# ) - # min --- - expect_identical( - df$select(pl$col("ints")$arr$min())$to_list(), - list(ints = c(1L, 1L, NA_integer_)) - ) - expect_identical( - df$select(pl$col("floats")$arr$min())$to_list(), - list(floats = c(1, 1, NA_real_)) - ) -}) +# # min --- +# expect_identical( +# df$select(pl$col("ints")$arr$min())$to_list(), +# list(ints = c(1L, 1L, NA_integer_)) +# ) +# expect_identical( +# df$select(pl$col("floats")$arr$min())$to_list(), +# list(floats = c(1, 1, NA_real_)) +# ) +# }) test_that("arr$max and arr$min error if the nightly feature is false", { skip_if(polars_info()$features$nightly) diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index b3e4b39bb..b5ef6d216 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -705,7 +705,7 @@ test_that("Expr_append", { expect_grepl_error( pl$DataFrame(list())$select(pl$lit("Bob")$append(FALSE, upcast = FALSE)), - "match" + "cannot extend/append String with Boolean" ) }) @@ -998,8 +998,8 @@ test_that("sort_by", { ) expect_grepl_error(pl$lit(1:4)$sort_by(1)$to_r(), "different length") - expect_grepl_error(pl$lit(1:4)$sort_by("blop")$to_r(), "column 'blop' not available in 'DataFrame'") - expect_grepl_error(pl$lit(1:4)$sort_by("blop")$to_r(), "column 'blop' not available in 'DataFrame'") + expect_grepl_error(pl$lit(1:4)$sort_by("blop")$to_r(), "field not found") + expect_grepl_error(pl$lit(1:4)$sort_by("blop")$to_r(), "field not found") expect_grepl_error(pl$lit(1:4)$sort_by(df)$to_r(), "not convertible into.* Expr") expect_grepl_error(pl$lit(1:4)$sort_by(df)$to_r(), "not convertible into.* Expr") @@ -2229,10 +2229,10 @@ test_that("entropy", { r_entropy(1:3, base = 2, normalize = FALSE) ) - # TODO: https://github.com/pola-rs/polars/issues/15350 - pl$select(pl$lit(c("a", "b", "b", "c", "c", "c"))$entropy(base = 
2)) - - pl$lit(c("a", "a", "a"))$entropy(base = 2, normalize = FALSE)$to_r() + expect_grepl_error( + pl$select(pl$lit(c("a", "b", "b", "c", "c", "c"))$entropy(base = 2)), + "expected numerical input" + ) }) From 8d46fbee2171badfe214c465a6e3785d9aca69a4 Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sun, 14 Apr 2024 13:32:36 +0000 Subject: [PATCH 04/14] fix: try to fix windows build issue --- src/Makevars.win | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars.win b/src/Makevars.win index 97f244414..e2e875b4c 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -6,7 +6,7 @@ LIBNAME = libr_polars.a TARGET_DIR = $(CURDIR)/rust/target LIBDIR = $(TARGET_DIR)/$(TARGET)/$(LIBR_POLARS_PROFILE) STATLIB = $(LIBDIR)/$(LIBNAME) -PKG_LIBS = -L$(LIBDIR) -lr_polars -lws2_32 -ladvapi32 -luserenv -lbcrypt -lole32 -lntdll -lpsapi -liphlpapi -lpdh -lpowrprof -loleaut32 -lnetapi32 -lsecur32 -t +PKG_LIBS = -L$(LIBDIR) -lr_polars -lws2_32 -ladvapi32 -luserenv -lbcrypt -lole32 -lntdll -lpsapi -liphlpapi -lpdh -lpowrprof -loleaut32 -lnetapi32 -lsecur32 -lsynchronization -t # Rtools42 doesn't have the linker in the location that cargo expects, so we # need to overwrite it via configuration. 
From 181d3eb688292c3d7e05e5d13f3c7160dd79f923 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 17:29:17 +0200 Subject: [PATCH 05/14] first $list$first() and $list$last() --- R/expr__list.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/expr__list.R b/R/expr__list.R index d6e2adb8b..e0b396dd9 100644 --- a/R/expr__list.R +++ b/R/expr__list.R @@ -200,7 +200,10 @@ ExprList_gather_every = function(n, offset = 0) { #' df$with_columns( #' first = pl$col("a")$list$first() #' ) -ExprList_first = function() .pr$Expr$list_get(self, wrap_e(0L, str_to_lit = FALSE)) +ExprList_first = function() { + .pr$Expr$list_get(self, 0, null_on_oob = TRUE) |> + unwrap("in $list$first():") +} #' Get the last value in a list #' @@ -211,7 +214,10 @@ ExprList_first = function() .pr$Expr$list_get(self, wrap_e(0L, str_to_lit = FALS #' df$with_columns( #' last = pl$col("a")$list$last() #' ) -ExprList_last = function() .pr$Expr$list_get(self, wrap_e(-1L, str_to_lit = FALSE)) +ExprList_last = function() { + .pr$Expr$list_get(self, -1, null_on_oob = TRUE) |> + unwrap("in $list$last():") +} #' Check if list contains a given value #' From 988726504b474f9cf24e968eba7da99ac6a78643 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 17:35:14 +0200 Subject: [PATCH 06/14] fix arg_sort_by --- R/functions__lazy.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/R/functions__lazy.R b/R/functions__lazy.R index 2cafe1caf..a127f393c 100644 --- a/R/functions__lazy.R +++ b/R/functions__lazy.R @@ -1240,6 +1240,8 @@ pl_arg_where = function(condition) { #' @param ... Column(s) to arg sort by. Can be Expr(s) or something coercible #' to Expr(s). Strings are parsed as column names. 
#' @inheritParams Expr_sort +#' @inheritParams Series_sort +#' @inheritParams LazyFrame_sort #' #' @return Expr #' @seealso [$arg_sort()][Expr_arg_sort()] to find the row indices that would @@ -1259,7 +1261,14 @@ pl_arg_where = function(condition) { #' df$with_columns( #' arg_sort_a = pl$arg_sort_by(pl$col("a") * -1) #' ) -pl_arg_sort_by = function(..., descending = FALSE) { +pl_arg_sort_by = function( + ..., + descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + maintain_order = FALSE + ) { + dots = list2(...) # The first argument must be a column, not columns @@ -1268,7 +1277,7 @@ pl_arg_sort_by = function(..., descending = FALSE) { dots = unlist(dots, recursive = FALSE) } - arg_sort_by(dots, descending) |> + arg_sort_by(dots, descending = descending, nulls_last = nulls_last, multithreaded = multithreaded, maintain_order = maintain_order) |> unwrap("in pl$arg_sort_by():") } From b097a01d605c55bc53ef50299dc1940e312f42f4 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 17:38:53 +0200 Subject: [PATCH 07/14] update error messages [skip ci] --- tests/testthat/test-expr_string.R | 4 ++-- tests/testthat/test-series-sub-namespace.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index dc15bf940..4c7c45900 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -494,7 +494,7 @@ test_that("str$count_matches", { expect_grepl_error( df$select(pl$col("foo")$str$count_matches(5)), - "data types don't match" + "invalid series dtype" ) df2 = pl$DataFrame(foo = c("hello", "hi there"), pat = c("ell", "e")) @@ -527,7 +527,7 @@ test_that("str$split", { expect_grepl_error( pl$DataFrame(pl$lit("42")$str$split(by = 42L, inclusive = TRUE)), - "data types don't match" + "invalid series dtype" ) expect_grepl_error( diff --git a/tests/testthat/test-series-sub-namespace.R b/tests/testthat/test-series-sub-namespace.R index 
1fc6a7cd8..275c364e6 100644 --- a/tests/testthat/test-series-sub-namespace.R +++ b/tests/testthat/test-series-sub-namespace.R @@ -107,6 +107,6 @@ test_that("$struct$fields", { ) expect_grepl_error( as_polars_series(1:3)$struct$fields, - "data types don't match" + "invalid series dtype" ) }) From 759b396cae751ed5506e4b2f05e2890761884cdb Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 12:09:54 -0400 Subject: [PATCH 08/14] fix test for group_by_dynamic() --- tests/testthat/test-datatype.R | 8 ++++---- tests/testthat/test-groupby.R | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-datatype.R b/tests/testthat/test-datatype.R index fd64b65a9..55b483f8f 100644 --- a/tests/testthat/test-datatype.R +++ b/tests/testthat/test-datatype.R @@ -63,14 +63,14 @@ test_that("POSIXct data conversion", { )$to_r() ) - non_exsitent_time_chr = "2020-03-08 02:00:00" + non_existent_time_chr = "2020-03-08 02:00:00" ambiguous_time_chr = "2020-11-01 01:00:00" expect_identical( - pl$lit(as.POSIXct(non_exsitent_time_chr))$to_r(), - as.POSIXct(non_exsitent_time_chr) + pl$lit(as.POSIXct(non_existent_time_chr))$to_r(), + as.POSIXct(non_existent_time_chr) ) expect_grepl_error( - pl$lit(non_exsitent_time_chr)$str$strptime(pl$Datetime(), "%F %T")$to_r(), + pl$lit(non_existent_time_chr)$str$strptime(pl$Datetime(), "%F %T")$to_r(), "non-existent" ) expect_grepl_error( diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 68ef7d2e8..9d789d6dd 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -441,7 +441,7 @@ test_that("group_by_dynamic for LazyFrame: arg 'offset' works", { "2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08" ), - n = c(3, 7, 5, 9, 2, 1) + n = c(3, 10, 5, 9, 2, 1) )$with_columns( pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() ) @@ -453,7 +453,7 @@ test_that("group_by_dynamic for LazyFrame: arg 'offset' works", { expect_equal( 
actual[, "n"], - c(5.5, 1) + c(6, 5.5, 1) ) }) From 7f61666537163b5e5e5a97baf7065f7c639f394f Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 15:11:32 -0400 Subject: [PATCH 09/14] style, redoc --- R/functions__lazy.R | 4 +--- man/pl_arg_sort_by.Rd | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/R/functions__lazy.R b/R/functions__lazy.R index a127f393c..ae26a65f6 100644 --- a/R/functions__lazy.R +++ b/R/functions__lazy.R @@ -1266,9 +1266,7 @@ pl_arg_sort_by = function( descending = FALSE, nulls_last = FALSE, multithreaded = TRUE, - maintain_order = FALSE - ) { - + maintain_order = FALSE) { dots = list2(...) # The first argument must be a column, not columns diff --git a/man/pl_arg_sort_by.Rd b/man/pl_arg_sort_by.Rd index 86cd764c0..bfe135b5c 100644 --- a/man/pl_arg_sort_by.Rd +++ b/man/pl_arg_sort_by.Rd @@ -4,13 +4,27 @@ \alias{pl_arg_sort_by} \title{Return the row indices that would sort the columns} \usage{ -pl_arg_sort_by(..., descending = FALSE) +pl_arg_sort_by( + ..., + descending = FALSE, + nulls_last = FALSE, + multithreaded = TRUE, + maintain_order = FALSE +) } \arguments{ \item{...}{Column(s) to arg sort by. Can be Expr(s) or something coercible to Expr(s). Strings are parsed as column names.} \item{descending}{A logical. If \code{TRUE}, sort in descending order.} + +\item{nulls_last}{A logical. If \code{TRUE}, place \code{null} values last instead of first.} + +\item{multithreaded}{A logical. If \code{TRUE}, sort using multiple threads.} + +\item{maintain_order}{Whether the order should be maintained if elements are +equal.
If \code{TRUE}, streaming is not possible and performance might be worse +since this requires a stable search.} } \value{ Expr From 8f05dfea1c39c365960a1f161777e34d37f1b101 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 15:34:22 -0400 Subject: [PATCH 10/14] redoc --- R/expr__array.R | 2 +- R/expr__list.R | 7 +++---- man/ExprArr_get.Rd | 5 +++++ man/ExprList_gather.Rd | 3 ++- man/ExprList_get.Rd | 7 ++----- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/R/expr__array.R b/R/expr__array.R index 6cc7f486a..7c36f4778 100644 --- a/R/expr__array.R +++ b/R/expr__array.R @@ -136,7 +136,7 @@ ExprArr_unique = function(maintain_order = FALSE) .pr$Expr$arr_unique(self, main #' #' This allows to extract one value per array only. #' -#' @inherit ExprList_get return +#' @inherit ExprList_get params return #' @param index An Expr or something coercible to an Expr, that must return a #' single index. Values are 0-indexed (so index 0 would return the first item #' of every sub-array) and negative values start from the end (index `-1` diff --git a/R/expr__list.R b/R/expr__list.R index e0b396dd9..65d85f117 100644 --- a/R/expr__list.R +++ b/R/expr__list.R @@ -114,9 +114,8 @@ ExprList_concat = function(other) { #' of every sublist) and negative values start from the end (index `-1` #' returns the last item). #' @param ... Ignored. -#' @param null_on_oob A logical to determine the behavior if an index is out of bounds: -#' - `TRUE` (default): set as `null` -#' - `FALSE`: raise an error +#' @param null_on_oob If `TRUE`, return `null` if an index is out of bounds. +#' Otherwise, raise an error. #' @return [Expr][Expr_class] #' @examples #' df = pl$DataFrame( @@ -144,7 +143,7 @@ ExprList_get = function(index, ..., null_on_oob = TRUE) { #' first item of every sublist) and negative values start from the end (index #' `-1` returns the last item). If the index is out of bounds, it will return #' a `null`. Strings are parsed as column names. 
-#' @param null_on_oob Return a `null` value if index is out of bounds. +#' @inheritParams ExprList_get #' #' @return Expr #' @aliases list_gather diff --git a/man/ExprArr_get.Rd b/man/ExprArr_get.Rd index bc199b2d2..aaba152e0 100644 --- a/man/ExprArr_get.Rd +++ b/man/ExprArr_get.Rd @@ -11,6 +11,11 @@ ExprArr_get(index, ..., null_on_oob = TRUE) single index. Values are 0-indexed (so index 0 would return the first item of every sub-array) and negative values start from the end (index \code{-1} returns the last item).} + +\item{...}{Ignored.} + +\item{null_on_oob}{If \code{TRUE}, return \code{null} if an index is out of bounds. +Otherwise, raise an error.} } \value{ \link[=Expr_class]{Expr} diff --git a/man/ExprList_gather.Rd b/man/ExprList_gather.Rd index c247b022d..ae6765620 100644 --- a/man/ExprList_gather.Rd +++ b/man/ExprList_gather.Rd @@ -14,7 +14,8 @@ first item of every sublist) and negative values start from the end (index \code{-1} returns the last item). If the index is out of bounds, it will return a \code{null}. Strings are parsed as column names.} -\item{null_on_oob}{Return a \code{null} value if index is out of bounds.} +\item{null_on_oob}{If \code{TRUE}, return \code{null} if an index is out of bounds. +Otherwise, raise an error.} } \value{ Expr diff --git a/man/ExprList_get.Rd b/man/ExprList_get.Rd index 1fcbcb831..58933a080 100644 --- a/man/ExprList_get.Rd +++ b/man/ExprList_get.Rd @@ -14,11 +14,8 @@ returns the last item).} \item{...}{Ignored.} -\item{null_on_oob}{A logical to determine the behavior if an index is out of bounds: -\itemize{ -\item \code{TRUE} (default): set as \code{null} -\item \code{FALSE}: raise an error -}} +\item{null_on_oob}{If \code{TRUE}, return \code{null} if an index is out of bounds. 
+Otherwise, raise an error.} } \value{ \link[=Expr_class]{Expr} From 5f9fb8155729237031b128d53ca6306444bb4f42 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 16:02:53 -0400 Subject: [PATCH 11/14] duplicated test --- tests/testthat/test-expr_expr.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index b5ef6d216..0b55cc156 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -999,8 +999,6 @@ test_that("sort_by", { expect_grepl_error(pl$lit(1:4)$sort_by(1)$to_r(), "different length") expect_grepl_error(pl$lit(1:4)$sort_by("blop")$to_r(), "field not found") - expect_grepl_error(pl$lit(1:4)$sort_by("blop")$to_r(), "field not found") - expect_grepl_error(pl$lit(1:4)$sort_by(df)$to_r(), "not convertible into.* Expr") expect_grepl_error(pl$lit(1:4)$sort_by(df)$to_r(), "not convertible into.* Expr") # this test is minimal, if polars give better documentation on behavior, expand the test. From 8bb0a4c60bbb27b7cc4ab4fabdbddd1a613f3732 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 16:03:39 -0400 Subject: [PATCH 12/14] missing arg renaming --- R/expr__string.R | 6 +++--- man/ExprStr_parse_int.Rd | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/expr__string.R b/R/expr__string.R index 5f625079d..38a5e3e74 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -870,7 +870,7 @@ ExprStr_explode = function() { #' #' @description Parse integers with base 2 by default. #' @keywords ExprStr -#' @param radix Positive integer which is the base of the string we are parsing. +#' @param base Positive integer which is the base of the string we are parsing. #' Default is 2. #' @param strict If `TRUE` (default), integer overflow will raise an error. #' Otherwise, they will be converted to `null`. 
@@ -883,8 +883,8 @@ ExprStr_explode = function() { #' # Convert to null if the string is not a valid integer when `strict = FALSE` #' df = pl$DataFrame(x = c("1", "2", "foo")) #' df$select(pl$col("x")$str$parse_int(10, FALSE)) -ExprStr_parse_int = function(radix = 2, strict = TRUE) { - .pr$Expr$str_parse_int(self, radix, strict) |> unwrap("in str$parse_int():") +ExprStr_parse_int = function(base = 2, strict = TRUE) { + .pr$Expr$str_parse_int(self, base, strict) |> unwrap("in str$parse_int():") } #' Returns string values in reversed order diff --git a/man/ExprStr_parse_int.Rd b/man/ExprStr_parse_int.Rd index 8df0b604d..fdf599e16 100644 --- a/man/ExprStr_parse_int.Rd +++ b/man/ExprStr_parse_int.Rd @@ -4,10 +4,10 @@ \alias{ExprStr_parse_int} \title{Parse integers with base radix from strings} \usage{ -ExprStr_parse_int(radix = 2, strict = TRUE) +ExprStr_parse_int(base = 2, strict = TRUE) } \arguments{ -\item{radix}{Positive integer which is the base of the string we are parsing. +\item{base}{Positive integer which is the base of the string we are parsing. Default is 2.} \item{strict}{If \code{TRUE} (default), integer overflow will raise an error. From 345c29679f093a5b66eee61ed655dd6647b98733 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 16:04:09 -0400 Subject: [PATCH 13/14] bump news --- NEWS.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ae46792a1..5033dba85 100644 --- a/NEWS.md +++ b/NEWS.md @@ -63,6 +63,7 @@ - In `$dt$convert_time_zone()` and `$dt$replace_time_zone()`, the `tz` argument is renamed to `time_zone` (#944). - In `$str$strptime()`, the argument `datatype` is renamed to `dtype` (#939). + - In `$str$parse_int()`, argument `radix` is renamed to `base` (#1034). 2. Change in the way arguments are passed: @@ -85,6 +86,8 @@ `$str$to_time()`, all arguments (except the first one) must be named (#939). 
- In `pl$date_range()`, the arguments `closed`, `time_unit`, and `time_zone` must be named (#950). + - In `$set_sorted()` and `$sort_by()`, argument `descending` must be named + (#1034). - In `pl$Series()`, using positional arguments throws a warning, since the argument positions will be changed in the future (#966). @@ -144,6 +147,7 @@ early stage of this package and does not exist in other language APIs (#1028). - The following deprecated functions are now removed: `pl$threadpool_size()`, `$with_row_count()`, `$with_row_count()` (#965). +- In `$group_by_dynamic()`, the first datapoint is always preserved (#1034). ### New features @@ -181,6 +185,9 @@ when a datetime doesn't exist. - `mapping_strategy` in `$over()` (#984, #988). - `raise_if_undetermined` in `$meta$output_name()` (#961). + - `null_on_oob` in `$arr$get()` and `$list$get()` to determine what happens + when the index is out of bounds (#1034). + - `nulls_last`, `multithreaded`, and `maintain_order` in `$sort_by()` (#1034). - Other: @@ -188,7 +195,7 @@ more classes to Series properly (#1015). - Export the `Duration` datatype (#955). - New active binding `$struct$fields` (#1002). - - rust-polars is updated to 0.38.3 (#937). + - rust-polars is updated to 0.39.0 (#937, #1034). ### Bug fixes From 77db6358404759bf9f2150bb2c7c38f5743b722d Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Sun, 14 Apr 2024 16:09:16 -0400 Subject: [PATCH 14/14] trailing ws --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 5033dba85..aca0f8967 100644 --- a/NEWS.md +++ b/NEWS.md @@ -86,7 +86,7 @@ `$str$to_time()`, all arguments (except the first one) must be named (#939). - In `pl$date_range()`, the arguments `closed`, `time_unit`, and `time_zone` must be named (#950). - - In `$set_sorted()` and `$sort_by()`, argument `descending` must be named + - In `$set_sorted()` and `$sort_by()`, argument `descending` must be named (#1034). 
- In `pl$Series()`, using positional arguments throws a warning, since the argument positions will be changed in the future (#966).