From 3e418dd05d4f84472bca4d80902e1b7476f0e0d4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 2 Dec 2024 13:13:16 -0500 Subject: [PATCH 01/78] Move make_strings_column benchmark to nvbench (#17340) Moves the `cpp/benchmarks/string/factory.cu` implementation from google-bench to nvbench. Also renames to `.cpp` by recoding without device code. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17340 --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/string/factory.cpp | 60 ++++++++++++++++++++ cpp/benchmarks/string/factory.cu | 92 ------------------------------- 3 files changed, 61 insertions(+), 94 deletions(-) create mode 100644 cpp/benchmarks/string/factory.cpp delete mode 100644 cpp/benchmarks/string/factory.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d3de9b39977..8e5ea900efa 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -360,8 +360,6 @@ ConfigureNVBench( # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- -ConfigureBench(STRINGS_BENCH string/factory.cu) - ConfigureNVBench( STRINGS_NVBENCH string/case.cpp @@ -377,6 +375,7 @@ ConfigureNVBench( string/copy_range.cpp string/count.cpp string/extract.cpp + string/factory.cpp string/filter.cpp string/find.cpp string/find_multiple.cpp diff --git a/cpp/benchmarks/string/factory.cpp b/cpp/benchmarks/string/factory.cpp new file mode 100644 index 00000000000..03870b0ae23 --- /dev/null +++ b/cpp/benchmarks/string/factory.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+static void bench_factory(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
+
+  data_profile const profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  auto const sv     = cudf::strings_column_view(column->view());
+
+  auto stream    = cudf::get_default_stream();
+  auto mr        = cudf::get_current_device_resource_ref();
+  auto d_strings = cudf::strings::detail::create_string_vector_from_column(sv, stream, mr);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = sv.chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    cudf::make_strings_column(d_strings, cudf::string_view{nullptr, 0});
+  });
+}
+
+NVBENCH_BENCH(bench_factory)
+  .set_name("factory")
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu
deleted file mode 100644
index c4e74c4d97e..00000000000
--- a/cpp/benchmarks/string/factory.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "string_bench_args.hpp"
-
-#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/strings/string_view.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-#include <thrust/execution_policy.h>
-#include <thrust/pair.h>
-#include <thrust/transform.h>
-
-#include <limits>
-
-namespace {
-using string_pair = thrust::pair<char const*, cudf::size_type>;
-struct string_view_to_pair {
-  __device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
-  {
-    return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()}
-                      : string_pair{nullptr, 0};
-  }
-};
-}  // namespace
-
-class StringsFactory : public cudf::benchmark {};
-
-static void BM_factory(benchmark::State& state)
-{
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
-  auto d_column     = cudf::column_device_view::create(column->view());
-  rmm::device_uvector<string_pair> pairs(d_column->size(), cudf::get_default_stream());
-  thrust::transform(thrust::device,
-                    d_column->pair_begin<cudf::string_view, true>(),
-                    d_column->pair_end<cudf::string_view, true>(),
-                    pairs.data(),
-                    string_view_to_pair{});
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    cudf::make_strings_column(pairs, cudf::get_default_stream());
-  }
-
-  cudf::strings_column_view input(column->view());
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
-
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 4;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
-}
-
-#define STRINGS_BENCHMARK_DEFINE(name)          \
-  BENCHMARK_DEFINE_F(StringsFactory, name)      \
-  (::benchmark::State & st) { BM_factory(st); } \
-  BENCHMARK_REGISTER_F(StringsFactory, name)    \
-    ->Apply(generate_bench_args)                \
-    ->UseManualTime()                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(factory)

From 5190b4460ba86151521de9f4415c5eb55781371e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell
Date: Mon, 2 Dec 2024 18:42:47 +0000
Subject: [PATCH 02/78] Temporarily skip tests due to dask/distributed#8953
 (#17472)

Temporarily skip tests failing due to an upstream dask change.
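For reference, a minimal, self-contained sketch of the runtime-xfail pattern this patch applies throughout the custreamz suite. The `DaskStream` class and `stream` fixture below are illustrative stand-ins for the real custreamz test fixtures, not part of the patch:

```python
import pytest


class DaskStream:
    """Illustrative stand-in for streamz.dask.DaskStream."""


@pytest.fixture(params=["core", "dask"])
def stream(request):
    # The real suite parametrizes each test over a plain Stream and a
    # DaskStream; this fixture mimics that shape with dummy objects.
    return DaskStream() if request.param == "dask" else object()


def test_example(request, stream):
    # Apply an xfail marker at runtime, but only for the Dask-backed
    # variant; the other parametrizations still run and must pass.
    request.applymarker(
        pytest.mark.xfail(
            isinstance(stream, DaskStream),
            reason="https://github.com/dask/distributed/issues/8953",
        )
    )
```

Using `request.applymarker` rather than a decorator lets the xfail condition depend on the fixture value each test actually received, and the marker block can be deleted wholesale once the upstream fix lands.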
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17472 --- .../custreamz/tests/test_dataframes.py | 56 ++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 8c0130d2818..6905044039c 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -216,7 +216,13 @@ def test_set_index(): assert_eq(b[0], df.set_index(df.y + 1)) -def test_binary_stream_operators(stream): +def test_binary_stream_operators(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) expected = df.x + df.y @@ -242,7 +248,13 @@ def test_index(stream): assert_eq(L[1], df.index + 5) -def test_pair_arithmetic(stream): +def test_pair_arithmetic(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -255,7 +267,13 @@ def test_pair_arithmetic(stream): assert_eq(cudf.concat(L), (df.x + df.y) * 2) -def test_getitem(stream): +def test_getitem(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -332,7 +350,13 @@ def test_repr_html(stream): assert "1" in html -def test_setitem(stream): +def test_setitem(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) sdf = DataFrame(example=df.iloc[:0], stream=stream) @@ -356,7 +380,13 @@ def test_setitem(stream): assert_eq(L[-1], df.mean()) -def test_setitem_overwrites(stream): +def test_setitem_overwrites(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) df = cudf.DataFrame({"x": list(range(10))}) sdf = DataFrame(example=df.iloc[:0], stream=stream) stream = sdf.stream @@ -413,8 +443,14 @@ def test_setitem_overwrites(stream): ], ) def test_rolling_count_aggregations( - op, window, m, pre_get, post_get, kwargs, stream + request, op, window, m, pre_get, post_get, kwargs, stream ): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream) and len(kwargs) == 0, + reason="https://github.com/dask/distributed/issues/8953", + ) + ) index = pd.DatetimeIndex( pd.date_range("2000-01-01", "2000-01-03", freq="1h") ) @@ -808,7 +844,13 @@ def test_reductions_with_start_state(stream): assert output2[0] == 360 -def test_rolling_aggs_with_start_state(stream): +def test_rolling_aggs_with_start_state(request, stream): + request.applymarker( + pytest.mark.xfail( + isinstance(stream, DaskStream), + reason="https://github.com/dask/distributed/issues/8953", + ) + ) example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64") sdf = DataFrame(stream, 
example=example) output0 = ( From 68848673e879436139484461508fab8c1b4d021a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 2 Dec 2024 11:49:38 -0800 Subject: [PATCH 03/78] Remove cudf._lib.replace in favor of inlining pylibcudf (#17428) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17428 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/replace.pyx | 193 -------------------- python/cudf/cudf/core/column/categorical.py | 10 +- python/cudf/cudf/core/column/column.py | 53 +++++- python/cudf/cudf/core/column/numerical.py | 8 +- python/cudf/cudf/core/column/string.py | 2 +- 7 files changed, 54 insertions(+), 214 deletions(-) delete mode 100644 python/cudf/cudf/_lib/replace.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8ed5d5b896c..de483b3070d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -25,7 +25,6 @@ set(cython_sources orc.pyx parquet.pyx reduce.pyx - replace.pyx round.pyx scalar.pyx sort.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index b71c5ea73d6..ee1bd13f2c4 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -13,7 +13,6 @@ orc, parquet, reduce, - replace, round, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx deleted file mode 100644 index b50c6dd25e3..00000000000 --- a/python/cudf/cudf/_lib/replace.pyx +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.api.types import is_scalar -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def replace(Column input_col, Column values_to_replace, - Column replacement_values): - """ - Replaces values from values_to_replace with corresponding value from - replacement_values in input_col - - Parameters - ---------- - input_col : Column whose value will be updated - values_to_replace : Column with values which needs to be replaced - replacement_values : Column with values which will replace - """ - - return Column.from_pylibcudf( - pylibcudf.replace.find_and_replace_all( - input_col.to_pylibcudf(mode="read"), - values_to_replace.to_pylibcudf(mode="read"), - replacement_values.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def replace_nulls_column(Column input_col, Column replacement_values): - """ - Replaces null values in input_col with corresponding values from - replacement_values - - Parameters - ---------- - input_col : Column whose value will be updated - replacement_values : Column with values which will replace nulls - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - replacement_values.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def replace_nulls_scalar(Column input_col, DeviceScalar replacement_value): - """ - Replaces null values in input_col with replacement_value - - Parameters - ---------- - input_col : Column whose value will be updated - replacement_value : DeviceScalar with value which will replace nulls - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - replacement_value.c_value, - ) - ) - - -@acquire_spill_lock() -def replace_nulls_fill(Column input_col, object method): - """ - Replaces null values in input_col with replacement_value - - Parameters - ---------- - input_col : Column whose value will be updated - method : 'ffill' or 'bfill' - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - pylibcudf.replace.ReplacePolicy.PRECEDING - if method == 'ffill' - else pylibcudf.replace.ReplacePolicy.FOLLOWING, - ) - ) - - -def replace_nulls( - Column input_col, - object replacement=None, - object method=None, - object dtype=None -): - """ - Calls one of the version of replace_nulls depending on type - of replacement - """ - - if replacement is None and method is None: - raise ValueError("Must specify a fill 'value' or 'method'.") - - if replacement and method: - raise ValueError("Cannot specify both 'value' and 'method'.") - - if method: - return replace_nulls_fill(input_col, method) - elif is_scalar(replacement): - return replace_nulls_scalar( - input_col, - as_device_scalar(replacement, dtype=dtype) - ) - else: - return replace_nulls_column(input_col, replacement) - - -@acquire_spill_lock() -def clamp(Column input_col, DeviceScalar lo, DeviceScalar hi): - """ - Clip the input_col such that values < lo will be replaced by lo - and > hi will be replaced by hi - - Parameters - ---------- - input_col : Column whose value will be updated - lo : DeviceScalar value for clipping lower values - hi : DeviceScalar value for clipping upper values - """ - return Column.from_pylibcudf( - pylibcudf.replace.clamp( - input_col.to_pylibcudf(mode="read"), - lo.c_value, - hi.c_value, - ) - ) - - 
-@acquire_spill_lock() -def clip(Column input_col, object lo, object hi): - """ - Clip the input_col such that values < lo will be replaced by lo - and > hi will be replaced by hi - """ - - lo_scalar = as_device_scalar(lo, dtype=input_col.dtype) - hi_scalar = as_device_scalar(hi, dtype=input_col.dtype) - - return clamp(input_col, lo_scalar, hi_scalar) - - -@acquire_spill_lock() -def normalize_nans_and_zeros_inplace(Column input_col): - """ - Inplace normalizing - """ - pylibcudf.replace.normalize_nans_and_zeros( - input_col.to_pylibcudf(mode="write"), inplace=True - ) - - -@acquire_spill_lock() -def normalize_nans_and_zeros_column(Column input_col): - """ - Returns a new normalized Column - """ - return Column.from_pylibcudf( - pylibcudf.replace.normalize_nans_and_zeros( - input_col.to_pylibcudf(mode="read") - ) - ) - - -def normalize_nans_and_zeros(Column input_col, in_place=False): - """ - Normalize the NaN and zeros in input_col - Convert -NaN -> NaN - Convert -0.0 -> 0.0 - - Parameters - ---------- - input_col : Column that needs to be normalized - in_place : boolean whether to normalize in place or return new column - """ - - if in_place is True: - normalize_nans_and_zeros_inplace(input_col) - else: - return normalize_nans_and_zeros_column(input_col) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7354b917f90..7551703c53e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -843,9 +843,9 @@ def values(self): """ raise NotImplementedError("cudf.Categorical is not yet implemented") - def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": + def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: return ( - self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) + self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) # type: ignore[return-value] ) def data_array_view( @@ -989,10 +989,8 @@ def find_and_replace( replacement_col = catmap._data["index"].astype(replaced.codes.dtype) replaced_codes = column.as_column(replaced.codes) - output = libcudf.replace.replace( - replaced_codes, to_replace_col, replacement_col - ) - codes = as_unsigned_codes(len(new_cats["cats"]), output) + output = replaced_codes.replace(to_replace_col, replacement_col) + codes = as_unsigned_codes(len(new_cats["cats"]), output) # type: ignore[arg-type] result = type(self)( data=self.data, # type: ignore[arg-type] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8ddfd4a54ae..d1938f47d66 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -236,8 +236,14 @@ def find_and_replace( ) -> Self: raise NotImplementedError - def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: - return libcudf.replace.clip(self, lo, hi) + @acquire_spill_lock() + def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: + plc_column = plc.replace.clamp( + self.to_pylibcudf(mode="read"), + cudf.Scalar(lo, self.dtype).device_value.c_value, + cudf.Scalar(hi, self.dtype).device_value.c_value, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: if self is other: @@ -685,6 +691,18 @@ def _validate_fillna_value( return cudf.Scalar(fill_value, dtype=self.dtype) return as_column(fill_value) + @acquire_spill_lock() + def replace( + self, values_to_replace: Self, replacement_values: Self + ) -> Self: + 
return type(self).from_pylibcudf( # type: ignore[return-value] + plc.replace.find_and_replace_all( + self.to_pylibcudf(mode="read"), + values_to_replace.to_pylibcudf(mode="read"), + replacement_values.to_pylibcudf(mode="read"), + ) + ) + def fillna( self, fill_value: ScalarLike | ColumnLike, @@ -703,11 +721,32 @@ def fillna( return self.copy() else: fill_value = self._validate_fillna_value(fill_value) - return libcudf.replace.replace_nulls( - input_col=self.nans_to_nulls(), - replacement=fill_value, - method=method, - )._with_type_metadata(self.dtype) + + if fill_value is None and method is None: + raise ValueError("Must specify a fill 'value' or 'method'.") + + if fill_value and method: + raise ValueError("Cannot specify both 'value' and 'method'.") + + input_col = self.nans_to_nulls() + + with acquire_spill_lock(): + if method: + plc_replace = ( + plc.replace.ReplacePolicy.PRECEDING + if method == "ffill" + else plc.replace.ReplacePolicy.FOLLOWING + ) + elif is_scalar(fill_value): + plc_replace = cudf.Scalar(fill_value).device_value.c_value + else: + plc_replace = fill_value.to_pylibcudf(mode="read") + plc_column = plc.replace.replace_nulls( + input_col.to_pylibcudf(mode="read"), + plc_replace, + ) + result = type(self).from_pylibcudf(plc_column) + return result._with_type_metadata(self.dtype) # type: ignore[return-value] def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index a7538c1c947..c8f859596b2 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -482,7 +482,7 @@ def find_and_replace( to_replace: ColumnLike, replacement: ColumnLike, all_nan: bool = False, - ) -> NumericalColumn: + ) -> Self: """ Return col with *to_replace* replaced with *value*. """ @@ -547,7 +547,7 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - replaced = self.astype(common_type) + replaced = cast(Self, self.astype(common_type)) df = cudf.DataFrame._from_data( { "old": to_replace_col.astype(common_type), @@ -563,9 +563,7 @@ def find_and_replace( ) df = df.dropna(subset=["old"]) - return libcudf.replace.replace( - replaced, df._data["old"], df._data["new"] - ) + return replaced.replace(df._data["old"], df._data["new"]) def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d45c76d3ddb..fa5f0dd99fa 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -6185,7 +6185,7 @@ def find_and_replace( df = df.dropna(subset=["old"]) else: res = self - return libcudf.replace.replace(res, df._data["old"], df._data["new"]) + return res.replace(df._data["old"], df._data["new"]) def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: if ( From d1bad33caef34b8fa95543c7494780f2084ee603 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 2 Dec 2024 21:48:26 +0000 Subject: [PATCH 04/78] Update the hook versions in pre-commit (#17462) The major change here is to move to ruff 0.8 which, among other things, introduces automatic sorting for `__all__` and `__slots__` (so I've turned those on and fixed things). 
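As a hedged illustration of the new `RUF` export sorting (the rule codes RUF022 for `__all__` and RUF023 for `__slots__` are cited from the ruff documentation, not from this diff), this is the kind of rewrite `ruff check --fix` performs; compare the sorted `__all__` in the `cudf/core/column/__init__.py` hunk later in this patch:

```python
# Before: ruff 0.8 flags this list under RUF022 (unsorted __all__).
__all__ = [
    "serialize_columns",
    "ColumnBase",
    "as_column",
]

# After `ruff check --fix`: an isort-style sort, so SCREAMING_CASE
# constants come first, then CamelCase class names, then lowercase
# names, each group alphabetical.
__all__ = [
    "ColumnBase",
    "as_column",
    "serialize_columns",
]
```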
Notable actual bug fix: https://github.com/rapidsai/cudf/commit/b2cfb9c88db13228a94628970c4c8c01a5527d56 Authors: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Jake Awe (https://github.com/AyodeAwe) - Nghia Truong (https://github.com/ttnghia) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/17462 --- .pre-commit-config.yaml | 14 ++--- cpp/src/lists/set_operations.cu | 2 +- pyproject.toml | 25 +++++++-- python/cudf/benchmarks/common/config.py | 4 +- python/cudf/benchmarks/conftest.py | 16 +++--- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/_fuzz_testing/fuzzer.py | 2 +- python/cudf/cudf/core/_base_index.py | 8 +-- python/cudf/cudf/core/buffer/spill_manager.py | 4 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/column/__init__.py | 48 ++++++++++++----- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/decimal.py | 11 ++-- python/cudf/cudf/core/column/interval.py | 3 +- python/cudf/cudf/core/column/lists.py | 3 +- python/cudf/cudf/core/column/numerical.py | 9 ++-- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 13 ++--- python/cudf/cudf/core/column/struct.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 6 ++- python/cudf/cudf/core/column_accessor.py | 6 +-- python/cudf/cudf/core/cut.py | 2 +- python/cudf/cudf/core/dataframe.py | 50 ++++++++++++------ python/cudf/cudf/core/dtypes.py | 18 +++---- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 4 +- python/cudf/cudf/core/index.py | 22 ++++---- python/cudf/cudf/core/indexed_frame.py | 42 +++++++-------- python/cudf/cudf/core/mixins/scans.py | 4 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/core/scalar.py | 2 +- python/cudf/cudf/core/series.py | 14 ++--- python/cudf/cudf/core/single_column_frame.py | 6 +-- python/cudf/cudf/core/udf/masked_typing.py | 4 +- python/cudf/cudf/datasets.py | 2 +- python/cudf/cudf/io/parquet.py | 7 +-- python/cudf/cudf/options.py | 2 +- python/cudf/cudf/pandas/__init__.py | 4 +- python/cudf/cudf/pandas/__main__.py | 2 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 6 +-- python/cudf/cudf/pandas/fast_slow_proxy.py | 6 +-- .../pandas/scripts/analyze-test-failures.py | 2 +- python/cudf/cudf/testing/dataset_generator.py | 4 +- python/cudf/cudf/testing/testing.py | 4 +- .../cudf/tests/series/test_datetimelike.py | 4 +- python/cudf/cudf/tests/test_binops.py | 12 ++--- python/cudf/cudf/tests/test_categorical.py | 6 +-- python/cudf/cudf/tests/test_concat.py | 6 +-- python/cudf/cudf/tests/test_csv.py | 4 +- .../cudf/tests/test_cuda_array_interface.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 28 +++++----- python/cudf/cudf/tests/test_feather.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 7 ++- python/cudf/cudf/tests/test_hdf.py | 2 +- python/cudf/cudf/tests/test_index.py | 14 ++--- python/cudf/cudf/tests/test_joining.py | 6 +-- python/cudf/cudf/tests/test_json.py | 14 ++--- python/cudf/cudf/tests/test_orc.py | 4 +- python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_quantiles.py | 2 +- python/cudf/cudf/tests/test_replace.py | 4 +- python/cudf/cudf/tests/test_reshape.py | 6 +-- python/cudf/cudf/tests/test_scalar.py | 4 +- 
python/cudf/cudf/tests/test_series.py | 6 +-- python/cudf/cudf/tests/test_setitem.py | 2 +- python/cudf/cudf/tests/test_spilling.py | 2 +- python/cudf/cudf/tests/test_string.py | 4 +- python/cudf/cudf/tests/test_testing.py | 2 +- .../cudf/cudf/tests/text/test_text_methods.py | 40 +++++++------- python/cudf/cudf/utils/ioutils.py | 2 +- python/cudf/cudf/utils/queryutils.py | 4 +- python/cudf/cudf/utils/utils.py | 2 +- .../cudf_pandas_tests/test_cudf_pandas.py | 4 +- .../tests/test_matplotlib.py | 2 +- .../tests/test_plotly.py | 2 +- .../tests/test_seaborn.py | 2 +- python/cudf_polars/cudf_polars/__init__.py | 2 +- .../cudf_polars/containers/__init__.py | 2 +- python/cudf_polars/cudf_polars/dsl/expr.py | 32 ++++++------ .../dsl/expressions/aggregation.py | 2 +- .../cudf_polars/dsl/expressions/base.py | 2 +- .../cudf_polars/dsl/expressions/boolean.py | 2 +- .../cudf_polars/dsl/expressions/rolling.py | 2 +- .../cudf_polars/dsl/expressions/selection.py | 2 +- .../cudf_polars/dsl/expressions/string.py | 2 +- .../cudf_polars/dsl/expressions/unary.py | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 52 +++++++++---------- .../cudf_polars/cudf_polars/dsl/traversal.py | 6 +-- .../cudf_polars/typing/__init__.py | 10 ++-- .../cudf_polars/cudf_polars/utils/dtypes.py | 16 +++--- python/cudf_polars/pyproject.toml | 1 + python/dask_cudf/dask_cudf/__init__.py | 20 +++---- .../dask_cudf/dask_cudf/_expr/collection.py | 9 ++-- python/dask_cudf/dask_cudf/core.py | 4 +- python/dask_cudf/dask_cudf/io/__init__.py | 5 +- python/dask_cudf/dask_cudf/io/parquet.py | 2 +- python/dask_cudf/dask_cudf/tests/test_core.py | 4 +- .../dask_cudf/tests/test_dispatch.py | 2 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 2 +- python/libcudf/libcudf/__init__.py | 2 + python/pylibcudf/pylibcudf/__init__.py | 6 +-- python/pylibcudf/pylibcudf/nvtext/__init__.py | 2 +- .../pylibcudf/pylibcudf/tests/io/test_csv.py | 2 +- 105 files changed, 431 insertions(+), 368 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37b26949804..39869b67547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude: | @@ -17,11 +17,11 @@ repos: ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.10.0' + rev: 'v1.13.0' hooks: - id: mypy additional_dependencies: [types-cachetools] @@ -33,7 +33,7 @@ repos: "python/dask_cudf/dask_cudf"] pass_filenames: false - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.9.1 hooks: - id: nbqa-isort # Use the cudf_kafka isort orderings in notebooks so that dask @@ -52,7 +52,7 @@ repos: ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks - rev: 0.6.6 + rev: 0.6.7 hooks: - id: fix-smartquotes exclude: | @@ -133,7 +133,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: [tomli] @@ -144,7 +144,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.8.0 hooks: - id: ruff args: ["--fix"] diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index c0bc10dd266..6f2acbb0712 100644 --- a/cpp/src/lists/set_operations.cu +++ 
b/cpp/src/lists/set_operations.cu
@@ -72,7 +72,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
   // - Generate labels for lhs and rhs child elements.
   // - Check existence for rows of the table {rhs_labels, rhs_child} in the table
   //   {lhs_labels, lhs_child}.
-  // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence reults
+  // - `reduce_by_key` with keys are rhs_labels and `logical_or` reduction on the existence results
   //   computed in the previous step.
 
   auto const lhs_child = lhs.get_sliced_child(stream);
diff --git a/pyproject.toml b/pyproject.toml
index 6933484f4e7..0c95ea60408 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,12 +18,13 @@ exclude = [
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,unparseable,falsy,couldn,Couldn"
+ignore-words-list = "inout,unparseable,falsy,couldn,Couldn,thirdparty"
 builtin = "clear"
 quiet-level = 3
 
 [tool.ruff]
 line-length = 79
+target-version = "py310"
 
 [tool.ruff.lint]
 typing-modules = ["cudf._typing"]
@@ -94,17 +95,35 @@ select = [
   "UP035",
   # usage of legacy `np.random` function calls
   "NPY002",
+  # Ruff-specific rules
+  "RUF",
 ]
 ignore = [
   # whitespace before :
   "E203",
   # line-too-long (due to Copyright header)
   "E501",
+  # type-comparison, disabled because we compare types to numpy dtypes
+  "E721",
+  # String contains ambiguous character
+  "RUF001",
+  # Parenthesize `a and b` expressions when chaining `and` and `or`
+  # together, to make the precedence clear
+  "RUF021",
+  # Mutable class attributes should be annotated with
+  # `typing.ClassVar`
+  "RUF012",
 ]
 fixable = ["ALL"]
 exclude = [
-    # TODO: Remove this in a follow-up where we fix __all__.
-    "__init__.py",
+    # TODO: https://github.com/rapidsai/cudf/issues/17461
+    "**/*.ipynb",
+]
+
+[tool.ruff.format]
+exclude = [
+    # TODO: https://github.com/rapidsai/cudf/issues/17461
+    "**/*.ipynb",
 ]
 
 [tool.ruff.lint.per-file-ignores]
diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py
index c1e9d4d6116..872ba424d20 100644
--- a/python/cudf/benchmarks/common/config.py
+++ b/python/cudf/benchmarks/common/config.py
@@ -42,9 +42,9 @@ def pytest_collection_modifyitems(session, config, items):
 
     items[:] = list(filter(is_pandas_compatible, items))
 else:
-    import cupy  # noqa: W0611, F401
+    import cupy  # noqa: F401
 
-    import cudf  # noqa: W0611, F401
+    import cudf  # noqa: F401
 
     def pytest_collection_modifyitems(session, config, items):
         pass
diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py
index 0e4afadccf5..24ff211387c 100644
--- a/python/cudf/benchmarks/conftest.py
+++ b/python/cudf/benchmarks/conftest.py
@@ -56,18 +56,16 @@
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -# Turn off isort until we upgrade to 5.8.0 -# https://github.com/pycqa/isort/issues/1594 -from config import ( # noqa: W0611, E402, F401 +from config import ( NUM_COLS, NUM_ROWS, - collect_ignore, - cudf, # noqa: W0611, E402, F401 - pytest_collection_modifyitems, - pytest_sessionfinish, - pytest_sessionstart, + collect_ignore, # noqa: F401 + cudf, + pytest_collection_modifyitems, # noqa: F401 + pytest_sessionfinish, # noqa: F401 + pytest_sessionstart, # noqa: F401 ) -from utils import ( # noqa: E402 +from utils import ( OrderedSet, collapse_fixtures, column_generators, diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 99b759e2166..843f2670b4d 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -99,6 +99,7 @@ __all__ = [ + "NA", "BaseIndex", "CategoricalDtype", "CategoricalIndex", @@ -114,7 +115,6 @@ "IntervalIndex", "ListDtype", "MultiIndex", - "NA", "NaT", "RangeIndex", "Scalar", diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py index ee1b2c1f1c4..4b080937a17 100644 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ b/python/cudf/cudf/_fuzz_testing/fuzzer.py @@ -95,7 +95,7 @@ def start(self): else: self._data_handler.set_rand_params(self.params) kwargs = self._data_handler._current_params["test_kwargs"] - logging.info(f"Parameters passed: {str(kwargs)}") + logging.info(f"Parameters passed: {kwargs!s}") self._target(file_name, **kwargs) except KeyboardInterrupt: logging.info( diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a6abd63d042..2df154ee112 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -133,7 +133,7 @@ def memory_usage(self, deep=False): """ raise NotImplementedError - def tolist(self): # noqa: D102 + def tolist(self): raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. 
Consider using " @@ -148,7 +148,7 @@ def name(self): raise NotImplementedError @property # type: ignore - def ndim(self) -> int: # noqa: D401 + def ndim(self) -> int: """Number of dimensions of the underlying data, by definition 1.""" return 1 @@ -265,7 +265,7 @@ def get_loc(self, key): slice(1, 3, None) >>> multi_index.get_loc(('b', 'e')) 1 - """ # noqa: E501 + """ def max(self): """The maximum value of the index.""" @@ -1473,7 +1473,7 @@ def _intersection(self, other, sort=None): ._data ) - if sort is {None, True} and len(other): + if sort in {None, True} and len(other): return intersection_result.sort_values() return intersection_result diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index ed351a6b107..07d0d698cb8 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -54,7 +54,7 @@ def get_rmm_memory_resource_stack( """ if hasattr(mr, "upstream_mr"): - return [mr] + get_rmm_memory_resource_stack(mr.upstream_mr) + return [mr, *get_rmm_memory_resource_stack(mr.upstream_mr)] return [mr] @@ -275,7 +275,7 @@ def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool: print( f"[WARNING] RMM allocation of {format_bytes(nbytes)} bytes " "failed, spill-on-demand couldn't find any device memory to " - f"spill:\n{repr(self)}\ntraceback:\n{get_traceback()}\n" + f"spill:\n{self!r}\ntraceback:\n{get_traceback()}\n" f"{self.statistics}" ) return False # Since we didn't find anything to spill, we give up diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b40c56c9a6b..7305ff651c6 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -366,7 +366,7 @@ def __str__(self) -> str: f"<{self.__class__.__name__} size={format_bytes(self._size)} " f"spillable={self.spillable} exposed={self.exposed} " f"num-spill-locks={len(self._spill_locks)} " - f"ptr={ptr_info} owner={repr(self._owner)}>" + f"ptr={ptr_info} owner={self._owner!r}>" ) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index a1e87d04bc9..0a9d339a6a8 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -1,9 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-""" -isort: skip_file -""" - from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, @@ -15,17 +11,43 @@ deserialize_columns, serialize_columns, ) -from cudf.core.column.datetime import DatetimeColumn # noqa: F401 -from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401 -from cudf.core.column.lists import ListColumn # noqa: F401 -from cudf.core.column.numerical import NumericalColumn # noqa: F401 -from cudf.core.column.string import StringColumn # noqa: F401 -from cudf.core.column.struct import StructColumn # noqa: F401 -from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import ( # noqa: F401 +from cudf.core.column.datetime import ( + DatetimeColumn, + DatetimeTZColumn, +) +from cudf.core.column.decimal import ( Decimal32Column, Decimal64Column, Decimal128Column, DecimalBaseColumn, ) +from cudf.core.column.interval import IntervalColumn +from cudf.core.column.lists import ListColumn +from cudf.core.column.numerical import NumericalColumn +from cudf.core.column.string import StringColumn +from cudf.core.column.struct import StructColumn +from cudf.core.column.timedelta import TimeDeltaColumn + +__all__ = [ + "CategoricalColumn", + "ColumnBase", + "DatetimeColumn", + "DatetimeTZColumn", + "Decimal32Column", + "Decimal64Column", + "Decimal128Column", + "DecimalBaseColumn", + "IntervalColumn", + "ListColumn", + "NumericalColumn", + "StringColumn", + "StructColumn", + "TimeDeltaColumn", + "as_column", + "build_column", + "column_empty", + "column_empty_like", + "concat_columns", + "deserialize_columns", + "serialize_columns", +] diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7551703c53e..cbbe01f7289 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -888,7 +888,7 @@ def find_and_replace( if len(replacement_col) == replacement_col.null_count: replacement_col = replacement_col.astype(self.categories.dtype) - if type(to_replace_col) != type(replacement_col): + if type(to_replace_col) is not type(replacement_col): raise TypeError( f"to_replace and value should be of same types," f"got to_replace dtype: {to_replace_col.dtype} and " diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 24b55fe1bc2..c9be3f239f9 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -18,6 +18,8 @@ import pylibcudf as plc import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import unary @@ -28,7 +30,7 @@ get_tz_data, ) from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, column, string +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import ( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index ce7aa91f775..ac9a2caad50 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -18,7 +18,8 @@ from cudf.api.types import is_scalar from cudf.core._internals import unary from cudf.core.buffer import 
as_buffer -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase +from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( Decimal32Dtype, Decimal64Dtype, @@ -28,8 +29,6 @@ from cudf.core.mixins import BinaryOperand from cudf.utils.utils import pa_mask_buffer_to_mask -from .numerical_base import NumericalBaseColumn - if TYPE_CHECKING: from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -435,7 +434,7 @@ def _get_decimal_type( `op` for the given dtypes. For precision & scale calculations see : https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 + """ # This should at some point be hooked up to libcudf's # binary_operation_fixed_point_scale @@ -506,8 +505,8 @@ def _get_decimal_type( # if we've reached this point, we cannot create a decimal type without # overflow; raise an informative error raise ValueError( - f"Performing {op} between columns of type {repr(lhs_dtype)} and " - f"{repr(rhs_dtype)} would result in overflow" + f"Performing {op} between columns of type {lhs_dtype!r} and " + f"{rhs_dtype!r} would result in overflow" ) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 9147270c289..34975fc94f4 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -7,7 +7,8 @@ import pyarrow as pa import cudf -from cudf.core.column import StructColumn, as_column +from cudf.core.column.column import as_column +from cudf.core.column.struct import StructColumn from cudf.core.dtypes import IntervalDtype if TYPE_CHECKING: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 42df5123014..789c4a7f3cb 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -13,11 +13,12 @@ import pylibcudf as plc import cudf +import cudf.core.column.column as column from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c8f859596b2..8ca42debb72 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -12,10 +12,13 @@ import pylibcudf import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import unary -from cudf.core.column import ColumnBase, as_column, column, string +from cudf.core.column.column import ColumnBase, as_column +from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError @@ -26,8 +29,6 @@ np_dtypes_to_pandas_dtypes, ) -from .numerical_base import NumericalBaseColumn - if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -226,7 +227,7 @@ def 
_binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # If `other` is a Python integer and it is out-of-bounds # promotion could fail but we can trivially define the result # in terms of `notnull` or `NULL_NOT_EQUALS`. - if type(other) is int and self.dtype.kind in "iu": # noqa: E721 + if type(other) is int and self.dtype.kind in "iu": truthiness = None iinfo = np.iinfo(self.dtype) if iinfo.min > other: diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 6d639337401..ea242e34edb 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -12,7 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fa5f0dd99fa..76d67585609 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -17,13 +17,14 @@ import cudf import cudf.api.types +import cudf.core.column.column as column +import cudf.core.column.datetime as datetime from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring @@ -548,7 +549,7 @@ def join( 2 3 c-d dtype: object - """ # noqa E501 + """ if sep is None: sep = "" @@ -694,7 +695,7 @@ def extract( The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. - """ # noqa W605 + """ if not _is_supported_regex_flags(flags): raise NotImplementedError( "unsupported value for `flags` parameter" @@ -830,7 +831,7 @@ def contains( value is set. The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. - """ # noqa W605 + """ if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") if regex and isinstance(pat, re.Pattern): @@ -3675,7 +3676,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - Some characters need to be escaped when passing in pat. e.g. ``'$'`` has a special meaning in regex and must be escaped when finding this literal character. 
- """ # noqa W605 + """ if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U pat = pat.pattern @@ -6160,7 +6161,7 @@ def find_and_replace( to_replace_col = column.as_column(to_replace) replacement_col = column.as_column(replacement) - if type(to_replace_col) != type(replacement_col): + if type(to_replace_col) is not type(replacement_col): raise TypeError( f"to_replace and value should be of same types," f"got to_replace dtype: {to_replace_col.dtype} and " diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2adc6b54bab..db6ad72ab56 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -8,7 +8,7 @@ import pyarrow as pa import cudf -from cudf.core.column import ColumnBase +from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 620fe31c30f..ccc9ef2b3f6 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -11,11 +11,13 @@ import pyarrow as pa import cudf +import cudf.core.column.column as column +import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_scalar from cudf.core._internals import unary from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, column, string +from cudf.core.column.column import ColumnBase from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import ( _all_bools_with_nulls, @@ -468,7 +470,7 @@ def components(self) -> dict[str, ColumnBase]: 2 13000 10 12 48 712 0 0 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 - """ # noqa: E501 + """ date_meta = { "seconds": ["m", "s"], diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 496e86ed709..e4fd82e819b 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -49,7 +49,7 @@ def from_zip(cls, data: abc.Iterator): def __getitem__(self, key): """Recursively apply dict.__getitem__ for nested elements.""" # As described in the pandas docs - # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # noqa: E501 + # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # accessing nested elements of a multiindex must be done using a tuple. # Lists and other sequences are treated as accessing multiple elements # at the top level of the index. 
@@ -62,10 +62,10 @@ def _to_flat_dict_inner(d: dict, parents: tuple = ()):
     for k, v in d.items():
         if not isinstance(v, d.__class__):
             if parents:
-                k = parents + (k,)
+                k = (*parents, k)
             yield (k, v)
         else:
-            yield from _to_flat_dict_inner(d=v, parents=parents + (k,))
+            yield from _to_flat_dict_inner(d=v, parents=(*parents, k))
 
 
 class ColumnAccessor(abc.MutableMapping):
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index a4d12cfc7f0..5bfea45a946 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -149,7 +149,7 @@ def cut(
     if len(set(bins)) is not len(bins):
         if duplicates == "raise":
             raise ValueError(
-                f"Bin edges must be unique: {repr(bins)}.\n"
+                f"Bin edges must be unique: {bins!r}.\n"
                 f"You can drop duplicate edges by setting the 'duplicates'"
                 "kwarg"
             )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b58ab13be93..fa8d517a9ef 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -13,7 +13,13 @@
 import textwrap
 import warnings
 from collections import abc, defaultdict
-from collections.abc import Callable, Iterator, MutableMapping
+from collections.abc import (
+    Callable,
+    Hashable,
+    Iterator,
+    MutableMapping,
+    Sequence,
+)
 from typing import TYPE_CHECKING, Any, Literal, cast
 
 import cupy
@@ -1131,7 +1137,7 @@ def _from_data(
         data: MutableMapping,
         index: BaseIndex | None = None,
         columns: Any = None,
-    ) -> DataFrame:
+    ) -> Self:
         out = super()._from_data(data=data, index=index)
         if columns is not None:
             out.columns = columns
@@ -2242,7 +2248,7 @@ def from_dict(
            n1  n2
         a  b    1   3
            c    2   4
-        """  # noqa: E501
+        """
         orient = orient.lower()
 
         if orient == "index":
@@ -2399,7 +2405,7 @@ def to_dict(
         >>> df.to_dict('records', into=dd)
         [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
          defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
-        """  # noqa: E501
+        """
         orient = orient.lower()
 
         if orient == "series":
@@ -3027,7 +3033,7 @@ def set_index(
         if len(keys) == 0:
             raise ValueError("No valid columns to be added to index.")
         if append:
-            keys = [self.index] + keys
+            keys = [self.index, *keys]
 
         # Preliminary type check
         labels_not_found = []
@@ -3093,7 +3099,7 @@ def set_index(
     @_performance_tracking
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
-    ):  # noqa: D102
+    ):
         if isinstance(value, (pd.Series, pd.DataFrame)):
             value = cudf.from_pandas(value)
         if isinstance(value, cudf.Series):
@@ -3574,7 +3580,7 @@ def drop_duplicates(
         1    Yum Yum   cup     4.0
         2    Indomie   cup     3.5
         4    Indomie  pack     5.0
-        """  # noqa: E501
+        """
         outdf = super().drop_duplicates(
             subset=subset,
             keep=keep,
@@ -4854,7 +4860,7 @@ def map(
 
         if na_action not in {"ignore", None}:
             raise ValueError(
-                f"na_action must be 'ignore' or None. Got {repr(na_action)}"
+                f"na_action must be 'ignore' or None. Got {na_action!r}"
             )
 
         if na_action == "ignore":
@@ -5727,7 +5733,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table:
         """
         data = self
 
-        index_descr = []
+        index_descr: Sequence[dict[str, Any]] | Sequence[str] = []
         write_index = preserve_index is not False
         keep_range_index = write_index and preserve_index is None
         index = self.index
@@ -5934,7 +5940,7 @@ def _from_arrays(
         index=None,
         columns=None,
         nan_as_null=False,
-    ):
+    ) -> Self:
         """
         Convert an object implementing an array interface to DataFrame.
@@ -5987,6 +5993,12 @@ def _from_arrays( raise ValueError("Duplicate column names are not allowed") names = columns + # Mapping/MutableMapping are invariant in the key type, so + # dict[int, ColumnBase] (the inferred type of ca_data) is not + # a valid type to pass to a function accepting + # Mapping[Hashable, ColumnBase] even though int is Hashable. + # See: https://github.com/python/typing/issues/445 + ca_data: dict[Hashable, ColumnBase] if array_data.ndim == 2: ca_data = { k: column.as_column(array_data[:, i], nan_as_null=nan_as_null) @@ -6133,7 +6145,7 @@ def quantile( non-numeric types and result is expected to be a Series in case of Pandas. cuDF will return a DataFrame as it doesn't support mixed types under Series. - """ # noqa: E501 + """ if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -6832,7 +6844,7 @@ def select_dtypes(self, include=None, exclude=None): 3 False 2.0 4 True 1.0 5 False 2.0 - """ # noqa: E501 + """ # code modified from: # https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L3196 @@ -7035,7 +7047,9 @@ def to_orc( ) @_performance_tracking - def stack(self, level=-1, dropna=no_default, future_stack=False): + def stack( + self, level=-1, dropna=no_default, future_stack=False + ) -> DataFrame | Series: """Stack the prescribed level(s) from columns to index Return a reshaped DataFrame or Series having a multi-level @@ -7282,11 +7296,13 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): ) if has_unnamed_levels: - unnamed_level_values = list( - map(column_name_idx.get_level_values, unnamed_levels_indices) - ) unnamed_level_values = pd.MultiIndex.from_arrays( - unnamed_level_values + list( + map( + column_name_idx.get_level_values, + unnamed_levels_indices, + ) + ) ) def unnamed_group_generator(): diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2110e610c37..801020664da 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -237,7 +237,7 @@ def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": >>> cudf_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) >>> cudf_dtype CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 + """ return CategoricalDtype( categories=dtype.categories, ordered=dtype.ordered ) @@ -254,7 +254,7 @@ def to_pandas(self) -> pd.CategoricalDtype: CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) >>> dtype.to_pandas() CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 + """ if self._categories is None: categories = None elif self._categories.dtype.kind == "f": @@ -399,7 +399,7 @@ def element_type(self) -> Dtype: ListDtype(float32) >>> deep_nested_type.element_type.element_type.element_type 'float32' - """ # noqa: E501 + """ if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) elif isinstance(self._typ.value_type, pa.StructType): @@ -420,7 +420,7 @@ def leaf_type(self): ListDtype(ListDtype(ListDtype(float32))) >>> deep_nested_type.leaf_type 'float32' - """ # noqa: E501 + """ if isinstance(self.element_type, ListDtype): return self.element_type.leaf_type else: @@ -486,7 +486,7 @@ def __eq__(self, other): def __repr__(self): if isinstance(self.element_type, (ListDtype, StructDtype)): - return f"{type(self).__name__}({repr(self.element_type)})" + return f"{type(self).__name__}({self.element_type!r})" else: return 
f"{type(self).__name__}({self.element_type})" @@ -556,7 +556,7 @@ class StructDtype(_BaseDtype): >>> nested_struct_dtype = cudf.StructDtype({"dict_data": struct_dtype, "c": "uint8"}) >>> nested_struct_dtype StructDtype({'dict_data': StructDtype({'a': dtype('int64'), 'b': dtype('O')}), 'c': dtype('uint8')}) - """ # noqa: E501 + """ name = "struct" @@ -730,7 +730,7 @@ def itemsize(self): >>> decimal{size}_dtype = cudf.Decimal{size}Dtype(precision=9, scale=2) >>> decimal{size}_dtype Decimal{size}Dtype(precision=9, scale=2) - """ # noqa: E501 + """ ) @@ -743,7 +743,7 @@ def __init__(self, precision, scale=0): @property def str(self): - return f"{str(self.name)}({self.precision}, {self.scale})" + return f"{self.name!s}({self.precision}, {self.scale})" @property def precision(self): @@ -950,7 +950,7 @@ def __eq__(self, other): # This means equality isn't transitive but mimics pandas return other in (self.name, str(self)) return ( - type(self) == type(other) + type(self) is type(other) and self.subtype == other.subtype and self.closed == other.closed ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0c0f271fe6f..70789160cb6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1392,7 +1392,7 @@ def argsort( >>> idx = cudf.Index([3, 1, 2]) >>> idx.argsort() array([1, 2, 0], dtype=int32) - """ # noqa: E501 + """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") if kind != "quicksort": diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e977f037b79..29ab3b60d9d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1467,9 +1467,7 @@ def _iterative_groupby_apply( RuntimeWarning, ) - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] + chunks = [grouped_values[s:e] for s, e in itertools.pairwise(offsets)] chunk_results = [function(chk, *args) for chk in chunks] return self._post_process_chunk_results( chunk_results, group_names, group_keys, grouped_values diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index ff9cd310aef..eac04cf36ec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1619,7 +1619,7 @@ def argsort( Returns ------- cupy.ndarray: The indices sorted based on input. 
- """ # noqa: E501 + """ return super().argsort( axis=axis, kind=kind, @@ -2218,7 +2218,7 @@ def year(self) -> Index: DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') >>> datetime_index.year Index([2000, 2001, 2002], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.year, name=self.name) @property # type: ignore @@ -2237,7 +2237,7 @@ def month(self) -> Index: DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') >>> datetime_index.month Index([1, 2, 3], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.month, name=self.name) @property # type: ignore @@ -2256,7 +2256,7 @@ def day(self) -> Index: DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') >>> datetime_index.day Index([1, 2, 3], dtype='int16') - """ # noqa: E501 + """ return Index._from_column(self._column.day, name=self.name) @property # type: ignore @@ -2340,7 +2340,7 @@ def microsecond(self) -> Index: dtype='datetime64[ns]') >>> datetime_index.microsecond Index([0, 1, 2], dtype='int32') - """ # noqa: E501 + """ return Index._from_column( ( # Need to manually promote column to int32 because @@ -2615,7 +2615,7 @@ def ceil(self, freq: str) -> Self: ... ]) >>> gIndex.ceil("T") DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column(self._column.ceil(freq), name=self.name) @_performance_tracking @@ -2646,7 +2646,7 @@ def floor(self, freq: str) -> Self: ... ]) >>> gIndex.floor("T") DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column( self._column.floor(freq), name=self.name ) @@ -2686,7 +2686,7 @@ def round(self, freq: str) -> Self: DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') - """ # noqa: E501 + """ return type(self)._from_column( self._column.round(freq), name=self.name ) @@ -2737,7 +2737,7 @@ def tz_localize( ``ambiguous`` and ``nonexistent`` arguments. Any ambiguous or nonexistent timestamps are converted to 'NaT'. - """ # noqa: E501 + """ result_col = self._column.tz_localize(tz, ambiguous, nonexistent) return DatetimeIndex._from_column( result_col, name=self.name, freq=self._freq @@ -2774,7 +2774,7 @@ def tz_convert(self, tz: str | None) -> Self: '2018-03-02 14:00:00+00:00', '2018-03-03 14:00:00+00:00'], dtype='datetime64[ns, Europe/London]') - """ # noqa: E501 + """ result_col = self._column.tz_convert(tz) return DatetimeIndex._from_column(result_col, name=self.name) @@ -3118,7 +3118,7 @@ class CategoricalIndex(Index): >>> cudf.CategoricalIndex( ... 
data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") CategoricalIndex([1, 2, 3, <NA>], categories=[1, 2, 3], ordered=False, dtype='category', name='a') - """ # noqa: E501 + """ @_performance_tracking def __init__( diff --git a/python/cudf/cudf/core/indexed_frame.py index 2f8c2587937..21ac009e7ff 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -607,7 +607,7 @@ def copy(self, deep: bool = True) -> Self: ) @_performance_tracking - def equals(self, other) -> bool: # noqa: D102 + def equals(self, other) -> bool: return super().equals(other) and self.index.equals(other.index) @property @@ -5474,7 +5474,7 @@ def groupby( ), ) ) - def add(self, other, axis, level=None, fill_value=None): # noqa: D102 + def add(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5515,7 +5515,7 @@ def add(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 + def radd(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5556,7 +5556,7 @@ def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 + def subtract(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5599,7 +5599,7 @@ def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rsub(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5640,7 +5640,7 @@ def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 + def multiply(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5683,7 +5683,7 @@ def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rmul(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5724,7 +5724,7 @@ def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 + def mod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5765,7 +5765,7 @@ def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rmod(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5806,7 +5806,7 @@ def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 + def pow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@
-5847,7 +5847,7 @@ def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rpow(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5888,7 +5888,7 @@ def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def floordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5929,7 +5929,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rfloordiv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -5970,7 +5970,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def truediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -6015,7 +6015,7 @@ def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 + def rtruediv(self, other, axis, level=None, fill_value=None): if level is not None: raise NotImplementedError("level parameter is not supported yet.") @@ -6059,7 +6059,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 ), ) ) - def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def eq(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) @@ -6099,7 +6099,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def ne(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) @@ -6139,7 +6139,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def lt(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) @@ -6179,7 +6179,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def le(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__le__", fill_value=fill_value, can_reindex=True ) @@ -6219,7 +6219,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 + def gt(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) @@ -6259,7 +6259,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 ), ) ) - def ge(self, other, axis="columns", level=None, 
fill_value=None): # noqa: D102 + def ge(self, other, axis="columns", level=None, fill_value=None): return self._binaryop( other=other, op="__ge__", fill_value=fill_value, can_reindex=True ) diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py index b0f606e32e6..289fcb84d91 100644 --- a/python/cudf/cudf/core/mixins/scans.py +++ b/python/cudf/cudf/core/mixins/scans.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from .mixin_factory import _create_delegating_mixin @@ -12,5 +12,5 @@ "cumprod", "cummin", "cummax", - }, # noqa: E231 + }, ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 19a53af018d..173d4e1c584 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -567,7 +567,7 @@ def levels(self) -> list[cudf.Index]: names=['a', 'b']) >>> midx.levels [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] - """ # noqa: E501 + """ return [ idx.rename(name) for idx, name in zip(self._levels, self.names) ] diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 016bd1225cd..f37b44b1100 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1013,7 +1013,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._column_names, + level_names=(None, *columns._column_names), verify=False, ) return cudf.DataFrame._from_data(ca, index=index_labels) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f6331aa1f49..80dd0921f9c 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -304,7 +304,7 @@ def __repr__(self): # https://github.com/numpy/numpy/issues/17552 return ( f"{self.__class__.__name__}" - f"({str(self.value)}, dtype={self.dtype})" + f"({self.value!s}, dtype={self.dtype})" ) def _binop_result_dtype_or_error(self, other, op): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 95ea22b5ad5..928f3c3d666 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None): 3 a dtype: category Categories (3, object): ['a', 'b', 'c'] - """ # noqa: E501 + """ col = as_column(categorical) if codes is not None: codes = as_column(codes) @@ -942,7 +942,7 @@ def drop( labels, axis, index, columns, level, inplace, errors ) - def tolist(self): # noqa: D102 + def tolist(self): raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " @@ -1087,7 +1087,7 @@ def reindex( DataFrame, followed by the original Series values. When `drop` is True, a `Series` is returned. In either case, if ``inplace=True``, no value is returned. 
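The `str()` and `repr()` calls stripped out of f-strings in the hunks above and below have direct spellings as conversion flags; a quick sketch with illustrative values:

```python
# !s and !r are the built-in f-string equivalents of str() and repr().
value, element_type = 42, "int64"

assert f"({value!s})" == f"({str(value)})" == "(42)"
assert f"({element_type!r})" == f"({repr(element_type)})" == "('int64')"
```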
-""", # noqa: E501 +""", example=""" >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) >>> series @@ -1196,7 +1196,7 @@ def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame: 12 c 13 15 d - """ # noqa: E501 + """ return self._to_frame(name=name, index=self.index) @_performance_tracking @@ -2122,7 +2122,7 @@ def data(self): >>> np.array(series.data.memoryview()) array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) - """ # noqa: E501 + """ return self._column.data @property # type: ignore @@ -4590,7 +4590,7 @@ def is_month_end(self) -> Series: 7 False 8 False dtype: bool - """ # noqa: E501 + """ return self._return_result_like_self(self.series._column.is_month_end) @property # type: ignore @@ -5169,7 +5169,7 @@ def components(self) -> cudf.DataFrame: 2 13000 10 12 48 712 0 0 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 - """ # noqa: E501 + """ ca = ColumnAccessor(self.series._column.components(), verify=False) return self.series._constructor_expanddim._from_data( ca, index=self.series.index diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 0e66f383ca0..f6d0664758f 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -83,7 +83,7 @@ def name(self, value): @property # type: ignore @_performance_tracking - def ndim(self) -> int: # noqa: D401 + def ndim(self) -> int: """Number of dimensions of the underlying data, by definition 1.""" return 1 @@ -105,12 +105,12 @@ def _column(self) -> ColumnBase: @property # type: ignore @_performance_tracking - def values(self) -> cupy.ndarray: # noqa: D102 + def values(self) -> cupy.ndarray: return self._column.values @property # type: ignore @_performance_tracking - def values_host(self) -> numpy.ndarray: # noqa: D102 + def values_host(self) -> numpy.ndarray: return self._column.values_host @classmethod diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py index 4c90c5bbba0..3a1e01caf28 100644 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ b/python/cudf/cudf/core/udf/masked_typing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import operator @@ -50,7 +50,7 @@ SUPPORTED_NUMPY_TYPES = ( NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES ) -supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) +supported_type_str = "\n".join(sorted([*list(SUPPORTED_NUMPY_TYPES), "bool"])) _units = ["ns", "ms", "us", "s"] _datetime_cases = {types.NPDatetime(u) for u in _units} diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index dbabaacf6b5..e8d634598f4 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -6,7 +6,7 @@ import cudf from cudf._lib.transform import bools_to_mask -__all__ = ["timeseries", "randomdata"] +__all__ = ["randomdata", "timeseries"] # TODO: diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 750c6cec180..2382e9f12ed 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1062,10 +1062,7 @@ def to_parquet( ) partition_info = ( - [ - (i, j - i) - for i, j in zip(partition_offsets, partition_offsets[1:]) - ] + [(i, j - i) for i, j in itertools.pairwise(partition_offsets)] if partition_offsets is not None else None ) @@ -1485,7 +1482,7 @@ def write_table(self, df): ) existing_cw_batch = defaultdict(dict) new_cw_paths = [] - partition_info = [(i, j - i) for i, j in zip(offsets, offsets[1:])] + partition_info = [(i, j - i) for i, j in itertools.pairwise(offsets)] for path, part_info, meta_path in zip( paths, diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index e206c8bca08..79a3a794af3 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -380,7 +380,7 @@ class option_context(ContextDecorator): >>> from cudf import option_context >>> with option_context('mode.pandas_compatible', True, 'default_float_bitwidth', 32): ... pass - """ # noqa: E501 + """ def __init__(self, *args) -> None: if len(args) % 2 != 0: diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index bacf1f7e77b..fec181e85d7 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -12,7 +12,7 @@ from .magics import load_ipython_extension from .profiler import Profiler -__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"] +__all__ = ["Profiler", "install", "is_proxy_object", "load_ipython_extension"] LOADED = False @@ -57,7 +57,7 @@ def install(): current_mr = rmm.mr.get_current_device_resource() if not isinstance(current_mr, rmm.mr.CudaMemoryResource): warnings.warn( - f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={rmm_mode!s}", UserWarning, ) return diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index e0d3d9101a9..619ee822a54 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -96,7 +96,7 @@ def main(): (module,) = args.module # run the module passing the remaining arguments # as if it were run with python -m - sys.argv[:] = [module] + args.args # not thread safe? + sys.argv[:] = [module, *args.args] # not thread safe? 
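The `zip(offsets[:-1], offsets[1:])` rewrites above lean on `itertools.pairwise` (available since Python 3.10) yielding the same successive pairs without materializing two throwaway slices; a small sketch with an illustrative offsets list:

```python
import itertools

offsets = [0, 3, 7, 12]  # illustrative partition offsets

# Old form: pair each offset with its successor via slicing.
old = [(i, j - i) for i, j in zip(offsets[:-1], offsets[1:])]

# New form: identical (offset, length) pairs, one pass, no copies.
new = [(i, j - i) for i, j in itertools.pairwise(offsets)]

assert old == new == [(0, 3), (3, 4), (7, 5)]
```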
runpy.run_module(module, run_name="__main__") elif len(args.args) >= 1: # Remove ourself from argv and continue diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 05e7d159c63..e763875adb8 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -77,8 +77,8 @@ def _pandas_util_dir(): # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py res = list( set( - list(importlib.import_module("pandas.util").__dict__.keys()) - + [ + [ + *list(importlib.import_module("pandas.util").__dict__.keys()), "Appender", "Substitution", "_exceptions", @@ -219,7 +219,7 @@ def Timestamp_Timedelta__new__(cls, *args, **kwargs): def _DataFrame__dir__(self): # Column names that are string identifiers are added to the dir of the # DataFrame - # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 # noqa: E501 + # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 _pd_df_dir = dir(pd.DataFrame) return _pd_df_dir + [ colname diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 40893ee2614..d32d388b975 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -247,7 +247,7 @@ def _fsproxy_state(self) -> _State: if metaclasses: metaclass = types.new_class( # type: ignore f"{name}_Meta", - metaclasses + (_FastSlowProxyMeta,), + (*metaclasses, _FastSlowProxyMeta), {}, ) cls = types.new_class( @@ -1301,7 +1301,7 @@ def _replace_closurevars( return functools.update_wrapper( g, f, - assigned=functools.WRAPPER_ASSIGNMENTS + ("__kwdefaults__",), + assigned=(*functools.WRAPPER_ASSIGNMENTS, "__kwdefaults__"), ) diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index bb2fc00d9fc..e4ee0ce1ca4 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -41,7 +41,7 @@ def count_failures(log_file_name, pattern): PANDAS_TEST_PREFIX ) if fnmatch(line_module_name, pattern): - if "longrepr" in line and line["longrepr"]: + if line.get("longrepr"): if isinstance(line["longrepr"], (tuple, list)): message = line["longrepr"][2].splitlines()[0] elif isinstance(line["longrepr"], str): diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 99b686406fb..01a75a2efb0 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -237,9 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + rng = np.random.default_rng(seed=parameters.seed) else: - rng = np.random.default_rng(seed=0) # noqa: F841 + rng = np.random.default_rng(seed=0) # For each column, invoke the data generator for column_params in parameters.column_parameters: diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 8d342f8e6c6..0b09cf7dc34 
100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -77,7 +77,7 @@ def _check_types( ): return - if type(left) != type(right): + if type(left) is not type(right): raise_assert_detail( obj, "Class types are different", f"{type(left)}", f"{type(right)}" ) @@ -149,7 +149,7 @@ def assert_column_equal( ): pass else: - if type(left) != type(right) or left.dtype != right.dtype: + if type(left) is not type(right) or left.dtype != right.dtype: msg1 = f"{left.dtype}" msg2 = f"{right.dtype}" raise_assert_detail(obj, "Dtypes are different", msg1, msg2) diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py index 691da224f44..81ba61b31dc 100644 --- a/python/cudf/cudf/tests/series/test_datetimelike.py +++ b/python/cudf/cudf/tests/series/test_datetimelike.py @@ -57,7 +57,7 @@ def test_localize_ambiguous(request, unit, zone_name): request.applymarker( pytest.mark.xfail( condition=(zone_name == "America/Metlakatla"), - reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", # noqa: E501 + reason="https://www.timeanddate.com/news/time/metlakatla-quits-dst.html", ) ) s = cudf.Series( @@ -83,7 +83,7 @@ def test_localize_nonexistent(request, unit, zone_name): request.applymarker( pytest.mark.xfail( condition=(zone_name == "America/Grand_Turk"), - reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", # noqa: E501 + reason="https://www.worldtimezone.com/dst_news/dst_news_turkscaicos03.html", ) ) s = cudf.Series( diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 71b6bbd688d..0712a0de635 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -167,11 +167,11 @@ _operators_arithmetic = _operators_arithmetic[:1] _operators_comparison = _operators_comparison[:1] _cudf_scalar_reflected_ops = _cudf_scalar_reflected_ops[:1] - DATETIME_TYPES = {"datetime64[ms]"} # noqa: F811 - NUMERIC_TYPES = {"float32"} # noqa: F811 - FLOAT_TYPES = {"float64"} # noqa: F811 - INTEGER_TYPES = {"int16"} # noqa: F811 - TIMEDELTA_TYPES = {"timedelta64[s]"} # noqa: F811 + DATETIME_TYPES = {"datetime64[ms]"} + NUMERIC_TYPES = {"float32"} + FLOAT_TYPES = {"float64"} + INTEGER_TYPES = {"int16"} + TIMEDELTA_TYPES = {"timedelta64[s]"} # To save time, we skip tests marked "pytest.mark.xfail" pytest_xfail = pytest.mark.skipif @@ -444,7 +444,7 @@ def test_str_series_compare_num_reflected( @pytest.mark.parametrize("obj_class", ["Series", "Index"]) @pytest.mark.parametrize("nelem", [1, 2, 100]) @pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES + ["datetime64[ms]"]) +@pytest.mark.parametrize("dtype", [*utils.NUMERIC_TYPES, "datetime64[ms]"]) @pytest.mark.parametrize("use_cudf_scalar", [True, False]) def test_series_compare_scalar( nelem, cmpop, obj_class, dtype, use_cudf_scalar diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index db41f689255..db24fdd2a29 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -98,7 +98,7 @@ def test_categorical_compare_unordered(): # test equal out = sr == sr assert out.dtype == np.bool_ - assert type(out[0]) == np.bool_ + assert type(out[0]) is np.bool_ assert np.all(out.to_numpy()) assert np.all(pdsr == pdsr) @@ -134,7 +134,7 @@ def test_categorical_compare_ordered(): # test equal out = sr1 == sr1 assert out.dtype == np.bool_ - assert 
type(out[0]) == np.bool_ + assert type(out[0]) is np.bool_ assert np.all(out.to_numpy()) assert np.all(pdsr1 == pdsr1) @@ -768,7 +768,7 @@ def test_categorical_setitem_with_nan(): assert_eq(gs, expected_series) -@pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) +@pytest.mark.parametrize("dtype", [*list(NUMERIC_TYPES), "object"]) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): dtype = cudf.dtype(dtype) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index ab0f1767cd6..f57f256d55c 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -625,7 +625,7 @@ def test_concat_series_dataframe_input_str(objs): ) @pytest.mark.parametrize("ignore_index", [True, False]) def test_concat_empty_dataframes(df, other, ignore_index): - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1224,7 +1224,7 @@ def test_concat_join_empty_dataframes( request, df, other, ignore_index, join, sort ): axis = 0 - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] @@ -1312,7 +1312,7 @@ def test_concat_join_empty_dataframes_axis_1( df, other, ignore_index, axis, join, sort ): # no duplicate columns - other_pd = [df] + other + other_pd = [df, *other] gdf = cudf.from_pandas(df) other_gd = [gdf] + [cudf.from_pandas(o) for o in other] diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index ac772c47e3a..e18112d03ea 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -916,10 +916,10 @@ def test_csv_reader_nrows(tmpdir): str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows ) assert df.shape == (read_rows, 2) - assert str(skip_rows) in list(df)[0] + assert str(skip_rows) in next(iter(df)) assert str(2 * skip_rows) in list(df)[1] for row in range(0, read_rows // sample_skip, sample_skip): - assert df[list(df)[0]][row] == row + skip_rows + 1 + assert df[next(iter(df))][row] == row + skip_rows + 1 assert df[list(df)[1]][row] == 2 * (row + skip_rows + 1) assert df[list(df)[1]][read_rows - 1] == 2 * (read_rows + skip_rows) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 29f2f46e3c7..381ca45de31 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -170,7 +170,7 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): # CuPy array a = cudf.Series(cupy.asarray([1, 2, 3]))._column a = cudf.core.column.as_column(a) - b = cupy.asarray([1, 1, 1]) # noqa: F841 + b = cupy.asarray([1, 1, 1]) assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) a = cudf.Series(cupy.asarray([1, 2, 3]))._column diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 509ee0d65a5..d04fd97dcbd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -56,9 +56,9 @@ # If spilling is enabled globally, we skip many test permutations # to reduce running time. 
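The `type(...) == ...` comparisons rewritten to `is` throughout these tests follow ruff's E721: classes are singleton objects, so identity is the precise check, and unlike `==` it cannot be intercepted by a metaclass. A self-contained sketch (the `Weird` class is illustrative):

```python
import numpy as np

out = np.bool_(True)
assert type(out) is np.bool_  # identity: the very same class object


class AnythingEqual(type):
    # A metaclass can make == lie; `is` cannot be overridden.
    def __eq__(cls, other):
        return True

    __hash__ = type.__hash__


class Weird(metaclass=AnythingEqual):
    pass


assert type(Weird()) == int      # misleadingly "equal"
assert type(Weird()) is not int  # identity reports the truth
```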
if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 + ALL_TYPES = ["float32"] + DATETIME_TYPES = ["datetime64[ms]"] + NUMERIC_TYPES = ["float32"] # To save time, we skip tests marked "xfail" pytest_xfail = pytest.mark.skipif @@ -452,8 +452,8 @@ def test_dataframe_basic(): df = cudf.concat([df, df2]) assert len(df) == 11 - hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) - hvals = np.asarray(rnd_vals.tolist() + [321]) + hkeys = np.asarray([*np.arange(10, dtype=np.float64).tolist(), 123]) + hvals = np.asarray([*rnd_vals.tolist(), 321]) np.testing.assert_equal(df["keys"].to_numpy(), hkeys) np.testing.assert_equal(df["vals"].to_numpy(), hvals) @@ -1118,7 +1118,7 @@ def test_dataframe_to_string_wide(monkeypatch): 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 ... 2 2 2 2 2 2 2 2 - [3 rows x 100 columns]""" # noqa: E501 + [3 rows x 100 columns]""" ) assert got == expect @@ -2197,7 +2197,7 @@ def test_dataframe_shape_empty(): @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 20]) -@pytest.mark.parametrize("dtype", dtypes + ["object"]) +@pytest.mark.parametrize("dtype", [*dtypes, "object"]) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # In case of `bool` dtype: pandas <= 1.2.5 type-casts @@ -2842,7 +2842,7 @@ def test_arrow_round_trip(preserve_index, index): assert_eq(gdf_out, pdf_out) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) def test_cuda_array_interface(dtype): np_data = np.arange(10).astype(dtype) cupy_data = cupy.array(np_data) @@ -3707,7 +3707,7 @@ def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): assert result._data.names == tuple(result._data.keys()) -@pytest.mark.parametrize("dtype", dtypes + ["category"]) +@pytest.mark.parametrize("dtype", [*dtypes, "category"]) def test_dataframe_0_row_dtype(dtype): if dtype == "category": data = pd.Series(["a", "b", "c", "d", "e"], dtype="category") @@ -7910,10 +7910,10 @@ def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): with _hide_concat_empty_dtype_warning(): expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index + [pdf, *other_pd], sort=sort, ignore_index=ignore_index ) actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index + [gdf, *other_gd], sort=sort, ignore_index=ignore_index ) # In some cases, Pandas creates an empty Index([], dtype="object") for @@ -8026,10 +8026,10 @@ def test_dataframe_concat_lists(df, other, sort, ignore_index): with _hide_concat_empty_dtype_warning(): expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index + [pdf, *other_pd], sort=sort, ignore_index=ignore_index ) actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index + [gdf, *other_gd], sort=sort, ignore_index=ignore_index ) if expected.shape != df.shape: @@ -10892,7 +10892,7 @@ def test_dataframe_from_ndarray_dup_columns(): @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) def test_dataframe_contains(name, contains, other_names): - column_names = [name] + other_names + column_names = [name, *other_names] gdf = cudf.DataFrame({c: [0] for c in column_names}) pdf = pd.DataFrame({c: [0] for c in 
column_names}) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index f93bd2c5d32..6a9dd4c4a66 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -16,7 +16,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): rng = np.random.default_rng(seed=0) - types = NUMERIC_TYPES + ["bool"] + types = [*NUMERIC_TYPES, "bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index e4422e204bc..eae0fd23ef8 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -917,7 +917,6 @@ def test_groupby_apply_return_col_from_df(): # tests a UDF that consists of purely colwise # ops, such as `lambda group: group.x + group.y` # which returns a column - func = lambda group: group.x + group.y # noqa:E731 df = cudf.DataFrame( { "id": range(10), @@ -1222,7 +1221,7 @@ def test_groupby_column_numeral(): pd.Series([0, 2, 0]), pd.Series([0, 2, 0], index=[0, 2, 1]), ], -) # noqa: E501 +) def test_groupby_external_series(series): pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) gdf = DataFrame.from_pandas(pdf) @@ -2016,8 +2015,8 @@ def test_multi_agg(): @pytest.mark.parametrize( "agg", ( - list(itertools.combinations(["count", "max", "min", "nunique"], 2)) - + [ + [ + *itertools.combinations(["count", "max", "min", "nunique"], 2), {"b": "min", "c": "mean"}, {"b": "max", "c": "mean"}, {"b": "count", "c": "mean"}, diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py index 430ed973f19..4921b7b51fc 100644 --- a/python/cudf/cudf/tests/test_hdf.py +++ b/python/cudf/cudf/tests/test_hdf.py @@ -16,7 +16,7 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): - types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set( + types = set([*NUMERIC_TYPES, "datetime64[ns]", "bool"]) - set( UNSIGNED_TYPES ) typer = {"col_" + val: val for val in types} diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 24d42d9eb4c..11f6d687931 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -1274,7 +1274,7 @@ def test_index_append_list(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) @pytest.mark.parametrize("name", [1, "a", None]) def test_index_basic(data, dtype, name): @@ -1399,7 +1399,7 @@ def test_multiindex_append(data, other): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_empty(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1410,7 +1410,7 @@ def test_index_empty(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_size(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1421,7 +1421,7 @@ def test_index_size(data, dtype): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", 
[*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_drop_duplicates(data, dtype): pdi = pd.Index(data, dtype=dtype) @@ -1437,7 +1437,7 @@ def test_dropna_bad_how(): @pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_tolist(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1455,7 +1455,7 @@ def test_index_tolist(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_iter_error(data, dtype): gdi = cudf.Index(data, dtype=dtype) @@ -1473,7 +1473,7 @@ def test_index_iter_error(data, dtype): @pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] + "dtype", [*NUMERIC_TYPES, "str", "category", "datetime64[ns]"] ) def test_index_values_host(data, dtype): gdi = cudf.Index(data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index f6941ce7fae..f8e61651f37 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -1527,7 +1527,7 @@ def test_categorical_typecast_outer(): result = left.merge(right, how="outer", on="key") -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_inner_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) @@ -1538,7 +1538,7 @@ def test_categorical_typecast_inner_one_cat(dtype): assert result["key"].dtype == left["key"].dtype.categories.dtype -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_left_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) @@ -1549,7 +1549,7 @@ def test_categorical_typecast_left_one_cat(dtype): assert result["key"].dtype == left["key"].dtype -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "str"]) def test_categorical_typecast_outer_one_cat(dtype): data = np.array([1, 2, 3], dtype=dtype) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index b48be6b2c2f..aaa8d7d07ee 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -58,12 +58,14 @@ def gdf(pdf): @pytest.fixture(params=[0, 1, 10, 100]) def gdf_writer_types(request): # datetime64[us], datetime64[ns] are unsupported due to a bug in parser - types = ( - NUMERIC_TYPES - + ["datetime64[s]", "datetime64[ms]"] - + TIMEDELTA_TYPES - + ["bool", "str"] - ) + types = [ + *NUMERIC_TYPES, + "datetime64[s]", + "datetime64[ms]", + *TIMEDELTA_TYPES, + "bool", + "str", + ] typer = {"col_" + val: val for val in types} ncols = len(types) nrows = request.param diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 41c1c3ccb20..c4b4ef60184 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -606,7 +606,7 @@ def normalized_equals(value1, value2): def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - supported_stat_types = supported_numpy_dtypes + ["str"] + supported_stat_types = 
[*supported_numpy_dtypes, "str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed if nrows == 100000: @@ -681,7 +681,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - supported_stat_types = supported_numpy_dtypes + ["str"] + supported_stat_types = [*supported_numpy_dtypes, "str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed if nrows == 200000: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 659d2ebd89a..de3636f7526 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -2313,7 +2313,7 @@ def test_parquet_writer_criteo(tmpdir): cont_names = ["I" + str(x) for x in range(1, 14)] cat_names = ["C" + str(x) for x in range(1, 27)] - cols = ["label"] + cont_names + cat_names + cols = ["label", *cont_names, *cat_names] df = cudf.read_csv(fname, sep="\t", names=cols, byte_range=(0, 1000000000)) df = df.drop(columns=cont_names) diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index 7d8303df0c3..9a2816f5444 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -90,4 +90,4 @@ def test_quantile_type_int_float(interpolation): actual = gsr.quantile(0.5, interpolation=interpolation) assert expected == actual - assert type(expected) == type(actual) + assert type(expected) is type(actual) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index d9f4ceaf3f7..8ea0d205e8b 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -31,7 +31,7 @@ [ cudf.Series([5, 1, 2, 3, None, 243, None, 4]), cudf.Series(["one", "two", "three", None, "one"], dtype="category"), - cudf.Series(list(range(400)) + [None]), + cudf.Series([*list(range(400)), None]), ], ) @pytest.mark.parametrize( @@ -128,7 +128,7 @@ def test_series_replace(): assert_eq(a8, sr8.to_numpy()) # large input containing null - sr9 = cudf.Series(list(range(400)) + [None]) + sr9 = cudf.Series([*list(range(400)), None]) sr10 = sr9.replace([22, 323, 27, 0], None) assert sr10.null_count == 5 assert len(sr10.dropna().to_numpy()) == (401 - 5) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 53fe5f7f30d..5cebdf37c9f 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -28,9 +28,9 @@ # If spilling is enabled globally, we skip many test permutations # to reduce running time. 
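The repeated `seq + [extra]` to `[*seq, extra]` rewrites in these test files follow ruff's RUF005: star-unpacking builds the result in one pass and, unlike list `+`, accepts any iterable on the left. A sketch with an illustrative type collection:

```python
NUMERIC_TYPES = {"int32", "int64", "float32"}  # illustrative; a set here

# NUMERIC_TYPES + ["str"] would raise TypeError for a set, but
# unpacking works for any iterable:
dtypes = [*NUMERIC_TYPES, "str", "category"]
assert dtypes[-2:] == ["str", "category"]
assert set(dtypes[:-2]) == NUMERIC_TYPES
```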
if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 + ALL_TYPES = ["float32"] + DATETIME_TYPES = ["datetime64[ms]"] + NUMERIC_TYPES = ["float32"] # To save time, we skip tests marked "pytest.mark.xfail" pytest_xfail = pytest.mark.skipif diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index f2faf4343b6..fcd98831686 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -351,7 +351,7 @@ def test_scalar_implicit_float_conversion(value): got = float(cudf.Scalar(value)) assert expect == got - assert type(expect) == type(got) + assert type(expect) is type(got) @pytest.mark.parametrize("value", [1, -1, 1.5, 0, "1", True, False]) @@ -360,7 +360,7 @@ def test_scalar_implicit_int_conversion(value): got = int(cudf.Scalar(value)) assert expect == got - assert type(expect) == type(got) + assert type(expect) is type(got) @pytest.mark.parametrize("cls", [int, float, bool]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a040d1dc57f..99bd9adb034 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -282,8 +282,8 @@ def test_series_concat_list_series_with_index(data, others, ignore_index): other_ps = others other_gs = [cudf.from_pandas(obj) for obj in others] - expected = pd.concat([psr] + other_ps, ignore_index=ignore_index) - actual = cudf.concat([gsr] + other_gs, ignore_index=ignore_index) + expected = pd.concat([psr, *other_ps], ignore_index=ignore_index) + actual = cudf.concat([gsr, *other_gs], ignore_index=ignore_index) assert_eq(expected, actual) @@ -1942,7 +1942,7 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("num_rows", [1, 100]) @pytest.mark.parametrize("num_bins", [1, 10]) @pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) +@pytest.mark.parametrize("dtype", [*NUMERIC_TYPES, "bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): rng = np.random.default_rng(seed=0) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 5406836ba61..6119fda0752 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -472,7 +472,7 @@ def test_loc_setitem_series_index_alignment_13031(other_index): ), ], ) -@pytest.mark.parametrize("arg", list(range(-20, 20)) + [5.6, 3.1]) +@pytest.mark.parametrize("arg", [*list(range(-20, 20)), 5.6, 3.1]) def test_series_set_item_range_index(ps, arg): gsr = cudf.from_pandas(ps) psr = ps.copy(deep=True) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 7af83a99d60..13d98e43ddc 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -669,7 +669,7 @@ def test_statistics_expose(manager: SpillManager): # Expose the first buffer buffers[0].owner.mark_exposed() assert len(manager.statistics.exposes) == 1 - stat = list(manager.statistics.exposes.values())[0] + stat = next(iter(manager.statistics.exposes.values())) assert stat.count == 1 assert stat.total_nbytes == buffers[0].nbytes assert stat.spilled_nbytes == 0 diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 9700f548a16..bdc9e695844 100644 --- 
a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -536,8 +536,8 @@ def test_string_cat(ps_gs, others, sep, na_rep, index): assert_eq(expect, got) - expect = ps.str.cat(others=[ps.index] + [ps.index], sep=sep, na_rep=na_rep) - got = gs.str.cat(others=[gs.index] + [gs.index], sep=sep, na_rep=na_rep) + expect = ps.str.cat(others=[ps.index, ps.index], sep=sep, na_rep=na_rep) + got = gs.str.cat(others=[gs.index, gs.index], sep=sep, na_rep=na_rep) assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index c3620db3880..87734ebed58 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -69,7 +69,7 @@ def test_basic_assert_index_equal( msg = str(e) if kind is not None: - if (kind == TypeError) and ( + if (kind is TypeError) and ( msg == ( "Categoricals can only be compared " diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 47e541fdcef..3637ef075f2 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -44,7 +44,7 @@ def test_tokenize(): actual = strings.str.tokenize() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -71,7 +71,7 @@ def test_tokenize_delimiter(): actual = strings.str.tokenize(delimiter="o") - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -106,7 +106,7 @@ def test_detokenize(): "the siamésé cat jumped under the sofa", ] ) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) indices = cudf.Series( @@ -122,7 +122,7 @@ def test_detokenize(): "the+the+the+the", ] ) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -150,7 +150,7 @@ def test_token_count(delimiter, expected_token_counts): actual = strings.str.token_count(delimiter) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual, check_dtype=False) @@ -208,7 +208,7 @@ def test_tokenize_with_vocabulary(delimiter, input, default_id, results): ) actual = tokenizer.tokenize(strings, delimiter, default_id) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -232,7 +232,7 @@ def test_normalize_spaces(): actual = strings.str.normalize_spaces() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -252,7 +252,7 @@ def test_normalize_characters(): ) actual = strings.str.normalize_characters() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( @@ -266,7 +266,7 @@ def test_normalize_characters(): ] ) actual = strings.str.normalize_characters(do_lower=False) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -309,7 +309,7 @@ def test_ngrams(n, separator, expected_values): actual = strings.str.ngrams(n=n, separator=separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -364,7 +364,7 @@ def test_character_ngrams(n, expected_values, expected_index, as_list): actual = strings.str.character_ngrams(n=n, as_list=as_list) - assert type(expected) == type(actual) + assert 
type(expected) is type(actual) assert_eq(expected, actual) @@ -379,12 +379,12 @@ def test_hash_character_ngrams(): ] ) actual = strings.str.hash_character_ngrams(5, True) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) actual = strings.str.hash_character_ngrams(5) expected = expected.explode() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -417,7 +417,7 @@ def test_ngrams_tokenize(n, separator, expected_values): actual = strings.str.ngrams_tokenize(n=n, separator=separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -844,7 +844,7 @@ def test_porter_stemmer_measure(): actual = strings.str.porter_stemmer_measure() - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -856,14 +856,14 @@ def test_is_vowel_consonant(): [False, False, True, False, False, False, True, False, None, False] ) actual = strings.str.is_vowel(2) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( [True, False, True, False, False, False, True, True, None, False] ) actual = strings.str.is_consonant(1) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0]) @@ -871,14 +871,14 @@ def test_is_vowel_consonant(): [False, True, False, False, True, False, True, True, None, False] ) actual = strings.str.is_vowel(indices) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) expected = cudf.Series( [False, False, True, True, False, True, False, False, None, False] ) actual = strings.str.is_consonant(indices) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) @@ -1097,5 +1097,5 @@ def test_byte_pair_encoding(separator, input, results): expected = cudf.Series([results, None, "", results]) actual = encoder(strings, separator) - assert type(expected) == type(actual) + assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 252bb19063a..5681601d2be 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -829,7 +829,7 @@ >>> cudf.read_json(json_str, engine='cudf', lines=True, dtype={'k1':float, 'k2':cudf.ListDtype(int)}) k1 k2 0 1.0 [1] -""" # noqa: E501 +""" doc_read_json: Callable = docfmt_partial(docstring=_docstring_read_json) _docstring_to_json = """ diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 78aeac425f7..8966789fee8 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -64,7 +64,7 @@ def query_parser(text): Returns ------- info: a `dict` of the parsed info - """ # noqa + """ # convert any '@' to text = text.replace("@", ENVREF_PREFIX) tree = ast.parse(text) @@ -249,7 +249,7 @@ def query_execute(df, expr, callenv): nrows = len(df) out = column_empty(nrows, dtype=np.bool_) # run kernel - args = [out] + colarrays + envargs + args = [out, *colarrays, *envargs] with _CUDFNumbaConfig(): kernel.forall(nrows)(*args) out_mask = applyutils.make_aggregate_nullmask(df, columns=columns) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 
e6d252b8807..c83c1cbe895 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -210,7 +210,7 @@ class GetAttrGetItemMixin: # Tracking of protected keys by each subclass is necessary to make the # `__getattr__`->`__getitem__` call safe. See - # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # for an explanation. In brief, defining the `_PROTECTED_KEYS` allows this # class to avoid calling `__getitem__` inside `__getattr__` when # `__getitem__` will internally again call `__getattr__`, resulting in an diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 4473a0e6f12..d494e157a18 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1589,8 +1589,8 @@ def test_numpy_cupy_flatiter(series): _, s = series arr = s.values - assert type(arr.flat._fsproxy_fast) == cp.flatiter - assert type(arr.flat._fsproxy_slow) == np.flatiter + assert type(arr.flat._fsproxy_fast) is cp.flatiter + assert type(arr.flat._fsproxy_slow) is np.flatiter @pytest.mark.xfail( diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 665b9d6fb08..1909392b9f7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -15,7 +15,7 @@ def assert_plots_equal(expect, got): for expect_ch, got_ch in zip( expect.get_children(), got.get_children() ): - assert type(expect_ch) == type(got_ch) + assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): assert_equal(expect_ch.get_xdata(), got_ch.get_xdata()) assert_equal(expect_ch.get_ydata(), got_ch.get_ydata()) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py index 27d9df83476..2a0f6697f3a 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py @@ -8,7 +8,7 @@ def assert_plotly_equal(expect, got): - assert type(expect) == type(got) + assert type(expect) is type(got) if isinstance(expect, dict): assert expect.keys() == got.keys() for k in expect.keys(): diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 4b272900acd..021c5bac9b7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -14,7 +14,7 @@ def assert_plots_equal(expect, got): for expect_ch, got_ch in zip( expect.get_children(), got.get_children() ): - assert type(expect_ch) == type(got_ch) + assert type(expect_ch) is type(got_ch) if isinstance(expect_ch, Line2D): assert_equal(expect_ch.get_xdata(), got_ch.get_xdata()) assert_equal(expect_ch.get_ydata(), got_ch.get_ydata()) diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index ba4858c5619..72e09b872d5 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ 
b/python/cudf_polars/cudf_polars/__init__.py @@ -21,8 +21,8 @@ del _ensure_polars_version __all__: list[str] = [ - "execute_with_cudf", "Translator", "__git_commit__", "__version__", + "execute_with_cudf", ] diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 3b1eff4a0d0..9dff8822376 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column"] +__all__: list[str] = ["Column", "DataFrame"] from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 326d6b65cbe..98d49e36fb1 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -36,27 +36,27 @@ from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ - "Expr", + "Agg", + "AggInfo", + "BinOp", + "BooleanFunction", + "Cast", + "Col", + "ColRef", "ErrorExpr", - "NamedExpr", + "Expr", + "Filter", + "Gather", + "GroupedRollingWindow", + "Len", "Literal", "LiteralColumn", - "Len", - "Col", - "ColRef", - "BooleanFunction", - "StringFunction", - "TemporalFunction", + "NamedExpr", + "RollingWindow", "Sort", "SortBy", - "Gather", - "Filter", - "RollingWindow", - "GroupedRollingWindow", - "Cast", - "Agg", - "AggInfo", + "StringFunction", + "TemporalFunction", "Ternary", - "BinOp", "UnaryFunction", ] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 2af9fdaacc5..624a9bd87ea 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -31,7 +31,7 @@ class Agg(Expr): - __slots__ = ("name", "options", "op", "request") + __slots__ = ("name", "op", "options", "request") _non_child = ("dtype", "name", "options") def __init__( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 23851f91938..4c7ae007070 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -20,7 +20,7 @@ from cudf_polars.containers import Column, DataFrame -__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext", "ColRef"] +__all__ = ["AggInfo", "Col", "ColRef", "ExecutionContext", "Expr", "NamedExpr"] class AggInfo(NamedTuple): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py index 1682e7a8a9c..5aa35ead127 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -195,7 +195,7 @@ def do_evaluate( # If the input null count was non-zero, we must # post-process the result to insert the correct value. 
h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: + if (is_any and not h_result) or (not is_any and h_result): # Any All # False || Null => Null True && Null => Null return Column(plc.Column.all_null_like(column.obj, 1)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py index fa68bcb9426..48c37d101f4 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: import pylibcudf as plc -__all__ = ["RollingWindow", "GroupedRollingWindow"] +__all__ = ["GroupedRollingWindow", "RollingWindow"] class RollingWindow(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py index 77d7d4c0d22..12326740f74 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -20,7 +20,7 @@ from cudf_polars.containers import DataFrame -__all__ = ["Gather", "Filter"] +__all__ = ["Filter", "Gather"] class Gather(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 92c3c658c21..124a6e8d71c 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -92,7 +92,7 @@ def from_polars(cls, obj: pl_expr.StringFunction) -> Self: raise ValueError("StringFunction required") return getattr(cls, name) - __slots__ = ("name", "options", "_regex_program") + __slots__ = ("_regex_program", "name", "options") _non_child = ("dtype", "name", "options") def __init__( diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py index 7999ec86068..10caaff6811 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -21,7 +21,7 @@ from cudf_polars.containers import DataFrame -__all__ = ["Cast", "UnaryFunction", "Len"] +__all__ = ["Cast", "Len", "UnaryFunction"] class Cast(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e8d9691f2a0..a28b4cf25b2 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -42,24 +42,24 @@ __all__ = [ "IR", - "ErrorNode", - "PythonScan", - "Scan", "Cache", - "DataFrameScan", - "Select", - "GroupBy", - "Join", "ConditionalJoin", - "HStack", + "DataFrameScan", "Distinct", - "Sort", - "Slice", + "ErrorNode", "Filter", - "Projection", + "GroupBy", + "HConcat", + "HStack", + "Join", "MapFunction", + "Projection", + "PythonScan", + "Scan", + "Select", + "Slice", + "Sort", "Union", - "HConcat", ] @@ -130,7 +130,7 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" - __slots__ = ("schema", "_non_child_args") + __slots__ = ("_non_child_args", "schema") # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("schema",) # Concrete classes should set this up with the arguments that will @@ -253,16 +253,16 @@ class Scan(IR): """Input from files.""" __slots__ = ( - "typ", - "reader_options", "cloud_options", "config_options", - 
"paths", - "with_columns", - "skip_rows", "n_rows", - "row_index", + "paths", "predicate", + "reader_options", + "row_index", + "skip_rows", + "typ", + "with_columns", ) _non_child = ( "schema", @@ -688,7 +688,7 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ - __slots__ = ("df", "projection", "predicate") + __slots__ = ("df", "predicate", "projection") _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" @@ -819,11 +819,11 @@ class GroupBy(IR): """Perform a groupby.""" __slots__ = ( + "agg_infos", "agg_requests", "keys", "maintain_order", "options", - "agg_infos", ) _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") keys: tuple[expr.NamedExpr, ...] @@ -993,7 +993,7 @@ def do_evaluate( class ConditionalJoin(IR): """A conditional inner join of two dataframes on a predicate.""" - __slots__ = ("predicate", "options", "ast_predicate") + __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr options: tuple @@ -1053,7 +1053,7 @@ def do_evaluate( class Join(IR): """A join of two dataframes.""" - __slots__ = ("left_on", "right_on", "options") + __slots__ = ("left_on", "options", "right_on") _non_child = ("schema", "left_on", "right_on", "options") left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" @@ -1337,7 +1337,7 @@ def do_evaluate( class Distinct(IR): """Produce a new dataframe with distinct rows.""" - __slots__ = ("keep", "subset", "zlice", "stable") + __slots__ = ("keep", "stable", "subset", "zlice") _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption """Which distinct value to keep.""" @@ -1424,7 +1424,7 @@ def do_evaluate( class Sort(IR): """Sort a dataframe.""" - __slots__ = ("by", "order", "null_order", "stable", "zlice") + __slots__ = ("by", "null_order", "order", "stable", "zlice") _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") by: tuple[expr.NamedExpr, ...] 
"""Sort keys.""" @@ -1505,7 +1505,7 @@ def do_evaluate( class Slice(IR): """Slice a dataframe.""" - __slots__ = ("offset", "length") + __slots__ = ("length", "offset") _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py index be8338cb9a9..b3248dae93c 100644 --- a/python/cudf_polars/cudf_polars/dsl/traversal.py +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -16,10 +16,10 @@ __all__: list[str] = [ - "traversal", - "reuse_if_unchanged", - "make_recursive", "CachingVisitor", + "make_recursive", + "reuse_if_unchanged", + "traversal", ] diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 57c5fdaa7cf..52be130ab90 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -21,13 +21,13 @@ from cudf_polars.dsl import expr, ir, nodebase __all__: list[str] = [ - "PolarsIR", - "PolarsExpr", - "NodeTraverser", - "OptimizationArgs", - "GenericTransformer", "ExprTransformer", + "GenericTransformer", "IRTransformer", + "NodeTraverser", + "OptimizationArgs", + "PolarsExpr", + "PolarsIR", ] PolarsIR: TypeAlias = Union[ diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index e7ac72df609..6bb5d78c488 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -19,9 +19,9 @@ ) __all__ = [ - "from_polars", - "downcast_arrow_lists", "can_cast", + "downcast_arrow_lists", + "from_polars", "is_order_preserving_cast", ] import pylibcudf as plc @@ -75,11 +75,13 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: return ( ( from_ == to - or not has_empty - and ( - plc.traits.is_fixed_width(to) - and plc.traits.is_fixed_width(from_) - and plc.unary.is_supported_cast(from_, to) + or ( + not has_empty + and ( + plc.traits.is_fixed_width(to) + and plc.traits.is_fixed_width(from_) + and plc.unary.is_supported_cast(from_, to) + ) ) ) or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f050a7c568a..b781b13ec10 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -158,6 +158,7 @@ ignore = [ "ISC002", # multi-line-implicit-string-concatenation ] fixable = ["ALL"] +typing-modules = ["cudf_polars.typing"] [tool.ruff.lint.per-file-ignores] "**/tests/**/*.py" = ["D"] diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index cc17e71039a..20eb2404b77 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -3,15 +3,15 @@ import warnings from importlib import import_module -from dask import config import dask.dataframe as dd -from dask.dataframe import from_delayed # noqa: E402 +from dask import config +from dask.dataframe import from_delayed -import cudf # noqa: E402 +import cudf -from . import backends # noqa: E402, F401 -from ._version import __git_commit__, __version__ # noqa: E402, F401 -from .core import concat, from_cudf, DataFrame, Index, Series # noqa: F401 +from . 
import backends # noqa: F401 +from ._version import __git_commit__, __version__ # noqa: F401 +from .core import DataFrame, Index, Series, concat, from_cudf QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED @@ -56,17 +56,17 @@ def inner_func(*args, **kwargs): if QUERY_PLANNING_ON: + from . import io from ._expr.expr import _patch_dask_expr - from . import io # noqa: F401 groupby_agg = _deprecated_api("dask_cudf.groupby_agg") read_text = DataFrame.read_text _patch_dask_expr() else: + from . import io # noqa: F401 from ._legacy.groupby import groupby_agg # noqa: F401 from ._legacy.io import read_text # noqa: F401 - from . import io # noqa: F401 to_orc = _deprecated_api( @@ -78,10 +78,10 @@ def inner_func(*args, **kwargs): __all__ = [ "DataFrame", - "Series", "Index", - "from_cudf", + "Series", "concat", + "from_cudf", "from_delayed", ] diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 89c0d108743..2dc4031b876 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -213,8 +213,9 @@ def _create_array_collection_with_meta(expr): name = result._name meta = result._meta divisions = result.divisions - chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( - (d,) for d in meta.shape[1:] + chunks = ( + (np.nan,) * (len(divisions) - 1), + *tuple((d,) for d in meta.shape[1:]), ) if len(chunks) > 1: if isinstance(dsk, HighLevelGraph): @@ -224,11 +225,11 @@ def _create_array_collection_with_meta(expr): layer = dsk if isinstance(layer, Blockwise): layer.new_axes["j"] = chunks[1][0] - layer.output_indices = layer.output_indices + ("j",) + layer.output_indices = (*layer.output_indices, "j") else: suffix = (0,) * (len(chunks) - 1) for i in range(len(chunks[0])): - layer[(name, i) + suffix] = layer.pop((name, i)) + layer[(name, i, *suffix)] = layer.pop((name, i)) return da.Array(dsk, name=name, chunks=chunks, meta=meta) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 7d6d5c05cbe..5fd217209ec 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -10,7 +10,7 @@ # This module provides backward compatibility for legacy import patterns. if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( # noqa: E402 + from dask_cudf._expr.collection import ( DataFrame, Index, Series, @@ -19,7 +19,7 @@ from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 -concat = dd.concat # noqa: F401 +concat = dd.concat @_dask_cudf_performance_tracking diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 212951336c9..9bca33e414a 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,9 +1,8 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api, QUERY_PLANNING_ON - -from . import csv, orc, json, parquet, text # noqa: F401 +from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from . 
import csv, json, orc, parquet, text # noqa: F401 read_csv = _deprecated_api( "dask_cudf.io.read_csv", new_api="dask_cudf.read_csv" diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ce9935c8b3c..ba6209c4820 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -40,7 +40,7 @@ def TaskList(*x): from dask_cudf import QUERY_PLANNING_ON, _deprecated_api # Dask-expr imports CudfEngine from this module -from dask_cudf._legacy.io.parquet import CudfEngine # noqa: F401 +from dask_cudf._legacy.io.parquet import CudfEngine if TYPE_CHECKING: from collections.abc import MutableMapping diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5130b804179..cda7e2d134d 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -489,7 +489,7 @@ def test_repartition_hash_staged(npartitions): ) # Make sure we are getting a dask_cudf dataframe - assert type(ddf_new) == type(ddf) + assert type(ddf_new) is type(ddf) # Check that the length was preserved assert len(ddf_new) == len(ddf) @@ -956,7 +956,7 @@ def func(x): # NOTE: The calculation here doesn't need to make sense. # We just need to make sure we get the right type back. - assert type(result) == type(expect) + assert type(result) is type(expect) @pytest.mark.parametrize("data", [[1, 2, 3], [1.1, 2.3, 4.5]]) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index fe57d4a4f00..d91b9defc1c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -44,7 +44,7 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): if not preserve_index and index is not None: df1.index.name = None - assert type(df1) == type(df2) + assert type(df1) is type(df2) assert_eq(df1, df2) # Check that preserve_index does not produce a RangeIndex diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 918290aa6fa..9bd3b506db0 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -58,7 +58,7 @@ def pdf(request): # deprecation check for "collect". 
@pytest.mark.parametrize( "aggregation", - sorted(tuple(set(OPTIMIZED_AGGS) - {list}) + ("collect",)), + sorted((*tuple(set(OPTIMIZED_AGGS) - {list}), "collect")), ) @pytest.mark.parametrize("series", [False, True]) def test_groupby_basic(series, aggregation, pdf): diff --git a/python/libcudf/libcudf/__init__.py b/python/libcudf/libcudf/__init__.py index 10c476cbe89..4077fa8fbf9 100644 --- a/python/libcudf/libcudf/__init__.py +++ b/python/libcudf/libcudf/__init__.py @@ -14,3 +14,5 @@ from libcudf._version import __git_commit__, __version__ from libcudf.load import load_library + +__all__ = ["__git_commit__", "__version__", "load_library"] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 62a2170f83e..8ea176a6b07 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -65,8 +65,8 @@ "aggregation", "binaryop", "column_factories", - "contiguous_split", "concatenate", + "contiguous_split", "copying", "datetime", "experimental", @@ -83,6 +83,7 @@ "lists", "merge", "null_mask", + "nvtext", "partitioning", "quantiles", "reduce", @@ -91,13 +92,12 @@ "rolling", "round", "search", + "sorting", "stream_compaction", "strings", - "sorting", "traits", "transform", "transpose", "types", "unary", - "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 4f125d3a733..d88a7d4b825 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -15,11 +15,11 @@ ) __all__ = [ + "byte_pair_encode", "edit_distance", "generate_ngrams", "jaccard", "minhash", - "byte_pair_encode", "ngrams_tokenize", "normalize", "replace", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index 1cbaac57315..555ca2fb02c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -281,7 +281,7 @@ def test_read_csv_header(csv_table_data, source_or_sink, header): new_tbl_dict = {} for i, (name, vals) in enumerate(tbl_dict.items()): str_vals = [str(val) for val in vals] - new_tbl_dict[str(i)] = [name] + str_vals + new_tbl_dict[str(i)] = [name, *str_vals] pa_table = pa.table(new_tbl_dict) assert_table_and_meta_eq( From 852338e71dae9833a53507bd4b1470798f0a5c4b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 2 Dec 2024 16:41:35 -0600 Subject: [PATCH 05/78] Update PyTorch to >=2.4.0 to get fix for CUDA array interface bug, and drop CUDA 11 PyTorch tests. (#17475) This PR updates our PyTorch lower bound to 2.4.0 to get the bugfix from https://github.com/pytorch/pytorch/pull/121458. Also, this PR drops CUDA 11 tests because conda-forge no longer produces CUDA 11 builds of PyTorch. This was causing a failure on Hopper GPUs because the last available CUDA 11 builds from conda-forge do not include sm90 support. 
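For reference, a minimal sketch (not part of this patch) of the interchange this change re-enables. It mirrors the zero-length round-trip test restored below and assumes a CUDA build of PyTorch >= 2.4:

```python
import cudf
import torch

# Zero-length columns now round-trip through __cuda_array_interface__.
# Older PyTorch rejected zero-size buffers because it validated that the
# data pointer was device-accessible even when size == 0
# (see pytorch/pytorch#98133, fixed by pytorch/pytorch#121458).
index = cudf.Index([], dtype="float64")
tensor = torch.tensor(index)  # converts via the CUDA array interface
assert cudf.Index(tensor).equals(index)
```
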
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17475 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 3 --- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 7 ++----- .../cudf/cudf/tests/test_cuda_array_interface.py | 15 +++++---------- .../dependencies.yaml | 2 +- 5 files changed, 9 insertions(+), 20 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 97c72ec8042..2be64b7cd70 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -80,7 +80,6 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -97,8 +96,6 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.15.2 -- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 84b58b6d7a4..6b5ca04c015 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -78,7 +78,7 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 +- pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich diff --git a/dependencies.yaml b/dependencies.yaml index 3976696a41c..259d41b59fe 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -885,12 +885,9 @@ dependencies: - output_types: conda matrices: - matrix: - arch: x86_64 + cuda: "12.*" packages: - # Currently, CUDA + aarch64 builds of pytorch do not exist on conda-forge. - - pytorch>=2.1.0 - # We only install these on x86_64 to avoid pulling pytorch as a - # dependency of transformers. + - pytorch>=2.4.0 - *tokenizers - *transformers - matrix: diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 381ca45de31..dcde0dab83d 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -187,7 +187,7 @@ def test_column_from_ephemeral_cupy_try_lose_reference(): ), ) def test_cuda_array_interface_pytorch(): - torch = pytest.importorskip("torch", minversion="1.6.0") + torch = pytest.importorskip("torch", minversion="2.4.0") if not torch.cuda.is_available(): pytest.skip("need gpu version of pytorch to be installed") @@ -202,15 +202,10 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - # TODO: This test fails with PyTorch 2. It appears that PyTorch - # checks that the pointer is device-accessible even when the - # size is zero. 
See - # https://github.com/pytorch/pytorch/issues/98133 - # - # index = cudf.Index([], dtype="float64") - # tensor = torch.tensor(index) - # got = cudf.Index(tensor) - # assert_eq(got, index) + index = cudf.Index([], dtype="float64") + tensor = torch.tensor(index) + got = cudf.Index(tensor) + assert_eq(got, index) index = cudf.core.index.RangeIndex(start=0, stop=100) tensor = torch.tensor(index) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 6b317cc13fb..e726b7fdca1 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -213,7 +213,7 @@ dependencies: - output_types: conda packages: - numpy - - pytorch>=2.1.0 + - pytorch>=2.4.0 test_seaborn: common: - output_types: conda From da72cf609f61fa4dd154be377a8b591ea1773e04 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:08:37 -0800 Subject: [PATCH 06/78] Remove cudf._lib.filling in favor of inlining pylibcudf (#17459) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17459 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/filling.pyx | 57 --------------------- python/cudf/cudf/core/column/categorical.py | 7 +-- python/cudf/cudf/core/column/column.py | 47 +++++++++++------ python/cudf/cudf/core/frame.py | 11 +++- python/cudf/cudf/core/index.py | 14 ++--- 7 files changed, 51 insertions(+), 87 deletions(-) delete mode 100644 python/cudf/cudf/_lib/filling.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index de483b3070d..e69a2672163 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -18,7 +18,6 @@ set(cython_sources column.pyx copying.pyx csv.pyx - filling.pyx groupby.pyx interop.pyx merge.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index ee1bd13f2c4..ec32386b2ce 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ binaryop, copying, csv, - filling, groupby, interop, merge, diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx deleted file mode 100644 index b2f4c620144..00000000000 --- a/python/cudf/cudf/_lib/filling.pyx +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def fill_in_place(Column destination, int begin, int end, DeviceScalar value): - pylibcudf.filling.fill_in_place( - destination.to_pylibcudf(mode='write'), - begin, - end, - ( as_device_scalar(value, dtype=destination.dtype)).c_value - ) - - -@acquire_spill_lock() -def fill(Column destination, int begin, int end, DeviceScalar value): - return Column.from_pylibcudf( - pylibcudf.filling.fill( - destination.to_pylibcudf(mode='read'), - begin, - end, - ( as_device_scalar(value)).c_value - ) - ) - - -@acquire_spill_lock() -def repeat(list inp, object count): - ctbl = pylibcudf.Table([col.to_pylibcudf(mode="read") for col in inp]) - if isinstance(count, Column): - count = count.to_pylibcudf(mode="read") - return columns_from_pylibcudf_table( - pylibcudf.filling.repeat( - ctbl, - count - ) - ) - - -@acquire_spill_lock() -def sequence(int size, DeviceScalar init, DeviceScalar step): - return Column.from_pylibcudf( - pylibcudf.filling.sequence( - size, - ( as_device_scalar(init)).c_value, - ( as_device_scalar(step)).c_value - ) - ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index cbbe01f7289..c849a9d3d2b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -668,13 +668,8 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = cudf._lib.scalar.as_device_scalar( - fill_code, self.codes.dtype - ) - result = self if inplace else self.copy() - - libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) + result.codes._fill(fill_code, begin, end, inplace=True) return result def slice(self, start: int, stop: int, stride: int | None = None) -> Self: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d1938f47d66..cdc3a03f445 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -401,14 +401,19 @@ def _fill( # the scalar is None when calling `is_valid`. 
slr = cudf.Scalar(fill_value, dtype=self.dtype) - if not inplace: - return libcudf.filling.fill(self, begin, end, slr.device_value) - - if is_string_dtype(self.dtype): - return self._mimic_inplace( - libcudf.filling.fill(self, begin, end, slr.device_value), - inplace=True, - ) + if not inplace or is_string_dtype(self.dtype): + with acquire_spill_lock(): + result = type(self).from_pylibcudf( + plc.filling.fill( + self.to_pylibcudf(mode="read"), + begin, + end, + slr.device_value.c_value, + ) + ) + if is_string_dtype(self.dtype): + return self._mimic_inplace(result, inplace=True) + return result # type: ignore[return-value] if not slr.is_valid() and not self.nullable: mask = as_buffer( @@ -418,8 +423,13 @@ def _fill( ) self.set_base_mask(mask) - libcudf.filling.fill_in_place(self, begin, end, slr.device_value) - + with acquire_spill_lock(): + plc.filling.fill_in_place( + self.to_pylibcudf(mode="write"), + begin, + end, + slr.device_value.c_value, + ) return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: @@ -1813,11 +1823,18 @@ def as_column( * range objects """ if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): - column = libcudf.filling.sequence( - len(arbitrary), - as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), - as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), - ) + with acquire_spill_lock(): + column = Column.from_pylibcudf( + plc.filling.sequence( + len(arbitrary), + as_device_scalar( + arbitrary.start, dtype=np.dtype(np.int64) + ).c_value, + as_device_scalar( + arbitrary.step, dtype=np.dtype(np.int64) + ).c_value, + ) + ) if cudf.get_option("default_integer_bitwidth") and dtype is None: dtype = cudf.dtype( f'i{cudf.get_option("default_integer_bitwidth")//8}' diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 70789160cb6..0a7e6fefe6e 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1901,7 +1901,16 @@ def _repeat( if not is_scalar(repeats): repeats = as_column(repeats) - return libcudf.filling.repeat(columns, repeats) + with acquire_spill_lock(): + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + if isinstance(repeats, ColumnBase): + repeats = repeats.to_pylibcudf(mode="read") + return [ + libcudf.column.Column.from_pylibcudf(col) + for col in plc.filling.repeat(plc_table, repeats).columns() + ] @_performance_tracking @_warn_no_dask_cudf diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac04cf36ec..cc3d8448151 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -20,7 +20,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.filling import sequence from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( @@ -3402,11 +3401,14 @@ def interval_range( start = start.astype(common_dtype) freq = freq.astype(common_dtype) - bin_edges = sequence( - size=periods + 1, - init=start.device_value, - step=freq.device_value, - ) + with acquire_spill_lock(): + bin_edges = libcudf.column.Column.from_pylibcudf( + plc.filling.sequence( + size=periods + 1, + init=start.device_value.c_value, + step=freq.device_value.c_value, + ) + ) return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name) From b67c0a97d16ec3c9d0abf825ad9755013b24ebab Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 2 Dec 2024 16:04:59 -0800 Subject: [PATCH 07/78] Update MurmurHash3_x64_128 to use the cuco equivalent 
implementation (#17457) This PR modifies MurmurHash3_x64_128 to utilize the cuco equivalent implementation, eliminating duplication. Authors: - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17457 --- .../hashing/detail/murmurhash3_x64_128.cuh | 163 +++--------------- .../hashing/detail/murmurhash3_x86_32.cuh | 2 +- cpp/include/cudf/hashing/detail/xxhash_64.cuh | 2 +- cpp/src/hash/murmurhash3_x64_128.cu | 17 +- 4 files changed, 35 insertions(+), 149 deletions(-) diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh index 5e88b905023..31390aa3edf 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -15,177 +15,63 @@ */ #pragma once +#include +#include #include #include -#include +#include +#include +#include namespace cudf::hashing::detail { -// MurmurHash3_x64_128 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. template struct MurmurHash3_x64_128 { - using result_type = thrust::pair; + using result_type = cuda::std::array; - constexpr MurmurHash3_x64_128() = default; - constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} - - __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED) + : _impl{seed} { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). 
- auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const - { - uint64_t result = getblock32(data, offset + 4); - result = result << 32; - return result | getblock32(data, offset); - } + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } - __device__ inline uint64_t fmix64(uint64_t k) const + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const { - k ^= k >> 33; - k *= 0xff51afd7ed558ccdUL; - k ^= k >> 33; - k *= 0xc4ceb9fe1a85ec53UL; - k ^= k >> 33; - return k; + return this->_impl.compute_hash(bytes, size); } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } - + private: template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a 8-byte chunk. - uint64_t k1 = 0; - uint64_t k2 = 0; - auto const tail = reinterpret_cast(data) + tail_offset; - switch (len & (BLOCK_SIZE - 1)) { - case 15: k2 ^= static_cast(tail[14]) << 48; - case 14: k2 ^= static_cast(tail[13]) << 40; - case 13: k2 ^= static_cast(tail[12]) << 32; - case 12: k2 ^= static_cast(tail[11]) << 24; - case 11: k2 ^= static_cast(tail[10]) << 16; - case 10: k2 ^= static_cast(tail[9]) << 8; - case 9: - k2 ^= static_cast(tail[8]) << 0; - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - h.second ^= k2; - - case 8: k1 ^= static_cast(tail[7]) << 56; - case 7: k1 ^= static_cast(tail[6]) << 48; - case 6: k1 ^= static_cast(tail[5]) << 40; - case 5: k1 ^= static_cast(tail[4]) << 32; - case 4: k1 ^= static_cast(tail[3]) << 24; - case 3: k1 ^= static_cast(tail[2]) << 16; - case 2: k1 ^= static_cast(tail[1]) << 8; - case 1: - k1 ^= static_cast(tail[0]) << 0; - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - h.first ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + __device__ constexpr result_type compute(T const& key) const { - auto const nblocks = len / BLOCK_SIZE; - uint64_t h1 = m_seed; - uint64_t h2 = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes - uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes - - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - - h1 ^= k1; - h1 = rotate_bits_left(h1, 27); - h1 += h2; - h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - - h2 ^= k2; - h2 = rotate_bits_left(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; - } - - thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); - - // Finalize hash. 
- h1 ^= len; - h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return {h1, h2}; + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); } - private: - uint64_t m_seed{}; - static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes - - static constexpr uint64_t c1 = 0x87c37b91114253d5UL; - static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; + cuco::murmurhash3_x64_128 _impl; }; template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( bool const& key) const { - return compute(key); + return this->compute(key); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( float const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( double const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> @@ -193,9 +79,8 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( cudf::string_view const& key) const { - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); } template <> @@ -203,7 +88,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal32 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -211,7 +96,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal64 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -219,7 +104,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal128 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index 38a7d927b9c..e0c7ce840d7 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -33,7 +33,7 @@ template struct MurmurHash3_x86_32 { using result_type = hash_value_type; - __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) + CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} { } diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh index 7d72349e340..d77d040b365 100644 --- a/cpp/include/cudf/hashing/detail/xxhash_64.cuh +++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh @@ -31,7 +31,7 @@ template struct XXHash_64 { using result_type = std::uint64_t; - __host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu index 090bd92af8c..43df7f325ac 100644 --- a/cpp/src/hash/murmurhash3_x64_128.cu +++ b/cpp/src/hash/murmurhash3_x64_128.cu @@ -24,6 +24,7 @@ #include 
#include +#include #include namespace cudf { @@ -31,7 +32,7 @@ namespace hashing { namespace detail { namespace { -using hash_value_type = thrust::pair; +using hash_value_type = cuda::std::array; /** * @brief Computes the hash value of a row in the given table. @@ -58,7 +59,7 @@ class murmur_device_row_hasher { */ __device__ void operator()(size_type row_index) const noexcept { - auto h = cudf::detail::accumulate( + auto const h = cudf::detail::accumulate( _input.begin(), _input.end(), hash_value_type{_seed, 0}, @@ -66,8 +67,8 @@ class murmur_device_row_hasher { return cudf::type_dispatcher( column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); }); - _output1[row_index] = h.first; - _output2[row_index] = h.second; + _output1[row_index] = h[0]; + _output2[row_index] = h[1]; } /** @@ -78,13 +79,13 @@ class murmur_device_row_hasher { template ())> __device__ hash_value_type operator()(column_device_view const& col, size_type row_index, - Nullate const _check_nulls, - hash_value_type const _seed) const noexcept + Nullate const check_nulls, + hash_value_type const seed) const noexcept { - if (_check_nulls && col.is_null(row_index)) { + if (check_nulls && col.is_null(row_index)) { return {std::numeric_limits::max(), std::numeric_limits::max()}; } - auto const hasher = MurmurHash3_x64_128{_seed.first}; + auto const hasher = MurmurHash3_x64_128{seed[0]}; return hasher(col.element(row_index)); }

From 12c77f32eee3b1aa0ba5592d9f25b4664104bd04 Mon Sep 17 00:00:00 2001 From: tequilayu <48981002+tequilayu@users.noreply.github.com> Date: Tue, 3 Dec 2024 08:57:51 +0800 Subject: [PATCH 08/78] add comment to Series.tolist method (#17350) closes #15767 This PR adds a comment to the `Series.tolist` method. It mentions that the method will raise a `TypeError` when it's called and suggests alternatives. Authors: - https://github.com/tequilayu - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17350 --- python/cudf/cudf/core/series.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 928f3c3d666..58cefc6554e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -943,6 +943,19 @@ def drop( ) def tolist(self): + """Conversion to host memory lists is currently unsupported + + Raises + ------ + TypeError + If this method is called + + Notes + ----- + cuDF currently does not support implicit conversion from GPU stored series to + host stored lists. A `TypeError` is raised when this method is called. + Consider calling `.to_arrow().to_pylist()` to construct a Python list. + """ raise TypeError( "cuDF does not support conversion to host memory " "via the `tolist()` method. Consider using " From 3785a48eb81be23b44b895624f21acbfc1a828c5 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 3 Dec 2024 11:17:04 -0600 Subject: [PATCH 09/78] Add multi-partition `DataFrameScan` support to cuDF-Polars (#17441) Follow-up to https://github.com/rapidsai/cudf/pull/17262 Adds support for parallel `DataFrameScan` operations.
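For illustration (not part of the patch), a rough sketch of how the new executor is engaged, following the tests added below. The lowering splits a `DataFrameScan` into `ceil(nrows / max_rows_per_partition)` partitions and evaluates them as a Dask task graph:

```python
import polars as pl

# "dask-experimental" and "max_rows_per_partition" are the option names
# introduced in this PR; the other values here are arbitrary.
engine = pl.GPUEngine(
    raise_on_fail=True,
    executor="dask-experimental",
    executor_options={"max_rows_per_partition": 1_000_000},
)

ldf = pl.LazyFrame({"x": range(3_000_000)})
# The scan is evaluated as 3 partitions, then concatenated.
result = ldf.collect(engine=engine)
```
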
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17441 --- python/cudf_polars/cudf_polars/callback.py | 14 +- python/cudf_polars/cudf_polars/dsl/ir.py | 17 +- .../cudf_polars/cudf_polars/dsl/translate.py | 1 + .../cudf_polars/experimental/base.py | 43 +++ .../cudf_polars/experimental/dispatch.py | 84 ++++++ .../cudf_polars/experimental/io.py | 49 ++++ .../cudf_polars/experimental/parallel.py | 245 +++++++++--------- .../cudf_polars/tests/dsl/test_traversal.py | 12 +- .../tests/experimental/test_dataframescan.py | 53 ++++ python/cudf_polars/tests/test_executors.py | 16 ++ 10 files changed, 411 insertions(+), 123 deletions(-) create mode 100644 python/cudf_polars/cudf_polars/experimental/base.py create mode 100644 python/cudf_polars/cudf_polars/experimental/dispatch.py create mode 100644 python/cudf_polars/cudf_polars/experimental/io.py create mode 100644 python/cudf_polars/tests/experimental/test_dataframescan.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 95527028aa9..29d3dc4ae79 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -217,7 +217,8 @@ def validate_config_options(config: dict) -> None: If the configuration contains unsupported options. """ if unsupported := ( - config.keys() - {"raise_on_fail", "parquet_options", "executor"} + config.keys() + - {"raise_on_fail", "parquet_options", "executor", "executor_options"} ): raise ValueError( f"Engine configuration contains unsupported settings: {unsupported}" @@ -226,6 +227,17 @@ def validate_config_options(config: dict) -> None: config.get("parquet_options", {}) ) + # Validate executor_options + executor = config.get("executor", "pylibcudf") + if executor == "dask-experimental": + unsupported = config.get("executor_options", {}).keys() - { + "max_rows_per_partition" + } + else: + unsupported = config.get("executor_options", {}).keys() + if unsupported: + raise ValueError(f"Unsupported executor_options for {executor}: {unsupported}") + def execute_with_cudf(nt: NodeTraverser, *, config: GPUEngine) -> None: """ diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index a28b4cf25b2..1faa778ccf6 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -688,14 +688,16 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ - __slots__ = ("df", "predicate", "projection") - _non_child = ("schema", "df", "projection", "predicate") + __slots__ = ("config_options", "df", "predicate", "projection") + _non_child = ("schema", "df", "projection", "predicate", "config_options") df: Any """Polars LazyFrame object.""" projection: tuple[str, ...] 
| None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" + config_options: dict[str, Any] + """GPU-specific configuration options""" def __init__( self, @@ -703,11 +705,13 @@ def __init__( df: Any, projection: Sequence[str] | None, predicate: expr.NamedExpr | None, + config_options: dict[str, Any], ): self.schema = schema self.df = df self.projection = tuple(projection) if projection is not None else None self.predicate = predicate + self.config_options = config_options self._non_child_args = (schema, df, self.projection, predicate) self.children = () @@ -719,7 +723,14 @@ def get_hashable(self) -> Hashable: not stable across runs, or repeat instances of the same equal dataframes. """ schema_hash = tuple(self.schema.items()) - return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + return ( + type(self), + schema_hash, + id(self.df), + self.projection, + self.predicate, + json.dumps(self.config_options), + ) @classmethod def do_evaluate( diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index b1e2de63ba6..37cf36dc4dd 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -263,6 +263,7 @@ def _( translate_named_expr(translator, n=node.selection) if node.selection is not None else None, + translator.config.config.copy(), ) diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py new file mode 100644 index 00000000000..8f660632df2 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +"""Multi-partition base classes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from cudf_polars.dsl.ir import Union + +if TYPE_CHECKING: + from collections.abc import Iterator, Sequence + + from cudf_polars.containers import DataFrame + from cudf_polars.dsl.nodebase import Node + + +class PartitionInfo: + """ + Partitioning information. + + This class only tracks the partition count (for now). + """ + + __slots__ = ("count",) + + def __init__(self, count: int): + self.count = count + + def keys(self, node: Node) -> Iterator[tuple[str, int]]: + """Return the partitioned keys for a given node.""" + name = get_key_name(node) + yield from ((name, i) for i in range(self.count)) + + +def get_key_name(node: Node) -> str: + """Generate the key name for a Node.""" + return f"{type(node).__name__.lower()}-{hash(node)}" + + +def _concat(dfs: Sequence[DataFrame]) -> DataFrame: + # Concatenate a sequence of DataFrames vertically + return Union.do_evaluate(None, *dfs) diff --git a/python/cudf_polars/cudf_polars/experimental/dispatch.py b/python/cudf_polars/cudf_polars/experimental/dispatch.py new file mode 100644 index 00000000000..79a52ff3cde --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/dispatch.py @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +"""Multi-partition dispatch functions.""" + +from __future__ import annotations + +from functools import singledispatch +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import TypeAlias + + from cudf_polars.dsl.ir import IR + from cudf_polars.experimental.base import PartitionInfo + from cudf_polars.typing import GenericTransformer + + +LowerIRTransformer: TypeAlias = ( + "GenericTransformer[IR, tuple[IR, MutableMapping[IR, PartitionInfo]]]" +) +"""Protocol for Lowering IR nodes.""" + + +@singledispatch +def lower_ir_node( + ir: IR, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + """ + Rewrite an IR node and extract partitioning information. + + Parameters + ---------- + ir + IR node to rewrite. + rec + Recursive LowerIRTransformer callable. + + Returns + ------- + new_ir, partition_info + The rewritten node, and a mapping from unique nodes in + the full IR graph to associated partitioning information. + + Notes + ----- + This function is used by `lower_ir_graph`. + + See Also + -------- + lower_ir_graph + """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + + +@singledispatch +def generate_ir_tasks( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + """ + Generate a task graph for evaluation of an IR node. + + Parameters + ---------- + ir + IR node to generate tasks for. + partition_info + Partitioning information, obtained from :func:`lower_ir_graph`. + + Returns + ------- + mapping + A (partial) dask task graph for the evaluation of an ir node. + + Notes + ----- + Task generation should only produce the tasks for the current node, + referring to child tasks by name. + + See Also + -------- + task_graph + """ + raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py new file mode 100644 index 00000000000..3a1fec36079 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +"""Multi-partition IO Logic.""" + +from __future__ import annotations + +import math +from typing import TYPE_CHECKING + +from cudf_polars.dsl.ir import DataFrameScan, Union +from cudf_polars.experimental.base import PartitionInfo +from cudf_polars.experimental.dispatch import lower_ir_node + +if TYPE_CHECKING: + from collections.abc import MutableMapping + + from cudf_polars.dsl.ir import IR + from cudf_polars.experimental.dispatch import LowerIRTransformer + + +@lower_ir_node.register(DataFrameScan) +def _( + ir: DataFrameScan, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + rows_per_partition = ir.config_options.get("executor_options", {}).get( + "max_rows_per_partition", 1_000_000 + ) + + nrows = max(ir.df.shape()[0], 1) + count = math.ceil(nrows / rows_per_partition) + + if count > 1: + length = math.ceil(nrows / count) + slices = [ + DataFrameScan( + ir.schema, + ir.df.slice(offset, length), + ir.projection, + ir.predicate, + ir.config_options, + ) + for offset in range(0, nrows, length) + ] + new_node = Union(ir.schema, None, *slices) + return new_node, {slice: PartitionInfo(count=1) for slice in slices} | { + new_node: PartitionInfo(count=count) + } + + return ir, {ir: PartitionInfo(count=1)} diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 6518dd60c7d..e5884f1c574 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -1,93 +1,46 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -"""Partitioned LogicalPlan nodes.""" +"""Multi-partition Dask execution.""" from __future__ import annotations +import itertools import operator -from functools import reduce, singledispatch +from functools import reduce from typing import TYPE_CHECKING, Any -from cudf_polars.dsl.ir import IR -from cudf_polars.dsl.traversal import traversal +import cudf_polars.experimental.io # noqa: F401 +from cudf_polars.dsl.ir import IR, Cache, Projection, Union +from cudf_polars.dsl.traversal import CachingVisitor, traversal +from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name +from cudf_polars.experimental.dispatch import ( + generate_ir_tasks, + lower_ir_node, +) if TYPE_CHECKING: from collections.abc import MutableMapping - from typing import TypeAlias from cudf_polars.containers import DataFrame - from cudf_polars.dsl.nodebase import Node - from cudf_polars.typing import GenericTransformer - - -class PartitionInfo: - """ - Partitioning information. - - This class only tracks the partition count (for now). - """ - - __slots__ = ("count",) - - def __init__(self, count: int): - self.count = count - - -LowerIRTransformer: TypeAlias = ( - "GenericTransformer[IR, MutableMapping[IR, PartitionInfo]]" -) -"""Protocol for Lowering IR nodes.""" - - -def get_key_name(node: Node) -> str: - """Generate the key name for a Node.""" - return f"{type(node).__name__.lower()}-{hash(node)}" - - -@singledispatch -def lower_ir_node( - ir: IR, rec: LowerIRTransformer -) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: - """ - Rewrite an IR node and extract partitioning information. - - Parameters - ---------- - ir - IR node to rewrite. - rec - Recursive LowerIRTransformer callable. 
- - Returns - ------- - new_ir, partition_info - The rewritten node, and a mapping from unique nodes in - the full IR graph to associated partitioning information. - - Notes - ----- - This function is used by `lower_ir_graph`. - - See Also - -------- - lower_ir_graph - """ - raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + from cudf_polars.experimental.dispatch import LowerIRTransformer @lower_ir_node.register(IR) def _(ir: IR, rec: LowerIRTransformer) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Default logic - Requires single partition + if len(ir.children) == 0: # Default leaf node has single partition - return ir, {ir: PartitionInfo(count=1)} + return ir, { + ir: PartitionInfo(count=1) + } # pragma: no cover; Missed by pylibcudf executor # Lower children - children, _partition_info = zip(*(rec(c) for c in ir.children), strict=False) + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True) partition_info = reduce(operator.or_, _partition_info) # Check that child partitioning is supported - count = max(partition_info[c].count for c in children) - if count > 1: + if any(partition_info[c].count > 1 for c in children): raise NotImplementedError( f"Class {type(ir)} does not support multiple partitions." ) # pragma: no cover @@ -123,41 +76,62 @@ def lower_ir_graph(ir: IR) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: -------- lower_ir_node """ - from cudf_polars.dsl.traversal import CachingVisitor - mapper = CachingVisitor(lower_ir_node) return mapper(ir) -@singledispatch -def generate_ir_tasks( +def task_graph( ir: IR, partition_info: MutableMapping[IR, PartitionInfo] -) -> MutableMapping[Any, Any]: +) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: """ - Generate a task graph for evaluation of an IR node. + Construct a task graph for evaluation of an IR graph. Parameters ---------- ir - IR node to generate tasks for. + Root of the graph to rewrite. partition_info - Partitioning information, obtained from :func:`lower_ir_graph`. + A mapping from all unique IR nodes to the + associated partitioning information. Returns ------- - mapping - A (partial) dask task graph for the evaluation of an ir node. + graph + A Dask-compatible task graph for the entire + IR graph with root `ir`. Notes ----- - Task generation should only produce the tasks for the current node, - referring to child tasks by name. + This function traverses the unique nodes of the + graph with root `ir`, and extracts the tasks for + each node with :func:`generate_ir_tasks`. See Also -------- - task_graph + generate_ir_tasks """ - raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover + graph = reduce( + operator.or_, + (generate_ir_tasks(node, partition_info) for node in traversal(ir)), + ) + + key_name = get_key_name(ir) + partition_count = partition_info[ir].count + if partition_count > 1: + graph[key_name] = (_concat, list(partition_info[ir].keys(ir))) + return graph, key_name + else: + return graph, (key_name, 0) + + +def evaluate_dask(ir: IR) -> DataFrame: + """Evaluate an IR graph with Dask.""" + from dask import get + + ir, partition_info = lower_ir_graph(ir) + + graph, key = task_graph(ir, partition_info) + return get(graph, key) @generate_ir_tasks.register(IR) @@ -189,48 +163,85 @@ def _( } -def task_graph( - ir: IR, partition_info: MutableMapping[IR, PartitionInfo] -) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: - """ - Construct a task graph for evaluation of an IR graph. 
+@lower_ir_node.register(Union) +def _( + ir: Union, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Lower children + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True) + partition_info = reduce(operator.or_, _partition_info) - Parameters - ---------- - ir - Root of the graph to rewrite. - partition_info - A mapping from all unique IR nodes to the - associated partitioning information. + # Check zlice + if ir.zlice is not None: # pragma: no cover + if any(p[c].count > 1 for p, c in zip(children, _partition_info, strict=False)): + raise NotImplementedError("zlice is not supported for multiple partitions.") + new_node = ir.reconstruct(children) + partition_info[new_node] = PartitionInfo(count=1) + return new_node, partition_info - Returns - ------- - graph - A Dask-compatible task graph for the entire - IR graph with root `ir`. + # Partition count is the sum of all child partitions + count = sum(partition_info[c].count for c in children) - Notes - ----- - This function traverses the unique nodes of the - graph with root `ir`, and extracts the tasks for - each node with :func:`generate_ir_tasks`. + # Return reconstructed node and partition-info dict + new_node = ir.reconstruct(children) + partition_info[new_node] = PartitionInfo(count=count) + return new_node, partition_info - See Also - -------- - generate_ir_tasks - """ - graph = reduce( - operator.or_, - (generate_ir_tasks(node, partition_info) for node in traversal(ir)), - ) - return graph, (get_key_name(ir), 0) +@generate_ir_tasks.register(Union) +def _( + ir: Union, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + key_name = get_key_name(ir) + partition = itertools.count() + return { + (key_name, next(partition)): child_key + for child in ir.children + for child_key in partition_info[child].keys(child) + } -def evaluate_dask(ir: IR) -> DataFrame: - """Evaluate an IR graph with Dask.""" - from dask import get - ir, partition_info = lower_ir_graph(ir) +def _lower_ir_pwise( + ir: IR, rec: LowerIRTransformer +) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: + # Lower a partition-wise (i.e. embarrassingly-parallel) IR node - graph, key = task_graph(ir, partition_info) - return get(graph, key) + # Lower children + children, _partition_info = zip(*(rec(c) for c in ir.children), strict=True) + partition_info = reduce(operator.or_, _partition_info) + counts = {partition_info[c].count for c in children} + + # Check that child partitioning is supported + if len(counts) > 1: + raise NotImplementedError( + f"Class {type(ir)} does not support unbalanced partitions." + ) # pragma: no cover + + # Return reconstructed node and partition-info dict + partition = PartitionInfo(count=max(counts)) + new_node = ir.reconstruct(children) + partition_info[new_node] = partition + return new_node, partition_info + + +lower_ir_node.register(Projection, _lower_ir_pwise) +lower_ir_node.register(Cache, _lower_ir_pwise) + + +def _generate_ir_tasks_pwise( + ir: IR, partition_info: MutableMapping[IR, PartitionInfo] +) -> MutableMapping[Any, Any]: + # Generate partition-wise (i.e. 
embarrassingly-parallel) tasks + child_names = [get_key_name(c) for c in ir.children] + return { + key: ( + ir.do_evaluate, + *ir._non_child_args, + *[(child_name, i) for child_name in child_names], + ) + for i, key in enumerate(partition_info[ir].keys(ir)) + } + + +generate_ir_tasks.register(Projection, _generate_ir_tasks_pwise) +generate_ir_tasks.register(Cache, _generate_ir_tasks_pwise) diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py index 2f4df9289f8..9755994c419 100644 --- a/python/cudf_polars/tests/dsl/test_traversal.py +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -116,7 +116,11 @@ def test_rewrite_ir_node(): def replace_df(node, rec): if isinstance(node, ir.DataFrameScan): return ir.DataFrameScan( - node.schema, new_df._df, node.projection, node.predicate + node.schema, + new_df._df, + node.projection, + node.predicate, + node.config_options, ) return reuse_if_unchanged(node, rec) @@ -144,7 +148,11 @@ def test_rewrite_scan_node(tmp_path): def replace_scan(node, rec): if isinstance(node, ir.Scan): return ir.DataFrameScan( - node.schema, right._df, node.with_columns, node.predicate + node.schema, + right._df, + node.with_columns, + node.predicate, + node.config_options, ) return reuse_if_unchanged(node, rec) diff --git a/python/cudf_polars/tests/experimental/test_dataframescan.py b/python/cudf_polars/tests/experimental/test_dataframescan.py new file mode 100644 index 00000000000..77c7bf0c503 --- /dev/null +++ b/python/cudf_polars/tests/experimental/test_dataframescan.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import Translator +from cudf_polars.experimental.parallel import lower_ir_graph +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture(scope="module") +def df(): + return pl.LazyFrame( + { + "x": range(30_000), + "y": ["cat", "dog", "fish"] * 10_000, + "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 6_000, + } + ) + + +@pytest.mark.parametrize("max_rows_per_partition", [1_000, 1_000_000]) +def test_parallel_dataframescan(df, max_rows_per_partition): + total_row_count = len(df.collect()) + engine = pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"max_rows_per_partition": max_rows_per_partition}, + ) + assert_gpu_result_equal(df, engine=engine) + + # Check partitioning + qir = Translator(df._ldf.visit(), engine).translate_ir() + ir, info = lower_ir_graph(qir) + count = info[ir].count + if max_rows_per_partition < total_row_count: + assert count > 1 + else: + assert count == 1 + + +def test_dataframescan_concat(df): + engine = pl.GPUEngine( + raise_on_fail=True, + executor="dask-experimental", + executor_options={"max_rows_per_partition": 1_000}, + ) + df2 = pl.concat([df, df]) + assert_gpu_result_equal(df2, engine=engine) diff --git a/python/cudf_polars/tests/test_executors.py b/python/cudf_polars/tests/test_executors.py index 3eaea2ec9ea..b8c0bb926ab 100644 --- a/python/cudf_polars/tests/test_executors.py +++ b/python/cudf_polars/tests/test_executors.py @@ -66,3 +66,19 @@ def test_unknown_executor(): match="ValueError: Unknown executor 'unknown-executor'", ): assert_gpu_result_equal(df, executor="unknown-executor") + + +@pytest.mark.parametrize("executor", [None, "pylibcudf", "dask-experimental"]) +def test_unknown_executor_options(executor): + df = pl.LazyFrame({}) 
+ + with pytest.raises( + pl.exceptions.ComputeError, + match="Unsupported executor_options", + ): + df.collect( + engine=pl.GPUEngine( + executor=executor, + executor_options={"foo": None}, + ) + ) From 4696bbf91ca37ab6960b606d1f7763487ee03ef6 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:58:35 -0500 Subject: [PATCH 10/78] Revert "Temporarily skip tests due to dask/distributed#8953" (#17492) Reverts rapidsai/cudf#17472 The new dask nightly has resolved https://github.com/dask/distributed/issues/8953 --- .../custreamz/tests/test_dataframes.py | 56 +++---------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index 6905044039c..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -216,13 +216,7 @@ def test_set_index(): assert_eq(b[0], df.set_index(df.y + 1)) -def test_binary_stream_operators(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_binary_stream_operators(stream): df = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) expected = df.x + df.y @@ -248,13 +242,7 @@ def test_index(stream): assert_eq(L[1], df.index + 5) -def test_pair_arithmetic(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_pair_arithmetic(stream): df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -267,13 +255,7 @@ def test_pair_arithmetic(request, stream): assert_eq(cudf.concat(L), (df.x + df.y) * 2) -def test_getitem(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_getitem(stream): df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) @@ -350,13 +332,7 @@ def test_repr_html(stream): assert "1" in html -def test_setitem(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_setitem(stream): df = cudf.DataFrame({"x": list(range(10)), "y": [1] * 10}) sdf = DataFrame(example=df.iloc[:0], stream=stream) @@ -380,13 +356,7 @@ def test_setitem(request, stream): assert_eq(L[-1], df.mean()) -def test_setitem_overwrites(request, stream): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream), - reason="https://github.com/dask/distributed/issues/8953", - ) - ) +def test_setitem_overwrites(stream): df = cudf.DataFrame({"x": list(range(10))}) sdf = DataFrame(example=df.iloc[:0], stream=stream) stream = sdf.stream @@ -443,14 +413,8 @@ def test_setitem_overwrites(request, stream): ], ) def test_rolling_count_aggregations( - request, op, window, m, pre_get, post_get, kwargs, stream + op, window, m, pre_get, post_get, kwargs, stream ): - request.applymarker( - pytest.mark.xfail( - isinstance(stream, DaskStream) and len(kwargs) == 0, - reason="https://github.com/dask/distributed/issues/8953", - ) - ) index = pd.DatetimeIndex( pd.date_range("2000-01-01", "2000-01-03", freq="1h") ) @@ -844,13 +808,7 @@ def 
test_reductions_with_start_state(stream):
     assert output2[0] == 360
 
 
-def test_rolling_aggs_with_start_state(request, stream):
-    request.applymarker(
-        pytest.mark.xfail(
-            isinstance(stream, DaskStream),
-            reason="https://github.com/dask/distributed/issues/8953",
-        )
-    )
+def test_rolling_aggs_with_start_state(stream):
     example = cudf.DataFrame({"name": [], "amount": []}, dtype="float64")
     sdf = DataFrame(stream, example=example)
     output0 = (

From d3e94d458ddeaced5ba34a825ab0af5275b73dbe Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Tue, 3 Dec 2024 10:03:29 -0800
Subject: [PATCH 11/78] Apply clang-tidy autofixes from new rules (#17431)

This PR contains all of clang-tidy's autofixes for the rules outlined in
https://github.com/rapidsai/cudf/issues/17410. Along the way, I simplified
the process of running the autofixes locally.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17431
---
 ci/cpp_linters.sh | 2 +-
 cpp/CMakeLists.txt | 14 +-
 cpp/src/bitmask/is_element_valid.cpp | 4 +-
 cpp/src/column/column_view.cpp | 97 +++---
 cpp/src/copying/copy.cpp | 12 +-
 cpp/src/copying/pack.cpp | 81 +++--
 cpp/src/datetime/timezone.cpp | 2 +-
 cpp/src/groupby/sort/aggregate.cpp | 96 +++---
 cpp/src/interop/dlpack.cpp | 4 +-
 cpp/src/interop/to_arrow_schema.cpp | 4 +-
 cpp/src/io/avro/avro.cpp | 12 +-
 cpp/src/io/comp/comp.cpp | 8 +-
 cpp/src/io/comp/nvcomp_adapter.cpp | 280 +++++++++---------
 cpp/src/io/comp/uncomp.cpp | 40 +--
 cpp/src/io/functions.cpp | 63 ++--
 cpp/src/io/json/parser_features.cpp | 139 ++++-----
 cpp/src/io/parquet/arrow_schema_writer.cpp | 2 +-
 .../io/parquet/compact_protocol_reader.cpp | 131 ++++----
 .../io/parquet/compact_protocol_writer.cpp | 2 +-
 cpp/src/io/parquet/predicate_pushdown.cpp | 5 +-
 cpp/src/io/parquet/reader_impl.cpp | 6 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp | 18 +-
 cpp/src/io/text/bgzip_utils.cpp | 2 +-
 cpp/src/io/utilities/base64_utilities.cpp | 6 +-
 cpp/src/io/utilities/data_sink.cpp | 4 +-
 cpp/src/io/utilities/datasource.cpp | 8 +-
 cpp/src/io/utilities/file_io_utilities.cpp | 41 +--
 cpp/src/jit/cache.cpp | 12 +-
 cpp/src/jit/parser.cpp | 56 ++--
 .../quantiles/tdigest/tdigest_column_view.cpp | 8 +-
 cpp/src/reductions/scan/scan.cpp | 3 +-
 cpp/src/reductions/segmented/reductions.cpp | 3 +
 .../detail/optimized_unbounded_window.cpp | 54 ++--
 cpp/src/strings/regex/regcomp.cpp | 14 +-
 cpp/src/strings/regex/regexec.cpp | 6 +-
 cpp/src/structs/utilities.cpp | 2 +-
 cpp/src/table/table_view.cpp | 33 ++-
 cpp/src/transform/transform.cpp | 7 +-
 cpp/src/utilities/prefetch.cpp | 4 +-
 cpp/src/utilities/stream_pool.cpp | 112 +++----

diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh
index 4d5b62ba280..9702b055512 100755
--- a/ci/cpp_linters.sh
+++ b/ci/cpp_linters.sh
@@ -27,7 +27,7 @@ source rapids-configure-sccache
 
 # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled.
iwyu_flag="" -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then +if [[ "${RAPIDS_BUILD_TYPE:-}" == "nightly" ]]; then iwyu_flag="-DCUDF_IWYU=ON" fi cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f25b46a52cd..12e6826f301 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -93,6 +93,7 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF) option(CUDF_IWYU "Enable IWYU during compilation" OFF) +option(CUDF_CLANG_TIDY_AUTOFIX "Enable clang-tidy autofixes" OFF) option( CUDF_KVIKIO_REMOTE_IO @@ -205,9 +206,16 @@ function(enable_static_checkers target) if(_LINT_CLANG_TIDY) # clang will complain about unused link libraries on the compile line unless we specify # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) + if(CUDF_CLANG_TIDY_AUTOFIX) + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY + "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments;--fix" + ) + else() + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() endif() if(_LINT_IWYU) # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp index 4806c7a94e8..7eb80c4249e 100644 --- a/cpp/src/bitmask/is_element_valid.cpp +++ b/cpp/src/bitmask/is_element_valid.cpp @@ -30,9 +30,9 @@ bool is_element_valid_sync(column_view const& col_view, CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index."); if (!col_view.nullable()) { return true; } - bitmask_type word; + bitmask_type word = 0; // null_mask() returns device ptr to bitmask without offset - size_type index = element_index + col_view.offset(); + size_type const index = element_index + col_view.offset(); CUDF_CUDA_TRY(cudaMemcpyAsync(&word, col_view.null_mask() + word_index(index), sizeof(bitmask_type), diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index e831aa9645d..ea940676f6a 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -41,7 +41,7 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k cudf::experimental::prefetch::detail::prefetch_noexcept( key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); } else if (col.type().id() == type_id::STRING) { - strings_column_view scv{col}; + strings_column_view const scv{col}; if (data_ptr == nullptr) { // Do not call chars_size if the data_ptr is nullptr. 
return; @@ -58,51 +58,6 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k } } -} // namespace - -column_view_base::column_view_base(data_type type, - size_type size, - void const* data, - bitmask_type const* null_mask, - size_type null_count, - size_type offset) - : _type{type}, - _size{size}, - _data{data}, - _null_mask{null_mask}, - _null_count{null_count}, - _offset{offset} -{ - CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); - - if (type.id() == type_id::EMPTY) { - _null_count = size; - CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); - CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); - } else if (is_compound(type)) { - if (type.id() != type_id::STRING) { - CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); - } - } else if (size > 0) { - CUDF_EXPECTS(nullptr != data, "Null data pointer."); - } - - CUDF_EXPECTS(offset >= 0, "Invalid offset."); - - if ((null_count > 0) and (type.id() != type_id::EMPTY)) { - CUDF_EXPECTS(nullptr != null_mask, "Invalid null mask for non-zero null count."); - } -} - -size_type column_view_base::null_count(size_type begin, size_type end) const -{ - CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds."); - return (null_count() == 0) - ? 0 - : cudf::detail::null_count( - null_mask(), offset() + begin, offset() + end, cudf::get_default_stream()); -} - // Struct to use custom hash combine and fold expression struct HashValue { std::size_t hash; @@ -133,8 +88,6 @@ std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false }); } -std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } - bool shallow_equivalent_impl(column_view const& lhs, column_view const& rhs, bool is_parent_empty = false) @@ -151,11 +104,59 @@ bool shallow_equivalent_impl(column_view const& lhs, return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); }); } + +} // namespace + +column_view_base::column_view_base(data_type type, + size_type size, + void const* data, + bitmask_type const* null_mask, + size_type null_count, + size_type offset) + : _type{type}, + _size{size}, + _data{data}, + _null_mask{null_mask}, + _null_count{null_count}, + _offset{offset} +{ + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); + + if (type.id() == type_id::EMPTY) { + _null_count = size; + CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); + CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); + } else if (is_compound(type)) { + if (type.id() != type_id::STRING) { + CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + } + } else if (size > 0) { + CUDF_EXPECTS(nullptr != data, "Null data pointer."); + } + + CUDF_EXPECTS(offset >= 0, "Invalid offset."); + + if ((null_count > 0) and (type.id() != type_id::EMPTY)) { + CUDF_EXPECTS(nullptr != null_mask, "Invalid null mask for non-zero null count."); + } +} + +size_type column_view_base::null_count(size_type begin, size_type end) const +{ + CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds."); + return (null_count() == 0) + ? 
0
+           : cudf::detail::null_count(
+               null_mask(), offset() + begin, offset() + end, cudf::get_default_stream());
+}
+
 bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs)
 {
   return shallow_equivalent_impl(lhs, rhs);
 }
 
+std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); }
+
 }  // namespace detail
 
 // Immutable view constructor
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index 5e2065ba844..89d8cc3f4aa 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -62,11 +62,12 @@ struct scalar_empty_like_functor_impl<cudf::list_view> {
   auto ls = static_cast<list_scalar const*>(&input);
 
   // TODO: add a manual constructor for lists_column_view.
-  column_view offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0};
+  column_view const offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0};
   std::vector<column_view> children;
   children.push_back(offsets);
   children.push_back(ls->view());
-  column_view lcv{cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children};
+  column_view const lcv{
+    cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children};
 
   return empty_like(lcv);
 }
@@ -81,8 +82,9 @@ struct scalar_empty_like_functor_impl<cudf::struct_view> {
   // TODO: add a manual constructor for structs_column_view
   // TODO: add cudf::get_element() support for structs
   cudf::table_view tbl = ss->view();
-  std::vector<column_view> children(tbl.begin(), tbl.end());
-  column_view scv{cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children};
+  std::vector<column_view> const children(tbl.begin(), tbl.end());
+  column_view const scv{
+    cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children};
 
   return empty_like(scv);
 }
@@ -120,7 +122,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
     is_fixed_width(input.type()), "Expects only fixed-width type column", cudf::data_type_error);
-  mask_state allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
+  mask_state const allocate_mask = should_allocate_mask(mask_alloc, input.nullable());
 
   return std::make_unique<column>(input.type(),
                                   size,
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index a001807c82b..42ea28f5961 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -48,20 +48,20 @@ struct serialized_column {
       null_count(_null_count),
       data_offset(_data_offset),
       null_mask_offset(_null_mask_offset),
-      num_children(_num_children),
-      pad(0)
+      num_children(_num_children)
+
   {
   }
 
   data_type type;
-  size_type size;
-  size_type null_count;
-  int64_t data_offset;       // offset into contiguous data buffer, or -1 if column data is null
-  int64_t null_mask_offset;  // offset into contiguous data buffer, or -1 if column data is null
-  size_type num_children;
+  size_type size{};
+  size_type null_count{};
+  int64_t data_offset{};       // offset into contiguous data buffer, or -1 if column data is null
+  int64_t null_mask_offset{};  // offset into contiguous data buffer, or -1 if column data is null
+  size_type num_children{};
   // Explicitly pad to avoid uninitialized padding bits, allowing `serialized_column` to be bit-wise
   // comparable
-  int pad;
+  int pad{};
 };
 
 /**
@@ -137,6 +137,34 @@ void build_column_metadata(metadata_builder& mb,
   });
 }
 
+table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
+{
+  // gpu data can be null if everything is empty but the metadata must always be valid
+  CUDF_EXPECTS(metadata != nullptr, "Encountered invalid packed column input");
+  auto serialized_columns = reinterpret_cast<serialized_column const*>(metadata);
+  uint8_t const* base_ptr = gpu_data;
+  // first entry is a stub where size == the total # of top level columns (see pack_metadata above)
+  auto const num_columns = serialized_columns[0].size;
+  size_t current_index   = 1;
+
+  std::function<std::vector<column_view>(size_type)> get_columns;
+  get_columns = [&serialized_columns, &current_index, base_ptr, &get_columns](size_t num_columns) {
+    std::vector<column_view> cols;
+    for (size_t i = 0; i < num_columns; i++) {
+      auto serial_column = serialized_columns[current_index];
+      current_index++;
+
+      std::vector<column_view> const children = get_columns(serial_column.num_children);
+
+      cols.emplace_back(deserialize_column(serial_column, children, base_ptr));
+    }
+
+    return cols;
+  };
+
+  return table_view{get_columns(num_columns)};
+}
+
 }  // anonymous namespace
 
 /**
@@ -198,37 +226,6 @@ class metadata_builder_impl {
   std::vector<serialized_column> metadata;
 };
 
-/**
- * @copydoc cudf::detail::unpack
- */
-table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
-{
-  // gpu data can be null if everything is empty but the metadata must always be valid
-  CUDF_EXPECTS(metadata != nullptr, "Encountered invalid packed column input");
-  auto serialized_columns = reinterpret_cast<serialized_column const*>(metadata);
-  uint8_t const* base_ptr = gpu_data;
-  // first entry is a stub where size == the total # of top level columns (see pack_metadata above)
-  auto const num_columns = serialized_columns[0].size;
-  size_t current_index   = 1;
-
-  std::function<std::vector<column_view>(size_type)> get_columns;
-  get_columns = [&serialized_columns, &current_index, base_ptr, &get_columns](size_t num_columns) {
-    std::vector<column_view> cols;
-    for (size_t i = 0; i < num_columns; i++) {
-      auto serial_column = serialized_columns[current_index];
-      current_index++;
-
-      std::vector<column_view> children = get_columns(serial_column.num_children);
-
-      cols.emplace_back(deserialize_column(serial_column, children, base_ptr));
-    }
-
-    return cols;
-  };
-
-  return table_view{get_columns(num_columns)};
-}
-
 metadata_builder::metadata_builder(size_type const num_root_columns)
   : impl(std::make_unique<metadata_builder_impl>(num_root_columns +
                                                  1 /*one more extra metadata entry as below*/))
@@ -280,9 +277,6 @@ std::vector<uint8_t> pack_metadata(table_view const& table,
   return detail::pack_metadata(table, contiguous_buffer, buffer_size, builder);
 }
 
-/**
- * @copydoc cudf::unpack
- */
 table_view unpack(packed_columns const& input)
 {
   CUDF_FUNC_RANGE();
@@ -292,9 +286,6 @@
     reinterpret_cast<uint8_t const*>(input.gpu_data->data()));
 }
 
-/**
- * @copydoc cudf::unpack(uint8_t const*, uint8_t const* )
- */
 table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index f786624680c..78e4198f60c 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -62,7 +62,7 @@ struct dst_transition_s {
 #pragma pack(pop)
 
 struct timezone_file {
-  timezone_file_header header;
+  timezone_file_header header{};
   bool is_header_from_64bit = false;
 
   std::vector<int64_t> transition_times;
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 3041e261945..7a8a1883ed4 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -45,6 +45,42 @@
 namespace cudf {
 namespace groupby {
 namespace detail {
+namespace {
+
+/**
+ * @brief Creates column views with only valid elements in both input column views
+ *
+ * @param column_0 The first column
+ * @param column_1 The second column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return tuple with new null mask (if null masks of input differ) and new column views
+ */
+auto column_view_with_common_nulls(column_view const& column_0,
+                                   column_view const& column_1,
+                                   rmm::cuda_stream_view stream)
+{
+  auto [new_nullmask, null_count] = cudf::bitmask_and(table_view{{column_0, column_1}}, stream);
+  if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); }
+  auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) {
+    return column_view(col.type(),
+                       col.size(),
+                       col.head(),
+                       static_cast<bitmask_type const*>(nullmask),
+                       null_count,
+                       col.offset(),
+                       std::vector<column_view>(col.child_begin(), col.child_end()));
+  };
+  auto new_column_0 = null_count == column_0.null_count()
+                        ? column_0
+                        : column_view_with_new_nullmask(column_0, new_nullmask.data(), null_count);
+  auto new_column_1 = null_count == column_1.null_count()
+                        ? column_1
+                        : column_view_with_new_nullmask(column_1, new_nullmask.data(), null_count);
+  return std::make_tuple(std::move(new_nullmask), new_column_0, new_column_1);
+}
+
+}  // namespace
+
 /**
  * @brief Functor to dispatch aggregation with
  *
@@ -170,13 +206,13 @@ void aggregate_result_functor::operator()(aggregation const& a
   } else {
     auto argmin_agg = make_argmin_aggregation();
     operator()(*argmin_agg);
-    column_view argmin_result = cache.get_result(values, *argmin_agg);
+    column_view const argmin_result = cache.get_result(values, *argmin_agg);
 
     // We make a view of ARGMIN result without a null mask and gather using
     // this mask. The values in data buffer of ARGMIN result corresponding
    // to null values was initialized to ARGMIN_SENTINEL which is an out of
    // bounds index value and causes the gathered value to be null.
-    column_view null_removed_map(
+    column_view const null_removed_map(
       data_type(type_to_id<size_type>()),
       argmin_result.size(),
       static_cast<void const*>(argmin_result.template data<size_type>()),
@@ -212,13 +248,13 @@ void aggregate_result_functor::operator()(aggregation const& a
   } else {
     auto argmax_agg = make_argmax_aggregation();
     operator()(*argmax_agg);
-    column_view argmax_result = cache.get_result(values, *argmax_agg);
+    column_view const argmax_result = cache.get_result(values, *argmax_agg);
 
     // We make a view of ARGMAX result without a null mask and gather using
     // this mask. The values in data buffer of ARGMAX result corresponding
    // to null values was initialized to ARGMAX_SENTINEL which is an out of
    // bounds index value and causes the gathered value to be null.
-    column_view null_removed_map(
+    column_view const null_removed_map(
       data_type(type_to_id<size_type>()),
       argmax_result.size(),
       static_cast<void const*>(argmax_result.template data<size_type>()),
@@ -248,8 +284,8 @@ void aggregate_result_functor::operator()(aggregation const&
   auto count_agg = make_count_aggregation();
   operator()(*sum_agg);
   operator()(*count_agg);
-  column_view sum_result   = cache.get_result(values, *sum_agg);
-  column_view count_result = cache.get_result(values, *count_agg);
+  column_view const sum_result   = cache.get_result(values, *sum_agg);
+  column_view const count_result = cache.get_result(values, *count_agg);
 
   // TODO (dm): Special case for timestamp. Add target_type_impl for it.
// Blocked until we support operator+ on timestamps @@ -291,8 +327,8 @@ void aggregate_result_functor::operator()(aggregation con auto count_agg = make_count_aggregation(); operator()(*mean_agg); operator()(*count_agg); - column_view mean_result = cache.get_result(values, *mean_agg); - column_view group_sizes = cache.get_result(values, *count_agg); + column_view const mean_result = cache.get_result(values, *mean_agg); + column_view const group_sizes = cache.get_result(values, *count_agg); auto result = detail::group_var(get_grouped_values(), mean_result, @@ -312,7 +348,7 @@ void aggregate_result_functor::operator()(aggregation const& a auto& std_agg = dynamic_cast(agg); auto var_agg = make_variance_aggregation(std_agg._ddof); operator()(*var_agg); - column_view var_result = cache.get_result(values, *var_agg); + column_view const var_result = cache.get_result(values, *var_agg); auto result = cudf::detail::unary_operation(var_result, unary_operator::SQRT, stream, mr); cache.add_result(values, agg, std::move(result)); @@ -325,8 +361,8 @@ void aggregate_result_functor::operator()(aggregation con auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view group_sizes = cache.get_result(values, *count_agg); - auto& quantile_agg = dynamic_cast(agg); + column_view const group_sizes = cache.get_result(values, *count_agg); + auto& quantile_agg = dynamic_cast(agg); auto result = detail::group_quantiles(get_sorted_values(), group_sizes, @@ -346,7 +382,7 @@ void aggregate_result_functor::operator()(aggregation const auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view group_sizes = cache.get_result(values, *count_agg); + column_view const group_sizes = cache.get_result(values, *count_agg); auto result = detail::group_quantiles(get_sorted_values(), group_sizes, @@ -391,7 +427,7 @@ void aggregate_result_functor::operator()(aggregation } else { CUDF_FAIL("Wrong count aggregation kind"); } - column_view group_sizes = cache.get_result(values, *count_agg); + column_view const group_sizes = cache.get_result(values, *count_agg); cache.add_result(values, agg, @@ -564,38 +600,6 @@ void aggregate_result_functor::operator()(aggregat get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); } -/** - * @brief Creates column views with only valid elements in both input column views - * - * @param column_0 The first column - * @param column_1 The second column - * @param stream CUDA stream used for device memory operations and kernel launches - * @return tuple with new null mask (if null masks of input differ) and new column views - */ -auto column_view_with_common_nulls(column_view const& column_0, - column_view const& column_1, - rmm::cuda_stream_view stream) -{ - auto [new_nullmask, null_count] = cudf::bitmask_and(table_view{{column_0, column_1}}, stream); - if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } - auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { - return column_view(col.type(), - col.size(), - col.head(), - static_cast(nullmask), - null_count, - col.offset(), - std::vector(col.child_begin(), col.child_end())); - }; - auto new_column_0 = null_count == column_0.null_count() - ? column_0 - : column_view_with_new_nullmask(column_0, new_nullmask.data(), null_count); - auto new_column_1 = null_count == column_1.null_count() - ? 
column_1 - : column_view_with_new_nullmask(column_1, new_nullmask.data(), null_count); - return std::make_tuple(std::move(new_nullmask), new_column_0, new_column_1); -} - /** * @brief Perform covariance between two child columns of non-nullable struct column. * @@ -734,7 +738,7 @@ void aggregate_result_functor::operator()(aggregation cons auto count_agg = make_count_aggregation(); operator()(*count_agg); - column_view valid_counts = cache.get_result(values, *count_agg); + column_view const valid_counts = cache.get_result(values, *count_agg); cache.add_result(values, agg, diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index b5cc4cbba0d..fee767255c2 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -115,8 +115,8 @@ DLDataType data_type_to_DLDataType(data_type type) // Context object to own memory allocated for DLManagedTensor struct dltensor_context { - int64_t shape[2]; // NOLINT - int64_t strides[2]; // NOLINT + int64_t shape[2]{}; // NOLINT + int64_t strides[2]{}; // NOLINT rmm::device_buffer buffer; static void deleter(DLManagedTensor* arg) diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 5afed772656..5dd8d77c261 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -44,7 +44,7 @@ struct dispatch_to_arrow_type { template ())> int operator()(column_view input_view, column_metadata const&, ArrowSchema* out) { - cudf::type_id id = input_view.type().id(); + cudf::type_id const id = input_view.type().id(); switch (id) { case cudf::type_id::TIMESTAMP_SECONDS: return ArrowSchemaSetTypeDateTime( @@ -186,7 +186,7 @@ int dispatch_to_arrow_type::operator()(column_view input, column_metadata const& metadata, ArrowSchema* out) { - cudf::dictionary_column_view dview{input}; + cudf::dictionary_column_view const dview{input}; NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id()))); NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out)); diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index b3fcca62314..c3a7f0f3053 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -200,7 +200,7 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) // encountered. If they don't, we have to assume the data is corrupted, // and thus, we terminate processing immediately. 
std::array const sync_marker = {get_raw(), get_raw()}; - bool valid_sync_markers = + bool const valid_sync_markers = ((sync_marker[0] == md->sync_marker[0]) && (sync_marker[1] == md->sync_marker[1])); if (!valid_sync_markers) { return false; } } @@ -218,10 +218,10 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) md->selected_data_size = m_cur - m_start; // Extract columns for (size_t i = 0; i < md->schema.size(); i++) { - type_kind_e kind = md->schema[i].kind; - logicaltype_kind_e logical_kind = md->schema[i].logical_kind; + type_kind_e const kind = md->schema[i].kind; + logicaltype_kind_e const logical_kind = md->schema[i].logical_kind; - bool is_supported_kind = ((kind > type_null) && (kind < type_record)); + bool const is_supported_kind = ((kind > type_null) && (kind < type_record)); if (is_supported_logical_type(logical_kind) || is_supported_kind) { column_desc col; int parent_idx = md->schema[i].parent_idx; @@ -302,7 +302,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - std::array depthbuf; + std::array depthbuf{}; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; @@ -341,7 +341,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& m_cur = m_base; m_end = m_base + json_str.length(); while (more_data()) { - int c = *m_cur++; + int const c = *m_cur++; switch (c) { case '"': str = get_str(); diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index b26a6292806..2dda2287e09 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -48,13 +48,13 @@ std::vector compress_gzip(host_span src) zs.avail_out = 0; zs.next_out = nullptr; - int windowbits = 15; - int gzip_encoding = 16; - int ret = deflateInit2( + constexpr int windowbits = 15; + constexpr int gzip_encoding = 16; + int ret = deflateInit2( &zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED, windowbits | gzip_encoding, 8, Z_DEFAULT_STRATEGY); CUDF_EXPECTS(ret == Z_OK, "GZIP DEFLATE compression initialization failed."); - uint32_t estcomplen = deflateBound(&zs, src.size()); + uint32_t const estcomplen = deflateBound(&zs, src.size()); dst.resize(estcomplen); zs.avail_out = estcomplen; zs.next_out = dst.data(); diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index c3187f73a95..b8bf8be6d2d 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -31,6 +31,7 @@ #include namespace cudf::io::nvcomp { +namespace { // Dispatcher for nvcompBatchedDecompressGetTempSizeEx template @@ -50,19 +51,6 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
default: CUDF_FAIL("Unsupported compression type"); } } -size_t batched_decompress_temp_size(compression_type compression, - size_t num_chunks, - size_t max_uncomp_chunk_size, - size_t max_total_uncomp_size) -{ - size_t temp_size = 0; - nvcompStatus_t nvcomp_status = batched_decompress_get_temp_size_ex( - compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); - - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for decompression"); - return temp_size; -} // Dispatcher for nvcompBatchedDecompressAsync template @@ -94,40 +82,6 @@ std::string compression_type_name(compression_type compression) return "compression_type(" + std::to_string(static_cast(compression)) + ")"; } -void batched_decompress(compression_type compression, - device_span const> inputs, - device_span const> outputs, - device_span results, - size_t max_uncomp_chunk_size, - size_t max_total_uncomp_size, - rmm::cuda_stream_view stream) -{ - auto const num_chunks = inputs.size(); - - // cuDF inflate inputs converted to nvcomp inputs - auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); - rmm::device_uvector actual_uncompressed_data_sizes(num_chunks, stream); - rmm::device_uvector nvcomp_statuses(num_chunks, stream); - // Temporary space required for decompression - auto const temp_size = batched_decompress_temp_size( - compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size); - rmm::device_buffer scratch(temp_size, stream); - auto const nvcomp_status = batched_decompress_async(compression, - nvcomp_args.input_data_ptrs.data(), - nvcomp_args.input_data_sizes.data(), - nvcomp_args.output_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_chunks, - scratch.data(), - scratch.size(), - nvcomp_args.output_data_ptrs.data(), - nvcomp_statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); - - update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); -} - size_t batched_compress_temp_size(compression_type compression, size_t batch_size, size_t max_uncompressed_chunk_bytes, @@ -172,52 +126,17 @@ size_t batched_compress_temp_size(compression_type compression, return temp_size; } -// Wrapper for nvcompBatchedCompressGetMaxOutputChunkSize -size_t compress_max_output_chunk_size(compression_type compression, - uint32_t max_uncompressed_chunk_bytes) -{ - auto const capped_uncomp_bytes = std::min( - compress_max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), - max_uncompressed_chunk_bytes); - - size_t max_comp_chunk_size = 0; - nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; - switch (compression) { - case compression_type::SNAPPY: - status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); - break; - case compression_type::DEFLATE: - status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); - break; - case compression_type::ZSTD: - status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); - break; - case compression_type::LZ4: - status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( - capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size); - break; - default: CUDF_FAIL("Unsupported compression type"); - } - - CUDF_EXPECTS(status 
== nvcompStatus_t::nvcompSuccess, - "failed to get max uncompressed chunk size"); - return max_comp_chunk_size; -} - // Dispatcher for nvcompBatchedCompressAsync -static void batched_compress_async(compression_type compression, - void const* const* device_uncompressed_ptrs, - size_t const* device_uncompressed_bytes, - size_t max_uncompressed_chunk_bytes, - size_t batch_size, - void* device_temp_ptr, - size_t temp_bytes, - void* const* device_compressed_ptrs, - size_t* device_compressed_bytes, - rmm::cuda_stream_view stream) +void batched_compress_async(compression_type compression, + void const* const* device_uncompressed_ptrs, + size_t const* device_uncompressed_bytes, + size_t max_uncompressed_chunk_bytes, + size_t batch_size, + void* device_temp_ptr, + size_t temp_bytes, + void* const* device_compressed_ptrs, + size_t* device_compressed_bytes, + rmm::cuda_stream_view stream) { nvcompStatus_t nvcomp_status = nvcompStatus_t::nvcompSuccess; switch (compression) { @@ -279,6 +198,137 @@ bool is_aligned(void const* ptr, std::uintptr_t alignment) noexcept return (reinterpret_cast(ptr) % alignment) == 0; } +std::optional is_compression_disabled_impl(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: { + if (not params.are_all_integrations_enabled) { + return "DEFLATE compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::LZ4: + case compression_type::SNAPPY: + case compression_type::ZSTD: + if (not params.are_stable_integrations_enabled) { + return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + default: return "Unsupported compression type"; + } +} + +std::optional is_decompression_disabled_impl(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: + case compression_type::GZIP: { + if (not params.are_all_integrations_enabled) { + return "DEFLATE decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::LZ4: + case compression_type::SNAPPY: + case compression_type::ZSTD: { + if (not params.are_stable_integrations_enabled) { + return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + } + return "Unsupported compression type"; +} + +} // namespace + +size_t batched_decompress_temp_size(compression_type compression, + size_t num_chunks, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size) +{ + size_t temp_size = 0; + nvcompStatus_t const nvcomp_status = batched_decompress_get_temp_size_ex( + compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, + "Unable to get scratch size for decompression"); + return temp_size; +} + +void batched_decompress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + size_t max_uncomp_chunk_size, + size_t max_total_uncomp_size, + rmm::cuda_stream_view stream) +{ + auto const num_chunks = inputs.size(); + + // cuDF inflate inputs converted to nvcomp inputs + auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream); + rmm::device_uvector actual_uncompressed_data_sizes(num_chunks, stream); + 
rmm::device_uvector nvcomp_statuses(num_chunks, stream); + // Temporary space required for decompression + auto const temp_size = batched_decompress_temp_size( + compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size); + rmm::device_buffer scratch(temp_size, stream); + auto const nvcomp_status = batched_decompress_async(compression, + nvcomp_args.input_data_ptrs.data(), + nvcomp_args.input_data_sizes.data(), + nvcomp_args.output_data_sizes.data(), + actual_uncompressed_data_sizes.data(), + num_chunks, + scratch.data(), + scratch.size(), + nvcomp_args.output_data_ptrs.data(), + nvcomp_statuses.data(), + stream.value()); + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression"); + + update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream); +} + +// Wrapper for nvcompBatchedCompressGetMaxOutputChunkSize +size_t compress_max_output_chunk_size(compression_type compression, + uint32_t max_uncompressed_chunk_bytes) +{ + auto const capped_uncomp_bytes = std::min( + compress_max_allowed_chunk_size(compression).value_or(max_uncompressed_chunk_bytes), + max_uncompressed_chunk_bytes); + + size_t max_comp_chunk_size = 0; + nvcompStatus_t status = nvcompStatus_t::nvcompSuccess; + switch (compression) { + case compression_type::SNAPPY: + status = nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); + break; + case compression_type::DEFLATE: + status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); + break; + case compression_type::ZSTD: + status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); + break; + case compression_type::LZ4: + status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( + capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size); + break; + default: CUDF_FAIL("Unsupported compression type"); + } + + CUDF_EXPECTS(status == nvcompStatus_t::nvcompSuccess, + "failed to get max uncompressed chunk size"); + return max_comp_chunk_size; +} + void batched_compress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -347,28 +397,6 @@ struct hash_feature_status_inputs { using feature_status_memo_map = std::unordered_map, hash_feature_status_inputs>; -std::optional is_compression_disabled_impl(compression_type compression, - feature_status_parameters params) -{ - switch (compression) { - case compression_type::DEFLATE: { - if (not params.are_all_integrations_enabled) { - return "DEFLATE compression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } - case compression_type::LZ4: - case compression_type::SNAPPY: - case compression_type::ZSTD: - if (not params.are_stable_integrations_enabled) { - return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - default: return "Unsupported compression type"; - } -} - std::optional is_compression_disabled(compression_type compression, feature_status_parameters params) { @@ -398,30 +426,6 @@ std::optional is_compression_disabled(compression_type compression, return reason; } -std::optional is_decompression_disabled_impl(compression_type compression, - feature_status_parameters params) -{ - switch (compression) { - case compression_type::DEFLATE: - case 
compression_type::GZIP: { - if (not params.are_all_integrations_enabled) { - return "DEFLATE decompression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } - case compression_type::LZ4: - case compression_type::SNAPPY: - case compression_type::ZSTD: { - if (not params.are_stable_integrations_enabled) { - return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable."; - } - return std::nullopt; - } - } - return "Unsupported compression type"; -} - std::optional is_decompression_disabled(compression_type compression, feature_status_parameters params) { diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index b3d43fa786a..4ab5174387e 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -127,7 +127,7 @@ struct zip_archive_s { bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) { - gz_file_header_s const* fhdr; + gz_file_header_s const* fhdr = nullptr; if (!dst) return false; memset(dst, 0, sizeof(gz_archive_s)); @@ -138,7 +138,7 @@ bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) raw += sizeof(gz_file_header_s); len -= sizeof(gz_file_header_s); if (fhdr->flags & GZIPHeaderFlag::fextra) { - uint32_t xlen; + uint32_t xlen = 0; if (len < 2) return false; xlen = raw[0] | (raw[1] << 8); @@ -151,8 +151,8 @@ bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) len -= xlen; } if (fhdr->flags & GZIPHeaderFlag::fname) { - size_t l = 0; - uint8_t c; + size_t l = 0; + uint8_t c = 0; do { if (l >= len) return false; c = raw[l]; @@ -163,8 +163,8 @@ bool ParseGZArchive(gz_archive_s* dst, uint8_t const* raw, size_t len) len -= l; } if (fhdr->flags & GZIPHeaderFlag::fcomment) { - size_t l = 0; - uint8_t c; + size_t l = 0; + uint8_t c = 0; do { if (l >= len) return false; c = raw[l]; @@ -219,7 +219,7 @@ bool OpenZipArchive(zip_archive_s* dst, uint8_t const* raw, size_t len) int cpu_inflate(uint8_t* uncomp_data, size_t* destLen, uint8_t const* comp_data, size_t comp_len) { - int zerr; + int zerr = 0; z_stream strm; memset(&strm, 0, sizeof(strm)); @@ -291,7 +291,7 @@ size_t decompress_zlib(host_span src, host_span dst) */ size_t decompress_gzip(host_span src, host_span dst) { - gz_archive_s gz; + gz_archive_s gz{}; auto const parse_succeeded = ParseGZArchive(&gz, src.data(), src.size()); CUDF_EXPECTS(parse_succeeded, "Failed to parse GZIP header"); return decompress_zlib({gz.comp_data, gz.comp_len}, dst); @@ -303,12 +303,12 @@ size_t decompress_gzip(host_span src, host_span dst) size_t decompress_snappy(host_span src, host_span dst) { CUDF_EXPECTS(not dst.empty() and src.size() >= 1, "invalid Snappy decompress inputs"); - uint32_t uncompressed_size, bytes_left, dst_pos; + uint32_t uncompressed_size = 0, bytes_left = 0, dst_pos = 0; auto cur = src.begin(); auto const end = src.end(); // Read uncompressed length (varint) { - uint32_t l = 0, c; + uint32_t l = 0, c = 0; uncompressed_size = 0; do { c = *cur++; @@ -328,7 +328,7 @@ size_t decompress_snappy(host_span src, host_span dst) if (blen & 3) { // Copy - uint32_t offset; + uint32_t offset = 0; if (blen & 2) { // xxxxxx1x: copy with 6-bit length, 2-byte or 4-byte offset if (cur + 2 > end) break; @@ -441,7 +441,7 @@ source_properties get_source_properties(compression_type compression, host_span< switch (compression) { case compression_type::AUTO: case compression_type::GZIP: { - gz_archive_s gz; + gz_archive_s gz{}; auto const parse_succeeded = ParseGZArchive(&gz, 
src.data(), src.size()); CUDF_EXPECTS(parse_succeeded, "Failed to parse GZIP header while fetching source properties"); compression = compression_type::GZIP; @@ -452,26 +452,28 @@ source_properties get_source_properties(compression_type compression, host_span< [[fallthrough]]; } case compression_type::ZIP: { - zip_archive_s za; + zip_archive_s za{}; if (OpenZipArchive(&za, raw, src.size())) { size_t cdfh_ofs = 0; for (int i = 0; i < za.eocd->num_entries; i++) { auto const* cdfh = reinterpret_cast( reinterpret_cast(za.cdfh) + cdfh_ofs); - int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; + int const cdfh_len = + sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len; if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) { // Bad cdir break; } // For now, only accept with non-zero file sizes and DEFLATE if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) { - size_t lfh_ofs = cdfh->hdr_ofs; - auto const* lfh = reinterpret_cast(raw + lfh_ofs); + size_t const lfh_ofs = cdfh->hdr_ofs; + auto const* lfh = reinterpret_cast(raw + lfh_ofs); if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 && lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) { if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) { - size_t file_start = lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len; - size_t file_end = file_start + lfh->comp_size; + size_t const file_start = + lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len; + size_t const file_end = file_start + lfh->comp_size; if (file_end <= src.size()) { // Pick the first valid file of non-zero size (only 1 file expected in archive) compression = compression_type::ZIP; @@ -510,7 +512,7 @@ source_properties get_source_properties(compression_type compression, host_span< auto const end = src.end(); // Read uncompressed length (varint) { - uint32_t l = 0, c; + uint32_t l = 0, c = 0; do { c = *cur++; auto const lo7 = c & 0x7f; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index ceaeb5d8f85..88423122e16 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -39,6 +39,38 @@ #include namespace cudf::io { +namespace { + +compression_type infer_compression_type(compression_type compression, source_info const& info) +{ + if (compression != compression_type::AUTO) { return compression; } + + if (info.type() != io_type::FILEPATH) { return compression_type::NONE; } + + auto filepath = info.filepaths()[0]; + + // Attempt to infer from the file extension + auto const pos = filepath.find_last_of('.'); + + if (pos == std::string::npos) { return {}; } + + auto str_tolower = [](auto const& begin, auto const& end) { + std::string out; + std::transform(begin, end, std::back_inserter(out), ::tolower); + return out; + }; + + auto const ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); + + if (ext == "gz") { return compression_type::GZIP; } + if (ext == "zip") { return compression_type::ZIP; } + if (ext == "bz2") { return compression_type::BZIP2; } + if (ext == "xz") { return compression_type::XZ; } + + return compression_type::NONE; +} + +} // namespace // Returns builder for csv_reader_options csv_reader_options_builder csv_reader_options::builder(source_info src) @@ -170,35 +202,6 @@ table_with_metadata read_avro(avro_reader_options const& options, rmm::device_as return avro::read_avro(std::move(datasources[0]), options, cudf::get_default_stream(), 
mr); } -compression_type infer_compression_type(compression_type compression, source_info const& info) -{ - if (compression != compression_type::AUTO) { return compression; } - - if (info.type() != io_type::FILEPATH) { return compression_type::NONE; } - - auto filepath = info.filepaths()[0]; - - // Attempt to infer from the file extension - auto const pos = filepath.find_last_of('.'); - - if (pos == std::string::npos) { return {}; } - - auto str_tolower = [](auto const& begin, auto const& end) { - std::string out; - std::transform(begin, end, std::back_inserter(out), ::tolower); - return out; - }; - - auto const ext = str_tolower(filepath.begin() + pos + 1, filepath.end()); - - if (ext == "gz") { return compression_type::GZIP; } - if (ext == "zip") { return compression_type::ZIP; } - if (ext == "bz2") { return compression_type::BZIP2; } - if (ext == "xz") { return compression_type::XZ; } - - return compression_type::NONE; -} - table_with_metadata read_json(json_reader_options options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -287,7 +290,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, CUDF_FAIL("Unsupported source type"); } - orc::metadata metadata(source.get(), stream); + orc::metadata const metadata(source.get(), stream); // Initialize statistics to return raw_orc_statistics result; diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index e795e8e09d8..ced7acb9cde 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -68,6 +68,77 @@ void json_reader_options::set_dtypes(schema_element types) } // namespace cudf::io namespace cudf::io::json::detail { +namespace { + +// example schema and its path. +// "a": int {"a", int} +// "a": [ int ] {"a", list}, {"element", int} +// "a": { "b": int} {"a", struct}, {"b", int} +// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} +// "a": [ null] {"a", list}, {"element", str} +// back() is root. +// front() is leaf. +/** + * @brief Get the path data type of a column by path if present in input schema + * + * @param path path of the json column + * @param root root of input schema element + * @return data type of the column if present, otherwise std::nullopt + */ +std::optional get_path_data_type( + host_span const> path, schema_element const& root) +{ + if (path.empty() || path.size() == 1) { + return root.type; + } else { + if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { + auto const child_name = path.first(path.size() - 1).back().first; + auto const child_schema_it = root.child_types.find(child_name); + return (child_schema_it != std::end(root.child_types)) + ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) + : std::optional{}; + } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { + auto const child_schema_it = root.child_types.find(list_child_name); + return (child_schema_it != std::end(root.child_types)) + ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) + : std::optional{}; + } + return std::optional{}; + } +} + +std::optional child_schema_element(std::string const& col_name, + cudf::io::json_reader_options const& options) +{ + return std::visit( + cudf::detail::visitor_overload{ + [col_name](std::vector const& user_dtypes) -> std::optional { + auto column_index = atol(col_name.data()); + return (static_cast(column_index) < user_dtypes.size()) + ? 
std::optional{{user_dtypes[column_index]}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; + }, + [col_name](schema_element const& user_dtypes) -> std::optional { + return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) + ? user_dtypes.child_types.find(col_name)->second + : std::optional{}; + }}, + options.get_dtypes()); +} + +} // namespace /// Created an empty column of the specified schema struct empty_column_functor { @@ -211,74 +282,6 @@ column_name_info make_column_name_info(schema_element const& schema, std::string return info; } -std::optional child_schema_element(std::string const& col_name, - cudf::io::json_reader_options const& options) -{ - return std::visit( - cudf::detail::visitor_overload{ - [col_name](std::vector const& user_dtypes) -> std::optional { - auto column_index = atol(col_name.data()); - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }, - [col_name](schema_element const& user_dtypes) -> std::optional { - return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) - ? user_dtypes.child_types.find(col_name)->second - : std::optional{}; - }}, - options.get_dtypes()); -} - -// example schema and its path. -// "a": int {"a", int} -// "a": [ int ] {"a", list}, {"element", int} -// "a": { "b": int} {"a", struct}, {"b", int} -// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} -// "a": [ null] {"a", list}, {"element", str} -// back() is root. -// front() is leaf. -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the json column - * @param root root of input schema element - * @return data type of the column if present, otherwise std::nullopt - */ -std::optional get_path_data_type( - host_span const> path, schema_element const& root) -{ - if (path.empty() || path.size() == 1) { - return root.type; - } else { - if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { - auto const child_name = path.first(path.size() - 1).back().first; - auto const child_schema_it = root.child_types.find(child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { - auto const child_schema_it = root.child_types.find(list_child_name); - return (child_schema_it != std::end(root.child_types)) - ? 
get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } - return std::optional{}; - } -} - std::optional get_path_data_type( host_span const> path, cudf::io::json_reader_options const& options) diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp index d15435b2553..a4536ac6a3b 100644 --- a/cpp/src/io/parquet/arrow_schema_writer.cpp +++ b/cpp/src/io/parquet/arrow_schema_writer.cpp @@ -336,7 +336,7 @@ std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector con { // Lambda function to convert int32 to a string of uint8 bytes auto const convert_int32_to_byte_string = [&](int32_t const value) { - std::array buffer; + std::array buffer{}; std::memcpy(buffer.data(), &value, sizeof(int32_t)); return std::string(reinterpret_cast(buffer.data()), buffer.size()); }; diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index d276e946a51..f1ecf66c29f 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -27,23 +27,7 @@ #include namespace cudf::io::parquet::detail { - -/** - * @brief Base class for parquet field functors. - * - * Holds the field value used by all of the specialized functors. - */ -class parquet_field { - private: - int _field_val; - - protected: - parquet_field(int f) : _field_val(f) {} - - public: - virtual ~parquet_field() = default; - [[nodiscard]] int field() const { return _field_val; } -}; +namespace { std::string field_type_string(FieldType type) { @@ -79,6 +63,72 @@ void assert_bool_field_type(int type) "expected bool field, got " + field_type_string(field_type) + " field instead"); } +template +struct FunctionSwitchImpl { + template + static inline void run(CompactProtocolReader* cpr, + int field_type, + int const& field, + std::tuple& ops) + { + if (field == std::get(ops).field()) { + std::get(ops)(cpr, field_type); + } else { + FunctionSwitchImpl::run(cpr, field_type, field, ops); + } + } +}; + +template <> +struct FunctionSwitchImpl<0> { + template + static inline void run(CompactProtocolReader* cpr, + int field_type, + int const& field, + std::tuple& ops) + { + if (field == std::get<0>(ops).field()) { + std::get<0>(ops)(cpr, field_type); + } else { + cpr->skip_struct_field(field_type); + } + } +}; + +template +inline void function_builder(CompactProtocolReader* cpr, std::tuple& op) +{ + constexpr int index = std::tuple_size>::value - 1; + int field = 0; + while (true) { + int const current_byte = cpr->getb(); + if (!current_byte) { break; } + int const field_delta = current_byte >> 4; + int const field_type = current_byte & 0xf; + field = field_delta ? field + field_delta : cpr->get_i16(); + FunctionSwitchImpl::run(cpr, field_type, field, op); + } +} + +} // namespace + +/** + * @brief Base class for parquet field functors. + * + * Holds the field value used by all of the specialized functors. + */ +class parquet_field { + private: + int _field_val; + + protected: + parquet_field(int f) : _field_val(f) {} + + public: + virtual ~parquet_field() = default; + [[nodiscard]] int field() const { return _field_val; } +}; + /** * @brief Abstract base class for list functors. 
*/ @@ -494,53 +544,6 @@ void CompactProtocolReader::skip_struct_field(int t, int depth) } } -template -struct FunctionSwitchImpl { - template - static inline void run(CompactProtocolReader* cpr, - int field_type, - int const& field, - std::tuple& ops) - { - if (field == std::get(ops).field()) { - std::get(ops)(cpr, field_type); - } else { - FunctionSwitchImpl::run(cpr, field_type, field, ops); - } - } -}; - -template <> -struct FunctionSwitchImpl<0> { - template - static inline void run(CompactProtocolReader* cpr, - int field_type, - int const& field, - std::tuple& ops) - { - if (field == std::get<0>(ops).field()) { - std::get<0>(ops)(cpr, field_type); - } else { - cpr->skip_struct_field(field_type); - } - } -}; - -template -inline void function_builder(CompactProtocolReader* cpr, std::tuple& op) -{ - constexpr int index = std::tuple_size>::value - 1; - int field = 0; - while (true) { - int const current_byte = cpr->getb(); - if (!current_byte) { break; } - int const field_delta = current_byte >> 4; - int const field_type = current_byte & 0xf; - field = field_delta ? field + field_delta : cpr->get_i16(); - FunctionSwitchImpl::run(cpr, field_type, field, op); - } -} - void CompactProtocolReader::read(FileMetaData* f) { using optional_list_column_order = diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 14c99f728de..bf2db013118 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -291,7 +291,7 @@ uint32_t CompactProtocolFieldWriter::put_uint(uint64_t v) uint32_t CompactProtocolFieldWriter::put_int(int64_t v) { - int64_t s = (v < 0); + int64_t const s = (v < 0); return put_uint(((v ^ -s) << 1) + s); } diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index cd3dcd2bce4..b0cbabf1c12 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -426,7 +426,7 @@ std::optional>> aggregate_reader_metadata::fi // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows. std::vector> columns; - stats_caster stats_col{total_row_groups, per_file_metadata, input_row_group_indices}; + stats_caster const stats_col{total_row_groups, per_file_metadata, input_row_group_indices}; for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; auto const& dtype = output_dtypes[col_idx]; @@ -447,7 +447,8 @@ std::optional>> aggregate_reader_metadata::fi auto stats_table = cudf::table(std::move(columns)); // Converts AST to StatsAST with reference to min, max columns in above `stats_table`. 
- stats_expression_converter stats_expr{filter.get(), static_cast(output_dtypes.size())}; + stats_expression_converter const stats_expr{filter.get(), + static_cast(output_dtypes.size())}; auto stats_ast = stats_expr.get_stats_expr(); auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr); auto predicate = predicate_col->view(); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index d74ae83b635..c48ff896e33 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -148,7 +148,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_EXPECTS(input_col.schema_idx == pass.chunks[c].src_col_schema, "Column/page schema index mismatch"); - size_t max_depth = _metadata->get_output_nesting_depth(pass.chunks[c].src_col_schema); + size_t const max_depth = _metadata->get_output_nesting_depth(pass.chunks[c].src_col_schema); chunk_offsets.push_back(chunk_off); // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers @@ -203,7 +203,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num auto& out_buf = (*cols)[input_col.nesting[idx]]; cols = &out_buf.children; - int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; + int const owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; if (owning_schema == 0 || owning_schema == input_col.schema_idx) { valids[idx] = out_buf.null_mask(); data[idx] = out_buf.data(); @@ -435,7 +435,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num ColumnChunkDesc* col = &pass.chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + int const index = pi->nesting_decode - page_nesting_decode.device_ptr(); PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index a6562d33de2..bfd0cc992cf 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -833,7 +833,7 @@ std::optional aggregate_reader_metadata::decode_ipc_message( // Lambda function to read and return 4 bytes as int32_t from the ipc message buffer and update // buffer pointer and size auto read_int32_from_ipc_message = [&]() { - int32_t bytes; + int32_t bytes = 0; std::memcpy(&bytes, message_buf, sizeof(int32_t)); // Offset the message buf and reduce remaining size message_buf += sizeof(int32_t); @@ -991,7 +991,7 @@ std::string aggregate_reader_metadata::get_pandas_index() const // One-liner regex: // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] // Documented below. 
- std::regex index_columns_expr{ + std::regex const index_columns_expr{ R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace R"(()" // Open first capturing group R"((?:")" // Open non-capturing group match opening quote @@ -1013,12 +1013,12 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con std::vector names; auto str = get_pandas_index(); if (str.length() != 0) { - std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; + std::regex const index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; std::smatch sm; while (std::regex_search(str, sm, index_name_expr)) { if (sm.size() == 2) { // 2 = whole match, first item if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { - std::regex esc_quote{R"(\\")"}; + std::regex const esc_quote{R"(\\")"}; names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); } } @@ -1362,8 +1362,8 @@ aggregate_reader_metadata::select_columns( std::vector all_paths; std::function add_path = [&](std::string path_till_now, int schema_idx) { - auto const& schema_elem = get_schema(schema_idx); - std::string curr_path = path_till_now + schema_elem.name; + auto const& schema_elem = get_schema(schema_idx); + std::string const curr_path = path_till_now + schema_elem.name; all_paths.push_back({curr_path, schema_idx}); for (auto const& child_idx : schema_elem.children_idx) { add_path(curr_path + ".", child_idx); @@ -1376,7 +1376,7 @@ aggregate_reader_metadata::select_columns( // Find which of the selected paths are valid and get their schema index std::vector valid_selected_paths; // vector reference pushback (*use_names). If filter names passed. - std::vector const>> column_names{ + std::vector const>> const column_names{ *use_names, *filter_columns_names}; for (auto const& used_column_names : column_names) { for (auto const& selected_path : used_column_names.get()) { @@ -1408,7 +1408,7 @@ aggregate_reader_metadata::select_columns( std::vector selected_columns; if (include_index) { - std::vector index_names = get_pandas_index_names(); + std::vector const index_names = get_pandas_index_names(); std::transform(index_names.cbegin(), index_names.cend(), std::back_inserter(selected_columns), @@ -1457,7 +1457,7 @@ aggregate_reader_metadata::select_columns( } for (auto& col : selected_columns) { auto const& top_level_col_schema_idx = find_schema_child(root, col.name); - bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); + bool const valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); if (valid_column) { output_column_schemas.push_back(top_level_col_schema_idx); diff --git a/cpp/src/io/text/bgzip_utils.cpp b/cpp/src/io/text/bgzip_utils.cpp index cb412828e2d..77da2a44c7c 100644 --- a/cpp/src/io/text/bgzip_utils.cpp +++ b/cpp/src/io/text/bgzip_utils.cpp @@ -40,7 +40,7 @@ IntType read_int(char* data) template void write_int(std::ostream& output_stream, T val) { - std::array bytes; + std::array bytes{}; // we assume little-endian std::memcpy(&bytes[0], &val, sizeof(T)); output_stream.write(bytes.data(), bytes.size()); diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp index 856c29599a7..2a2a07afc8d 100644 --- a/cpp/src/io/utilities/base64_utilities.cpp +++ b/cpp/src/io/utilities/base64_utilities.cpp @@ -86,7 +86,7 @@ std::string base64_encode(std::string_view string_to_encode) num_iterations += (input_length % 3) ? 
1 : 0; std::string encoded; - size_t encoded_length = (input_length + 2) / 3 * 4; + size_t const encoded_length = (input_length + 2) / 3 * 4; encoded.reserve(encoded_length); // altered: modify base64 encoder loop using STL and Thrust. @@ -135,7 +135,7 @@ std::string base64_decode(std::string_view encoded_string) return std::string{}; } - size_t input_length = encoded_string.length(); + size_t const input_length = encoded_string.length(); std::string decoded; // altered: compute number of decoding iterations = floor (multiple of 4) @@ -147,7 +147,7 @@ std::string base64_decode(std::string_view encoded_string) // two bytes smaller, depending on the amount of trailing equal signs // in the encoded string. This approximation is needed to reserve // enough space in the string to be returned. - size_t approx_decoded_length = input_length / 4 * 3; + size_t const approx_decoded_length = input_length / 4 * 3; decoded.reserve(approx_decoded_length); // diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index b37a5ac900a..bed03869b34 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -86,7 +86,7 @@ class file_sink : public data_sink { { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - size_t offset = _bytes_written; + size_t const offset = _bytes_written; _bytes_written += size; if (!_kvikio_file.closed()) { @@ -170,7 +170,7 @@ class void_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } private: - size_t _bytes_written; + size_t _bytes_written{}; }; class user_sink_wrapper : public data_sink { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 10814eea458..62ef7c7a794 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -128,7 +128,8 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { rmm::device_buffer out_data(size, stream); - size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + size_t const read = + device_read(offset, size, reinterpret_cast(out_data.data()), stream); out_data.resize(read, stream); return datasource::buffer::create(std::move(out_data)); } @@ -444,7 +445,8 @@ class remote_file_source : public datasource { rmm::cuda_stream_view stream) override { rmm::device_buffer out_data(size, stream); - size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + size_t const read = + device_read(offset, size, reinterpret_cast(out_data.data()), stream); out_data.resize(read, stream); return datasource::buffer::create(std::move(out_data)); } @@ -471,7 +473,7 @@ class remote_file_source : public datasource { static bool is_supported_remote_url(std::string const& url) { // Regular expression to match "s3://" - static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + static std::regex const pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index f9750e4a505..9b17e7f6d55 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -33,6 +33,24 @@ namespace cudf { namespace io { namespace detail { +namespace { + +[[nodiscard]] int open_file_checked(std::string const& filepath, int flags, mode_t mode) +{ + auto const fd = open(filepath.c_str(), flags, mode); + if (fd == -1) { 
throw_on_file_open_failure(filepath, flags & O_CREAT); } + + return fd; +} + +[[nodiscard]] size_t get_file_size(int file_descriptor) +{ + struct stat st {}; + CUDF_EXPECTS(fstat(file_descriptor, &st) != -1, "Cannot query file size"); + return static_cast(st.st_size); +} + +} // namespace void force_init_cuda_context() { @@ -55,26 +73,11 @@ void force_init_cuda_context() CUDF_EXPECTS(std::filesystem::exists(path), "Cannot open file; it does not exist"); } - std::array error_msg_buffer; + std::array error_msg_buffer{}; auto const error_msg = strerror_r(err, error_msg_buffer.data(), 1024); CUDF_FAIL("Cannot open file; failed with errno: " + std::string{error_msg}); } -[[nodiscard]] int open_file_checked(std::string const& filepath, int flags, mode_t mode) -{ - auto const fd = open(filepath.c_str(), flags, mode); - if (fd == -1) { throw_on_file_open_failure(filepath, flags & O_CREAT); } - - return fd; -} - -[[nodiscard]] size_t get_file_size(int file_descriptor) -{ - struct stat st; - CUDF_EXPECTS(fstat(file_descriptor, &st) != -1, "Cannot query file size"); - return static_cast(st.st_size); -} - file_wrapper::file_wrapper(std::string const& filepath, int flags, mode_t mode) : fd(open_file_checked(filepath.c_str(), flags, mode)), _size{get_file_size(fd)} { @@ -125,7 +128,7 @@ class cufile_shim { void cufile_shim::modify_cufile_json() const { std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - static temp_directory tmp_config_dir{"cudf_cufile_config"}; + static temp_directory const tmp_config_dir{"cudf_cufile_config"}; // Modify the config file based on the policy auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); @@ -253,7 +256,7 @@ std::future cufile_input_impl::read_async(size_t offset, uint8_t* dst, rmm::cuda_stream_view stream) { - int device; + int device = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); auto read_slice = [device, gds_read = shim->read, file_handle = cf_file.handle()]( @@ -285,7 +288,7 @@ cufile_output_impl::cufile_output_impl(std::string const& filepath) std::future cufile_output_impl::write_async(void const* data, size_t offset, size_t size) { - int device; + int device = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); auto write_slice = [device, gds_write = shim->write, file_handle = cf_file.handle()]( diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp index 34a0bdce124..49f92756e43 100644 --- a/cpp/src/jit/cache.cpp +++ b/cpp/src/jit/cache.cpp @@ -22,6 +22,7 @@ namespace cudf { namespace jit { +namespace { // Get the directory in home to use for storing the cache std::filesystem::path get_user_home_cache_dir() @@ -72,13 +73,13 @@ std::filesystem::path get_cache_dir() // Make per device cache based on compute capability. This is to avoid multiple devices of // different compute capability to access the same kernel cache. - int device; - int cc_major; - int cc_minor; + int device = 0; + int cc_major = 0; + int cc_minor = 0; CUDF_CUDA_TRY(cudaGetDevice(&device)); CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device)); - int cc = cc_major * 10 + cc_minor; + int const cc = cc_major * 10 + cc_minor; kernel_cache_path /= std::to_string(cc); @@ -107,13 +108,14 @@ std::size_t try_parse_numeric_env_var(char const* const env_name, std::size_t de auto const value = std::getenv(env_name); return value != nullptr ? 
std::stoull(value) : default_val; } +} // namespace jitify2::ProgramCache<>& get_program_cache(jitify2::PreprocessedProgramData preprog) { static std::mutex caches_mutex{}; static std::unordered_map>> caches{}; - std::lock_guard caches_lock(caches_mutex); + std::lock_guard const caches_lock(caches_mutex); auto existing_cache = caches.find(preprog.name()); diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 519ac2d1a2e..c79ba4347bf 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -26,10 +26,37 @@ namespace cudf { namespace jit { -constexpr char percent_escape[] = "_"; // NOLINT +namespace { inline bool is_white(char const c) { return c == ' ' || c == '\n' || c == '\r' || c == '\t'; } +std::string remove_comments(std::string const& src) +{ + std::string output; + auto f = src.cbegin(); + while (f < src.cend()) { + auto l = std::find(f, src.cend(), '/'); + output.append(f, l); // push chunk instead of 1 char at a time + f = std::next(l); // skip over '/' + if (l < src.cend()) { + char const n = f < src.cend() ? *f : '?'; + if (n == '/') { // found "//" + f = std::find(f, src.cend(), '\n'); // skip to end of line + } else if (n == '*') { // found "/*" + auto term = std::string("*/"); // skip to end of next "*/" + f = std::search(std::next(f), src.cend(), term.cbegin(), term.cend()) + term.size(); + } else { + output.push_back('/'); // lone '/' should be pushed into output + } + } + } + return output; +} + +} // namespace + +constexpr char percent_escape[] = "_"; // NOLINT + std::string ptx_parser::escape_percent(std::string const& src) { // b/c we're transforming into inline ptx we aren't allowed to have register names starting with % @@ -106,7 +133,7 @@ std::string ptx_parser::parse_instruction(std::string const& src) std::string output; std::string suffix; - std::string original_code = "\n /** " + src + " */\n"; + std::string const original_code = "\n /** " + src + " */\n"; int piece_count = 0; @@ -316,33 +343,10 @@ std::string ptx_parser::parse_function_header(std::string const& src) return "\n__device__ __inline__ void " + function_name + "(" + input_arg + "){" + "\n"; } -std::string remove_comments(std::string const& src) -{ - std::string output; - auto f = src.cbegin(); - while (f < src.cend()) { - auto l = std::find(f, src.cend(), '/'); - output.append(f, l); // push chunk instead of 1 char at a time - f = std::next(l); // skip over '/' - if (l < src.cend()) { - char n = f < src.cend() ? 
*f : '?'; - if (n == '/') { // found "//" - f = std::find(f, src.cend(), '\n'); // skip to end of line - } else if (n == '*') { // found "/*" - auto term = std::string("*/"); // skip to end of next "*/" - f = std::search(std::next(f), src.cend(), term.cbegin(), term.cend()) + term.size(); - } else { - output.push_back('/'); // lone '/' should be pushed into output - } - } - } - return output; -} - // The interface std::string ptx_parser::parse() { - std::string no_comments = remove_comments(ptx); + std::string const no_comments = remove_comments(ptx); input_arg_list.clear(); auto const _func = std::string(".func"); // Go directly to the .func mark diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index 17844b6bb0a..933ef1bfcbd 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -29,14 +29,14 @@ tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(c CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); CUDF_EXPECTS(not col.nullable(), "Encountered nullable tdigest column"); - structs_column_view scv(col); + structs_column_view const scv(col); CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, "Encountered invalid tdigest column"); CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, "Encountered invalid tdigest column"); - lists_column_view lcv(scv.child(centroid_column_index)); + lists_column_view const lcv(scv.child(centroid_column_index)); auto data = lcv.child(); CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); CUDF_EXPECTS(data.num_children() == 2, @@ -52,14 +52,14 @@ lists_column_view tdigest_column_view::centroids() const { return child(centroid column_view tdigest_column_view::means() const { auto c = centroids(); - structs_column_view inner(c.parent().child(lists_column_view::child_column_index)); + structs_column_view const inner(c.parent().child(lists_column_view::child_column_index)); return inner.child(mean_column_index); } column_view tdigest_column_view::weights() const { auto c = centroids(); - structs_column_view inner(c.parent().child(lists_column_view::child_column_index)); + structs_column_view const inner(c.parent().child(lists_column_view::child_column_index)); return inner.child(weight_column_index); } diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index b91ae19b51a..7afd3ba3c00 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -20,8 +20,8 @@ #include namespace cudf { - namespace detail { +namespace { std::unique_ptr scan(column_view const& input, scan_aggregation const& agg, scan_type inclusive, @@ -50,6 +50,7 @@ std::unique_ptr scan(column_view const& input, : detail::scan_inclusive(input, agg, null_handling, stream, mr); } +} // namespace } // namespace detail std::unique_ptr scan(column_view const& input, diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index c4f6c135dde..dedfc4b0734 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -26,6 +26,8 @@ namespace cudf { namespace reduction { namespace detail { +namespace { + struct segmented_reduce_dispatch_functor { column_view const& col; device_span offsets; @@ -126,6 +128,7 @@ std::unique_ptr 
segmented_reduce(column_view const& segmented_values, segmented_values, offsets, output_dtype, null_handling, init, stream, mr}, agg); } +} // namespace } // namespace detail } // namespace reduction diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp index 7cad31c0658..9c22c27144d 100644 --- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp +++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp @@ -25,32 +25,7 @@ #include namespace cudf::detail { - -bool can_optimize_unbounded_window(bool unbounded_preceding, - bool unbounded_following, - size_type min_periods, - rolling_aggregation const& agg) -{ - auto is_supported = [](auto const& agg) { - switch (agg.kind) { - case cudf::aggregation::Kind::COUNT_ALL: [[fallthrough]]; - case cudf::aggregation::Kind::COUNT_VALID: [[fallthrough]]; - case cudf::aggregation::Kind::SUM: [[fallthrough]]; - case cudf::aggregation::Kind::MIN: [[fallthrough]]; - case cudf::aggregation::Kind::MAX: return true; - default: - // COLLECT_LIST and COLLECT_SET can be added at a later date. - // Other aggregations do not fit into the [UNBOUNDED, UNBOUNDED] - // category. For instance: - // 1. Ranking functions (ROW_NUMBER, RANK, DENSE_RANK, PERCENT_RANK) - // use [UNBOUNDED PRECEDING, CURRENT ROW]. - // 2. LEAD/LAG are defined on finite row boundaries. - return false; - } - }; - - return unbounded_preceding && unbounded_following && (min_periods == 1) && is_supported(agg); -} +namespace { /// Converts rolling_aggregation to corresponding reduce/groupby_aggregation. template @@ -145,6 +120,33 @@ std::unique_ptr reduction_based_rolling_window(column_view const& input, // Blow up results into separate column. return cudf::make_column_from_scalar(*reduce_results, input.size(), stream, mr); } +} // namespace + +bool can_optimize_unbounded_window(bool unbounded_preceding, + bool unbounded_following, + size_type min_periods, + rolling_aggregation const& agg) +{ + auto is_supported = [](auto const& agg) { + switch (agg.kind) { + case cudf::aggregation::Kind::COUNT_ALL: [[fallthrough]]; + case cudf::aggregation::Kind::COUNT_VALID: [[fallthrough]]; + case cudf::aggregation::Kind::SUM: [[fallthrough]]; + case cudf::aggregation::Kind::MIN: [[fallthrough]]; + case cudf::aggregation::Kind::MAX: return true; + default: + // COLLECT_LIST and COLLECT_SET can be added at a later date. + // Other aggregations do not fit into the [UNBOUNDED, UNBOUNDED] + // category. For instance: + // 1. Ranking functions (ROW_NUMBER, RANK, DENSE_RANK, PERCENT_RANK) + // use [UNBOUNDED PRECEDING, CURRENT ROW]. + // 2. LEAD/LAG are defined on finite row boundaries. 
+ return false; + } + }; + + return unbounded_preceding && unbounded_following && (min_periods == 1) && is_supported(agg); +} std::unique_ptr optimized_unbounded_window(table_view const& group_keys, column_view const& input, diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index b923a301f84..b7b1338dd89 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -80,8 +80,8 @@ std::array const escapable_chars{ */ std::vector string_to_char32_vector(std::string_view pattern) { - auto size = static_cast(pattern.size()); - size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) { + auto size = static_cast(pattern.size()); + size_type const count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) { return is_begin_utf8_char(static_cast(ch)); }); std::vector result(count + 1); @@ -89,7 +89,7 @@ std::vector string_to_char32_vector(std::string_view pattern) char const* input_ptr = pattern.data(); for (size_type idx = 0; idx < size; ++idx) { char_utf8 output_character = 0; - size_type ch_width = to_char_utf8(input_ptr, output_character); + size_type const ch_width = to_char_utf8(input_ptr, output_character); input_ptr += ch_width; idx += ch_width - 1; *output_ptr++ = output_character; @@ -102,7 +102,7 @@ std::vector string_to_char32_vector(std::string_view pattern) int32_t reprog::add_inst(int32_t t) { - reinst inst; + reinst inst{}; inst.type = t; inst.u2.left_id = 0; inst.u1.right_id = 0; @@ -968,7 +968,7 @@ class regex_compiler { } if (token != RBRA) { push_operator(token, subid); } - static std::vector tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; + static std::vector const tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; _last_was_and = std::any_of(tokens.cbegin(), tokens.cend(), [token](auto t) { return t == token; }); } @@ -1046,7 +1046,7 @@ reprog reprog::create_from(std::string_view pattern, { reprog rtn; auto pattern32 = string_to_char32_vector(pattern); - regex_compiler compiler(pattern32.data(), flags, capture, rtn); + regex_compiler const compiler(pattern32.data(), flags, capture, rtn); // for debugging, it can be helpful to call rtn.print(flags) here to dump // out the instructions that have been created from the given pattern return rtn; @@ -1114,7 +1114,7 @@ void reprog::build_start_ids() std::stack ids; ids.push(_startinst_id); while (!ids.empty()) { - int id = ids.top(); + int const id = ids.top(); ids.pop(); reinst const& inst = _insts[id]; if (inst.type == OR) { diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp index 60ad714dfec..3d11b641b3f 100644 --- a/cpp/src/strings/regex/regexec.cpp +++ b/cpp/src/strings/regex/regexec.cpp @@ -99,9 +99,9 @@ std::unique_ptr> reprog_devic // place each class and append the variable length data for (int32_t idx = 0; idx < classes_count; ++idx) { auto const& h_class = h_prog.class_at(idx); - reclass_device d_class{h_class.builtins, - static_cast(h_class.literals.size()), - reinterpret_cast(d_end)}; + reclass_device const d_class{h_class.builtins, + static_cast(h_class.literals.size()), + reinterpret_cast(d_end)}; *classes++ = d_class; memcpy(h_end, h_class.literals.data(), h_class.literals.size() * sizeof(reclass_range)); h_end += h_class.literals.size() * sizeof(reclass_range); diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 4012ee3d21c..22328726c0e 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -47,7 +47,7 @@ 
std::vector> extract_ordered_struct_children( std::vector children; children.reserve(num_cols); for (size_type col_index = 0; col_index < num_cols; col_index++) { - structs_column_view scv(struct_cols[col_index]); + structs_column_view const scv(struct_cols[col_index]); // all inputs must have the same # of children and they must all be of the // same type. diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp index 659beb749af..ee7136d8f5e 100644 --- a/cpp/src/table/table_view.cpp +++ b/cpp/src/table/table_view.cpp @@ -25,6 +25,21 @@ namespace cudf { namespace detail { +namespace { + +template +auto concatenate_column_views(std::vector const& views) +{ + using ColumnView = typename ViewType::ColumnView; + std::vector concat_cols; + for (auto& view : views) { + concat_cols.insert(concat_cols.end(), view.begin(), view.end()); + } + return concat_cols; +} + +} // namespace + template table_view_base::table_view_base(std::vector const& cols) : _columns{cols} { @@ -38,17 +53,6 @@ table_view_base::table_view_base(std::vector const& cols } } -template -auto concatenate_column_views(std::vector const& views) -{ - using ColumnView = typename ViewType::ColumnView; - std::vector concat_cols; - for (auto& view : views) { - concat_cols.insert(concat_cols.end(), view.begin(), view.end()); - } - return concat_cols; -} - // Explicit instantiation for a table of `column_view`s template class table_view_base; @@ -65,17 +69,16 @@ table_view table_view::select(std::vector const& column_indices) cons // Convert mutable view to immutable view mutable_table_view::operator table_view() { - std::vector cols{begin(), end()}; - return table_view{cols}; + return table_view{std::vector{begin(), end()}}; } table_view::table_view(std::vector const& views) - : table_view{concatenate_column_views(views)} + : table_view{detail::concatenate_column_views(views)} { } mutable_table_view::mutable_table_view(std::vector const& views) - : mutable_table_view{concatenate_column_views(views)} + : mutable_table_view{detail::concatenate_column_views(views)} { } diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index b919ac16956..4a383bfba47 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -33,7 +33,7 @@ namespace cudf { namespace transformation { namespace jit { - +namespace { void unary_operation(mutable_column_view output, column_view input, std::string const& udf, @@ -41,7 +41,7 @@ void unary_operation(mutable_column_view output, bool is_ptx, rmm::cuda_stream_view stream) { - std::string kernel_name = + std::string const kernel_name = jitify2::reflection::Template("cudf::transformation::jit::kernel") // .instantiate(cudf::type_to_name(output.type()), // list of template arguments cudf::type_to_name(input.type())); @@ -62,6 +62,7 @@ void unary_operation(mutable_column_view output, cudf::jit::get_data_ptr(output), cudf::jit::get_data_ptr(input)); } +} // namespace } // namespace jit } // namespace transformation @@ -81,7 +82,7 @@ std::unique_ptr transform(column_view const& input, if (input.is_empty()) { return output; } - mutable_column_view output_view = *output; + mutable_column_view const output_view = *output; // transform transformation::jit::unary_operation(output_view, input, unary_udf, output_type, is_ptx, stream); diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 000526723c4..6c9f677afb3 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -33,14 +33,14 @@ prefetch_config& 
prefetch_config::instance() bool prefetch_config::get(std::string_view key) { - std::shared_lock lock(config_mtx); + std::shared_lock const lock(config_mtx); auto const it = config_values.find(key.data()); return it == config_values.end() ? false : it->second; // default to not prefetching } void prefetch_config::set(std::string_view key, bool value) { - std::lock_guard lock(config_mtx); + std::lock_guard const lock(config_mtx); config_values[key.data()] = value; } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 7069b59be26..9d1bebd1937 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -55,6 +55,63 @@ std::size_t constexpr STREAM_POOL_SIZE = 32; } while (0) #endif +/** + * @brief RAII struct to wrap a cuda event and ensure its proper destruction. + */ +struct cuda_event { + cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } + virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + + operator cudaEvent_t() { return e_; } + + private: + cudaEvent_t e_{}; +}; + +namespace { + +// FIXME: these will be available in rmm soon +inline int get_num_cuda_devices() +{ + rmm::cuda_device_id::value_type num_dev{}; + CUDF_CUDA_TRY(cudaGetDeviceCount(&num_dev)); + return num_dev; +} + +rmm::cuda_device_id get_current_cuda_device() +{ + int device_id = 0; + CUDF_CUDA_TRY(cudaGetDevice(&device_id)); + return rmm::cuda_device_id{device_id}; +} + +/** + * @brief Returns a cudaEvent_t for the current thread. + * + * The returned event is valid for the current device. + * + * @return A cudaEvent_t unique to the current thread and valid on the current device. + */ +cudaEvent_t event_for_thread() +{ + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. + thread_local static std::vector thread_events(get_num_cuda_devices()); + auto const device_id = get_current_cuda_device(); + if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); } + return *thread_events[device_id.value()]; +} + +} // namespace + /** * @brief Implementation of `cuda_stream_pool` that wraps an `rmm::cuda_stram_pool`. */ @@ -109,59 +166,6 @@ cuda_stream_pool* create_global_cuda_stream_pool() return new rmm_cuda_stream_pool(); } -// FIXME: these will be available in rmm soon -inline int get_num_cuda_devices() -{ - rmm::cuda_device_id::value_type num_dev{}; - CUDF_CUDA_TRY(cudaGetDeviceCount(&num_dev)); - return num_dev; -} - -rmm::cuda_device_id get_current_cuda_device() -{ - int device_id; - CUDF_CUDA_TRY(cudaGetDevice(&device_id)); - return rmm::cuda_device_id{device_id}; -} - -/** - * @brief RAII struct to wrap a cuda event and ensure its proper destruction. - */ -struct cuda_event { - cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } - virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } - - // Moveable but not copyable. 
- cuda_event(const cuda_event&) = delete;
-  cuda_event& operator=(const cuda_event&) = delete;
-
-  cuda_event(cuda_event&&) = default;
-  cuda_event& operator=(cuda_event&&) = default;
-
-  operator cudaEvent_t() { return e_; }
-
- private:
-  cudaEvent_t e_;
-};
-
-/**
- * @brief Returns a cudaEvent_t for the current thread.
- *
- * The returned event is valid for the current device.
- *
- * @return A cudaEvent_t unique to the current thread and valid on the current device.
- */
-cudaEvent_t event_for_thread()
-{
-  // The program may crash if this function is called from the main thread and user application
-  // subsequently calls cudaDeviceReset().
-  // As a workaround, here we intentionally disable RAII and leak cudaEvent_t.
-  thread_local std::vector thread_events(get_num_cuda_devices());
-  auto const device_id = get_current_cuda_device();
-  if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); }
-  return *thread_events[device_id.value()];
-}
-
 /**
  * @brief Returns a reference to the global stream pool for the current device.
  * @return `cuda_stream_pool` valid on the current device.
  */
@@ -174,7 +178,7 @@ cuda_stream_pool& global_cuda_stream_pool()
   static std::mutex mutex;
   auto const device_id = get_current_cuda_device();
-  std::lock_guard lock(mutex);
+  std::lock_guard const lock(mutex);
   if (pools[device_id.value()] == nullptr) {
     pools[device_id.value()] = create_global_cuda_stream_pool();
   }

From beb42960a7fbf2b0c1da17c943bb66050539b39c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 3 Dec 2024 10:05:24 -0800
Subject: [PATCH 12/78] Workaround for a misaligned access in `read_csv` on some CUDA versions (#17477)

Use a global array instead of a shared memory array in the `gather_row_offsets_gpu` kernel. The impact on the kernel's performance is less than 5%, and this kernel takes up only a small portion of the total `read_csv` execution time, so the impact on overall performance is negligible. Also modified the functions that take this array to take a `device_span` instead of a plain pointer.
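As a rough sketch of the span-based pattern (illustrative names only, not the actual `csv_gpu.cu` code), the benefit is that the buffer size travels with the pointer, so each block can carve out its scratch region with a bounds-aware `subspan` instead of raw pointer arithmetic:

  #include <cudf/utilities/span.hpp>

  // Illustrative kernel: each block slices its own scratch region from a
  // global buffer; the span carries the size that a plain uint64_t* would lose.
  __global__ void example_kernel(cudf::device_span<uint64_t> ctxtree, std::size_t per_block)
  {
    auto const bk_ctxtree = ctxtree.subspan(blockIdx.x * per_block, per_block);
    for (std::size_t i = threadIdx.x; i < bk_ctxtree.size(); i += blockDim.x) {
      bk_ctxtree[i] = 0;  // initialize this block's slice of the scratch space
    }
  }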
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17477 --- cpp/src/io/csv/csv_gpu.cu | 40 +++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 273e82edf8b..e2bc75d4bab 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -495,7 +495,7 @@ inline __device__ uint32_t select_rowmap(uint4 ctx_map, uint32_t ctxid) * @param t thread id (leaf node id) */ template -inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint32_t t) +inline __device__ void ctx_merge(device_span ctxtree, packed_rowctx_t* ctxb, uint32_t t) { uint64_t tmp = shuffle_xor(*ctxb, lanemask); if (!(t & tmask)) { @@ -518,7 +518,7 @@ inline __device__ void ctx_merge(uint64_t* ctxtree, packed_rowctx_t* ctxb, uint3 */ template inline __device__ void ctx_unmerge( - uint32_t base, uint64_t* ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t) + uint32_t base, device_span ctxtree, uint32_t* ctx, uint32_t* brow4, uint32_t t) { rowctx32_t ctxb_left, ctxb_right, ctxb_sum; ctxb_sum = get_row_context(ctxtree[base], *ctx); @@ -550,7 +550,7 @@ inline __device__ void ctx_unmerge( * @param[in] ctxb packed row context for the current character block * @param t thread id (leaf node id) */ -static inline __device__ void rowctx_merge_transform(uint64_t ctxtree[1024], +static inline __device__ void rowctx_merge_transform(device_span ctxtree, packed_rowctx_t ctxb, uint32_t t) { @@ -584,8 +584,8 @@ static inline __device__ void rowctx_merge_transform(uint64_t ctxtree[1024], * * @return Final row context and count (row_position*4 + context_id format) */ -static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxtree[1024], - uint32_t t) +static inline __device__ rowctx32_t +rowctx_inverse_merge_transform(device_span ctxtree, uint32_t t) { uint32_t ctx = ctxtree[0] & 3; // Starting input context rowctx32_t brow4 = 0; // output row in block *4 @@ -603,6 +603,8 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt return brow4 + ctx; } +constexpr auto bk_ctxtree_size = rowofs_block_dim * 2; + /** * @brief Gather row offsets from CSV character data split into 16KB chunks * @@ -634,6 +636,7 @@ static inline __device__ rowctx32_t rowctx_inverse_merge_transform(uint64_t ctxt */ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) gather_row_offsets_gpu(uint64_t* row_ctx, + device_span ctxtree, device_span offsets_out, device_span const data, size_t chunk_size, @@ -649,12 +652,8 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) int escapechar, int commentchar) { - auto start = data.begin(); - using block_reduce = typename cub::BlockReduce; - __shared__ union { - typename block_reduce::TempStorage bk_storage; - __align__(8) uint64_t ctxtree[rowofs_block_dim * 2]; - } temp_storage; + auto start = data.begin(); + auto const bk_ctxtree = ctxtree.subspan(blockIdx.x * bk_ctxtree_size, bk_ctxtree_size); char const* end = start + (min(parse_pos + chunk_size, data_size) - start_offset); uint32_t t = threadIdx.x; @@ -723,16 +722,16 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) // Convert the long-form {rowmap,outctx}[inctx] version into packed version // {rowcount,ouctx}[inctx], then merge the row contexts of the 32-character blocks into // a single 16K-character block context - 
rowctx_merge_transform(temp_storage.ctxtree, pack_rowmaps(ctx_map), t); + rowctx_merge_transform(bk_ctxtree, pack_rowmaps(ctx_map), t); // If this is the second phase, get the block's initial parser state and row counter if (offsets_out.data()) { - if (t == 0) { temp_storage.ctxtree[0] = row_ctx[blockIdx.x]; } + if (t == 0) { bk_ctxtree[0] = row_ctx[blockIdx.x]; } __syncthreads(); // Walk back the transform tree with the known initial parser state - rowctx32_t ctx = rowctx_inverse_merge_transform(temp_storage.ctxtree, t); - uint64_t row = (temp_storage.ctxtree[0] >> 2) + (ctx >> 2); + rowctx32_t ctx = rowctx_inverse_merge_transform(bk_ctxtree, t); + uint64_t row = (bk_ctxtree[0] >> 2) + (ctx >> 2); uint32_t rows_out_of_range = 0; uint32_t rowmap = select_rowmap(ctx_map, ctx & 3); // Output row positions @@ -749,11 +748,14 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) } __syncthreads(); // Return the number of rows out of range - rows_out_of_range = block_reduce(temp_storage.bk_storage).Sum(rows_out_of_range); + + using block_reduce = typename cub::BlockReduce; + __shared__ typename block_reduce::TempStorage bk_storage; + rows_out_of_range = block_reduce(bk_storage).Sum(rows_out_of_range); if (t == 0) { row_ctx[blockIdx.x] = rows_out_of_range; } } else { // Just store the row counts and output contexts - if (t == 0) { row_ctx[blockIdx.x] = temp_storage.ctxtree[1]; } + if (t == 0) { row_ctx[blockIdx.x] = bk_ctxtree[1]; } } } @@ -829,7 +831,7 @@ void decode_row_column_data(cudf::io::parse_options_view const& options, // Calculate actual block count to use based on records count auto const block_size = csvparse_block_dim; auto const num_rows = row_offsets.size() - 1; - auto const grid_size = (num_rows + block_size - 1) / block_size; + auto const grid_size = cudf::util::div_rounding_up_safe(num_rows, block_size); convert_csv_to_cudf<<>>( options, data, column_flags, row_offsets, dtypes, columns, valids, valid_counts); @@ -849,9 +851,11 @@ uint32_t __host__ gather_row_offsets(parse_options_view const& options, rmm::cuda_stream_view stream) { uint32_t dim_grid = 1 + (chunk_size / rowofs_block_bytes); + auto ctxtree = rmm::device_uvector(dim_grid * bk_ctxtree_size, stream); gather_row_offsets_gpu<<>>( row_ctx, + ctxtree, offsets_out, data, chunk_size, From 7cc9a9fe8f8e1d889ac813cbbf7f7eb2d4897400 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:41:18 -0600 Subject: [PATCH 13/78] Use exec_policy_nosync in write_json (#17445) Part of #12086 Replaced `rmm::exec_policy` with `rmm::exec_policy_nosync` in write_json Authors: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17445 --- cpp/src/io/json/write_json.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index a4885d59cc5..1a0c59e365a 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -327,7 +327,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, -> size_type { return idx / tbl.num_columns(); })); auto validity_iterator = cudf::detail::make_counting_transform_iterator(0, validity_fn{*tbl_device_view}); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + 
thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(stream),
                                 row_num,
                                 row_num + total_rows,
                                 validity_iterator,
@@ -335,7 +335,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns,
                                 false,
                                 thrust::equal_to{},
                                 thrust::logical_or{});
-  thrust::for_each(rmm::exec_policy(stream),
+  thrust::for_each(rmm::exec_policy_nosync(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(total_rows),
                    [write_separator = d_str_separator.begin(),
@@ -362,7 +362,7 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns,
     0,
     cuda::proclaim_return_type([num_strviews_per_row] __device__(size_type const i) {
       return i * num_strviews_per_row; }));
-  thrust::gather(rmm::exec_policy(stream),
+  thrust::gather(rmm::exec_policy_nosync(stream),
                  d_strview_offsets,
                  d_strview_offsets + row_string_offsets.size(),
                  old_offsets.begin(),
@@ -427,7 +427,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
       auto const length = offsets[idx + 1] - offsets[idx];
       return length == 0 ? 2 : (2 + length + length - 1);
     }));
-  thrust::exclusive_scan(rmm::exec_policy(stream),
+  thrust::exclusive_scan(rmm::exec_policy_nosync(stream),
                          num_strings_per_list,
                          num_strings_per_list + num_offsets,
                          d_strview_offsets.begin());
@@ -436,7 +436,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
   rmm::device_uvector d_strviews(total_strings, stream);
   // scatter null_list and list_prefix, list_suffix
   auto col_device_view = cudf::column_device_view::create(lists_strings.parent(), stream);
-  thrust::for_each(rmm::exec_policy(stream),
+  thrust::for_each(rmm::exec_policy_nosync(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_lists),
                    [col = *col_device_view,
@@ -458,7 +458,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
   auto labels = cudf::lists::detail::generate_labels(
     lists_strings, num_strings, stream, cudf::get_current_device_resource_ref());
   auto d_strings_children = cudf::column_device_view::create(strings_children, stream);
-  thrust::for_each(rmm::exec_policy(stream),
+  thrust::for_each(rmm::exec_policy_nosync(stream),
                    thrust::make_counting_iterator(0),
                    thrust::make_counting_iterator(num_strings),
                    [col = *col_device_view,
@@ -485,7 +485,7 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri
   // gather from offset and create a new string column
   auto old_offsets = strings_column_view(joined_col->view()).offsets();
   rmm::device_uvector row_string_offsets(num_offsets, stream, mr);
-  thrust::gather(rmm::exec_policy(stream),
+  thrust::gather(rmm::exec_policy_nosync(stream),
                  d_strview_offsets.begin(),
                  d_strview_offsets.end(),
                  old_offsets.begin(),

From 541e7e864c700bedfc667b5199a3415fca1b311d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:58:20 -0800
Subject: [PATCH 14/78] Make `column_empty` mask buffer creation consistent with libcudf (#16715)

Based on offline discussions, this PR makes `column_empty` consistent with libcudf where

* A size 0 "empty" column should not have a mask buffer
* A size > 0 "empty" (i.e., all null) column should have a mask buffer

Additionally removes `column_empty_like` which can be subsumed by `column_empty` (I didn't find any active usage of this method across RAPIDS https://github.com/search?q=org%3Arapidsai%20column_empty_like&type=code)

`column_empty` will have an unused `masked` argument, but since this method is used across RAPIDS I'll need to adjust those usages before 
removing that keyword here (https://github.com/search?q=org%3Arapidsai%20column_empty&type=code) Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16715 --- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/column.py | 66 +++++++++------------- python/cudf/cudf/core/dataframe.py | 14 ++--- python/cudf/cudf/core/reshape.py | 18 +++--- python/cudf/cudf/core/udf/groupby_utils.py | 5 +- python/cudf/cudf/tests/test_list.py | 2 +- python/cudf/cudf/tests/test_parquet.py | 25 ++++++++ python/cudf/cudf/tests/test_string_udfs.py | 4 +- python/cudf/cudf/utils/queryutils.py | 3 +- 9 files changed, 75 insertions(+), 63 deletions(-) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 0a9d339a6a8..db8d33f013a 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -6,7 +6,6 @@ as_column, build_column, column_empty, - column_empty_like, concat_columns, deserialize_columns, serialize_columns, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cdc3a03f445..c8cd80f45f4 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -882,7 +882,7 @@ def take( """ # Handle zero size if indices.size == 0: - return cast(Self, column_empty_like(self, newsize=0)) + return cast(Self, column_empty(row_count=0, dtype=self.dtype)) # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. @@ -1222,7 +1222,6 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: "data": (self.data_ptr, False), "version": 1, } - if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify @@ -1516,37 +1515,6 @@ def _return_sentinel_column(): return codes.fillna(na_sentinel.value) -def column_empty_like( - column: ColumnBase, - dtype: Dtype | None = None, - masked: bool = False, - newsize: int | None = None, -) -> ColumnBase: - """Allocate a new column like the given *column*""" - if dtype is None: - dtype = column.dtype - row_count = len(column) if newsize is None else newsize - - if ( - hasattr(column, "dtype") - and isinstance(column.dtype, cudf.CategoricalDtype) - and dtype == column.dtype - ): - catcolumn = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like( - catcolumn.codes, masked=masked, newsize=newsize - ) - return build_column( - data=None, - dtype=dtype, - mask=codes.base_mask, - children=(codes,), - size=codes.size, - ) - - return column_empty(row_count, dtype, masked) - - def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" return any( @@ -1556,9 +1524,31 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( - row_count: int, dtype: Dtype = "object", masked: bool = False + row_count: int, + dtype: Dtype = "object", + masked: bool = False, + for_numba: bool = False, ) -> ColumnBase: - """Allocate a new column like the given row_count and dtype.""" + """ + Allocate a new column with the given row_count and dtype. + + * Passing row_count == 0 creates a size 0 column without a mask buffer. + * Passing row_count > 0 creates an all null column with a mask buffer. 
+ + Parameters + ---------- + row_count : int + Number of elements in the column. + + dtype : Dtype + Type of the column. + + masked : bool + Unused. + + for_numba : bool, default False + If True, don't allocate a mask as it's not supported by numba. + """ dtype = cudf.dtype(dtype) children: tuple[ColumnBase, ...] = () @@ -1600,7 +1590,7 @@ def column_empty( else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) - if masked: + if row_count > 0 and not for_numba: mask = as_buffer( plc.null_mask.create_null_mask( row_count, plc.null_mask.MaskState.ALL_NULL @@ -2353,9 +2343,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: if not is_dtype_equal(obj.dtype, head.dtype): # if all null, cast to appropriate dtype if obj.null_count == len(obj): - objs[i] = column_empty_like( - head, dtype=head.dtype, masked=True, newsize=len(obj) - ) + objs[i] = column_empty(row_count=len(obj), dtype=head.dtype) else: raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fa8d517a9ef..656274bca38 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1424,8 +1424,8 @@ def __setitem__(self, arg, value): new_columns = ( value if key == arg - else column.column_empty_like( - col, masked=True, newsize=length + else column.column_empty( + row_count=length, dtype=col.dtype ) for key, col in self._column_labels_and_values ) @@ -3385,10 +3385,8 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if num_cols != 0: ca = self._data._from_columns_like_self( ( - column.column_empty_like( - col_data, masked=True, newsize=length - ) - for col_data in self._columns + column.column_empty(row_count=length, dtype=dtype) + for _, dtype in self._dtypes ), verify=False, ) @@ -6191,8 +6189,8 @@ def quantile( quant_index=False, )._column if len(res) == 0: - res = column.column_empty_like( - qs, dtype=ser.dtype, masked=True, newsize=len(qs) + res = column.column_empty( + row_count=len(qs), dtype=ser.dtype ) result[k] = res result = DataFrame._from_data(result) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index f37b44b1100..a6815da62c6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -14,7 +14,7 @@ from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 -from cudf.core.column import ColumnBase, as_column, column_empty_like +from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor from cudf.utils.dtypes import min_unsigned_type @@ -421,8 +421,8 @@ def concat( # if join is inner and it contains an empty df # we return an empty df, hence creating an empty # column with dtype metadata retained. 
- result_data[name] = cudf.core.column.column_empty_like( - col, newsize=0 + result_data[name] = column_empty( + row_count=0, dtype=col.dtype ) else: result_data[name] = col @@ -458,8 +458,8 @@ def concat( else: col_label = (k, name) if empty_inner: - result_data[col_label] = ( - cudf.core.column.column_empty_like(col, newsize=0) + result_data[col_label] = column_empty( + row_count=0, dtype=col.dtype ) else: result_data[col_label] = col @@ -995,9 +995,7 @@ def as_tuple(x): ] new_size = nrows * len(names) scatter_map = (columns_idx * np.int32(nrows)) + index_idx - target_col = cudf.core.column.column_empty_like( - col, masked=True, newsize=new_size - ) + target_col = column_empty(row_count=new_size, dtype=col.dtype) target_col[scatter_map] = col target = cudf.Index._from_column(target_col) result.update( @@ -1300,7 +1298,9 @@ def _one_hot_encode_column( """ if isinstance(column.dtype, cudf.CategoricalDtype): if column.size == column.null_count: - column = column_empty_like(categories, newsize=column.size) + column = column_empty( + row_count=column.size, dtype=categories.dtype + ) else: column = column._get_decategorized_column() # type: ignore[attr-defined] diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 3af662b62ea..814d3e9fc85 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -154,8 +154,9 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 - output = cudf.core.column.column_empty(ngroups, dtype=return_type) - + output = cudf.core.column.column_empty( + ngroups, dtype=return_type, for_numba=True + ) launch_args = [ offsets, output, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7d87fc73621..260b481b933 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -852,7 +852,7 @@ def test_listcol_setitem_retain_dtype(): {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} ) df1 = df.head(0) - # Performing a setitem on `b` triggers a `column.column_empty_like` call + # Performing a setitem on `b` triggers a `column.column_empty` call # which tries to create an empty ListColumn. 
df1["b"] = df1["c"] # Performing a copy to trigger a copy dtype which is obtained by accessing diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index de3636f7526..13efa71ebae 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -4158,6 +4158,31 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) +def test_parquet_roundtrip_zero_rows_no_column_mask(): + expected = cudf.DataFrame._from_data( + { + "int": cudf.core.column.column_empty(0, "int64"), + "float": cudf.core.column.column_empty(0, "float64"), + "datetime": cudf.core.column.column_empty(0, "datetime64[ns]"), + "timedelta": cudf.core.column.column_empty(0, "timedelta64[ns]"), + "bool": cudf.core.column.column_empty(0, "bool"), + "decimal": cudf.core.column.column_empty( + 0, cudf.Decimal64Dtype(1) + ), + "struct": cudf.core.column.column_empty( + 0, cudf.StructDtype({"a": "int64"}) + ), + "list": cudf.core.column.column_empty( + 0, cudf.ListDtype("float64") + ), + } + ) + with BytesIO() as bio: + expected.to_parquet(bio) + result = cudf.read_parquet(bio) + assert_eq(result, expected) + + def test_parquet_reader_mismatched_nullability(): # Ensure that we can faithfully read the tables with mismatched nullabilities df1 = cudf.DataFrame( diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index 69876d97aad..f4841f42e91 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -82,7 +82,9 @@ def run_udf_test(data, func, dtype): ) else: dtype = np.dtype(dtype) - output = cudf.core.column.column_empty(len(data), dtype=dtype) + output = cudf.core.column.column_empty( + len(data), dtype=dtype, for_numba=True + ) cudf_column = cudf.core.column.as_column(data) str_views = column_to_string_view_array(cudf_column) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 8966789fee8..4e3d32c8ed0 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -210,7 +210,6 @@ def query_execute(df, expr, callenv): Contains keys 'local_dict', 'locals' and 'globals' which are all dict. They represent the arg, local and global dictionaries of the caller. """ - # compile compiled = query_compile(expr) columns = compiled["colnames"] @@ -247,7 +246,7 @@ def query_execute(df, expr, callenv): # allocate output buffer nrows = len(df) - out = column_empty(nrows, dtype=np.bool_) + out = column_empty(nrows, dtype=np.bool_, for_numba=True) # run kernel args = [out, *colarrays, *envargs] with _CUDFNumbaConfig(): From 1b01df357a841e4aa29f3a40bc1162f1380269fb Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 08:22:36 -0500 Subject: [PATCH 15/78] Use grid_1d utilities in copy_range.cuh (#17409) Use the `grid_1d` utilities to manage thread and stride calculations in the `copy_range.cuh` kernels. 
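For readers unfamiliar with the pattern, here is a minimal sketch of what these utilities replace: the hand-computed `threadIdx.x + blockIdx.x * blockDim.x` index and `blockDim.x * gridDim.x` stride expressions removed in the hunks below. The `grid_1d_sketch` struct is a hypothetical stand-in for `cudf::detail::grid_1d`, not its actual implementation.

```cpp
#include <cstdint>

// Hypothetical stand-in for cudf::detail::grid_1d, for illustration only.
// The 64-bit return type avoids overflow when a grid covers more than
// 2^31 elements.
struct grid_1d_sketch {
  static __device__ int64_t global_thread_id()
  {
    return static_cast<int64_t>(threadIdx.x) + static_cast<int64_t>(blockIdx.x) * blockDim.x;
  }
  static __device__ int64_t grid_stride()
  {
    return static_cast<int64_t>(blockDim.x) * gridDim.x;
  }
};

// A grid-stride loop written with the helpers, mirroring the style the
// hunks below adopt in copy_range.cuh and null_mask.cuh: each thread
// handles elements tid, tid + stride, tid + 2*stride, ...
__global__ void scale_kernel(float const* in, float* out, int64_t size, float factor)
{
  auto const stride = grid_1d_sketch::grid_stride();
  for (auto idx = grid_1d_sketch::global_thread_id(); idx < size; idx += stride) {
    out[idx] = in[idx] * factor;
  }
}
```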
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17409 --- cpp/include/cudf/detail/copy_range.cuh | 8 ++++---- cpp/include/cudf/detail/null_mask.cuh | 9 ++++----- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index fcb80fe45f7..022c5c40ea0 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -56,15 +56,15 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, constexpr cudf::size_type leader_lane{0}; int const lane_id = threadIdx.x % warp_size; - cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - int const warp_id = tid / warp_size; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const warp_id = tid / warp_size; cudf::size_type const offset = target.offset(); cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin); cudf::size_type const end_mask_idx = cudf::word_index(offset + target_end); cudf::size_type mask_idx = begin_mask_idx + warp_id; - cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size; + cudf::size_type const masks_per_grid = cudf::detail::grid_1d::grid_stride() / warp_size; cudf::size_type target_offset = begin_mask_idx * warp_size - (offset + target_begin); cudf::size_type source_idx = tid + target_offset; @@ -92,7 +92,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, } } - source_idx += blockDim.x * gridDim.x; + source_idx += cudf::detail::grid_1d::grid_stride(); mask_idx += masks_per_grid; } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 025e2ccc3ec..17ecc0f5539 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -67,7 +67,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const last_bit_index = source_size_bits - 1; auto const last_word_index = cudf::word_index(last_bit_index); @@ -75,7 +75,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); - destination_word_index += blockDim.x * gridDim.x) { + destination_word_index += cudf::detail::grid_1d::grid_stride()) { bitmask_type destination_word = detail::get_mask_offset_word(source[0], destination_word_index, @@ -214,8 +214,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; - size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_type range_id = tid; + auto range_id = cudf::detail::grid_1d::global_thread_id(); while (range_id < num_ranges) { size_type const first_bit_index = *(first_bit_indices + range_id); @@ -243,7 +242,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b // Update the null count with the computed delta. 
size_type updated_null_count = *(null_counts + range_id) + delta; *(null_counts + range_id) = updated_null_count; - range_id += blockDim.x * gridDim.x; + range_id += cudf::detail::grid_1d::grid_stride(); } } From 439321edb43082fb75f195b6be2049c925279089 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:14:47 -0500 Subject: [PATCH 16/78] Turn off cudf.pandas 3rd party integrations tests for 24.12 (#17500) Removes the third-party integration tests for the 24.12 nightly CI. We need to do this to unblock CI. These tests have not been running properly, and we just noticed that. There are more than a few failures so we will have to resolve this in the next release. Future work is tracked in #17490. --- .github/workflows/test.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3be07480b15..d261c370fd0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -135,18 +135,6 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/cudf_pandas_scripts/run_tests.sh - third-party-integration-tests-cudf-pandas: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 - with: - build_type: nightly - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: "gpu-v100-latest-1" - container_image: "rapidsai/ci-conda:latest" - run_script: | - ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml wheel-tests-cudf-polars: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 From 86d833bdd46f0742621e6f1ec39e4e42fe1a695d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:17:11 -0500 Subject: [PATCH 17/78] Change indices for dictionary column to signed integer type (#17390) Change the indices type for dictionary column from unsigned to signed integer type. 
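As a concrete illustration of the user-visible effect, here is a minimal sketch using the cudf test wrappers that the updated tests below rely on; the exact includes are assumptions, not part of this patch.

```cpp
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/dictionary/encode.hpp>
#include <cudf_test/column_wrapper.hpp>

void demo()
{
  cudf::test::strings_column_wrapper input({"aa", "bb", "aa", "cc", "bb"});

  // encode() now defaults to INT32 indices instead of UINT32
  auto dictionary = cudf::dictionary::encode(input);
  cudf::dictionary_column_view view(dictionary->view());
  // view.indices().type().id() == cudf::type_id::INT32

  // requesting an unsigned index type now throws cudf::logic_error:
  // cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16});
}
```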
Closes #17327 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17390 --- .../cudf/column/column_device_view.cuh | 6 ++-- cpp/include/cudf/dictionary/encode.hpp | 4 +-- cpp/include/cudf/utilities/traits.hpp | 23 ++++++++++++++ cpp/include/cudf_test/column_wrapper.hpp | 8 ++--- cpp/src/column/column_factories.cpp | 2 +- cpp/src/dictionary/add_keys.cu | 6 ++-- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 5 +-- cpp/src/dictionary/encode.cu | 13 +++----- cpp/src/dictionary/remove_keys.cu | 4 +-- cpp/src/dictionary/search.cu | 10 +++--- cpp/src/interop/from_arrow_device.cu | 17 +++------- cpp/src/interop/from_arrow_host.cu | 17 +++------- cpp/src/utilities/traits.cpp | 16 ++++++++++ cpp/tests/copying/get_value_tests.cpp | 6 ++-- cpp/tests/dictionary/add_keys_test.cpp | 4 +-- cpp/tests/dictionary/encode_test.cpp | 8 ++--- cpp/tests/dictionary/factories_test.cpp | 31 +++++++++---------- cpp/tests/dictionary/search_test.cpp | 16 +++++----- cpp/tests/interop/from_arrow_host_test.cpp | 20 ++++++------ cpp/tests/interop/nanoarrow_utils.hpp | 8 +++-- cpp/tests/interop/to_arrow_device_test.cpp | 5 ++- cpp/tests/interop/to_arrow_test.cpp | 7 ++--- cpp/tests/rolling/lead_lag_test.cpp | 4 +-- cpp/tests/streams/dictionary_test.cpp | 16 +++++----- 25 files changed, 139 insertions(+), 119 deletions(-) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35a39ef9758..db6d5255616 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -460,7 +460,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { */ struct index_element_fn { template () and std::is_unsigned_v)> + CUDF_ENABLE_IF(is_index_type() and std::is_signed_v)> __device__ size_type operator()(column_device_view const& indices, size_type index) { return static_cast(indices.element(index)); @@ -468,10 +468,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template () and std::is_unsigned_v))> + CUDF_ENABLE_IF(not(is_index_type() and std::is_signed_v))> __device__ size_type operator()(Args&&... args) { - CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type"); + CUDF_UNREACHABLE("dictionary indices must be a signed integral type"); } }; diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index dc81fd74992..ced6bd2afa4 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -41,7 +41,7 @@ namespace dictionary { * * The null mask and null count are copied from the input column to the output column. 
* - * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if indices type is not a signed integer type * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} @@ -58,7 +58,7 @@ namespace dictionary { */ std::unique_ptr encode( column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, + data_type indices_type = data_type{type_id::INT32}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 22a67ca049a..dae1cd38832 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -217,6 +217,29 @@ constexpr inline bool is_index_type() */ bool is_index_type(data_type type); +/** + * @brief Indicates whether the type `T` is a signed numeric type. + * + * @tparam T The type to verify + * @return true `T` is signed numeric + */ +template +constexpr inline bool is_signed() +{ + return std::is_signed_v; +} + +/** + * @brief Indicates whether `type` is a signed numeric `data_type`. + * + * "Signed Numeric" types include fundamental integral types such as `INT*` + * but can also be `FLOAT*` types. + * + * @param type The `data_type` to verify + * @return true `type` is signed numeric + */ +bool is_signed(data_type type); + /** * @brief Indicates whether the type `T` is a unsigned numeric type. * diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 6206c1311d2..6300bb87572 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -974,7 +974,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { { wrapped = cudf::dictionary::encode(fixed_width_column_wrapper(begin, end), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1009,7 +1009,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { { wrapped = cudf::dictionary::encode( fixed_width_column_wrapper(begin, end, v), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1173,7 +1173,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } @@ -1210,7 +1210,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { : column_wrapper{} { wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v), - cudf::data_type{type_id::UINT32}, + cudf::data_type{type_id::INT32}, cudf::test::get_default_stream()); } diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 972f97e8668..050c23b0a3d 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -178,7 +178,7 @@ std::unique_ptr make_dictionary_from_scalar(scalar const& s, CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key"); return make_dictionary_column( make_column_from_scalar(s, 1, stream, mr), - make_column_from_scalar(numeric_scalar(0, true, stream), size, stream, mr), + make_column_from_scalar(numeric_scalar(0, true, stream), size, 
stream, mr),
     rmm::device_buffer{0, stream, mr},
     0);
 }
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 565055009ba..a851fc6069d 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -106,10 +106,10 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column
   auto indices_column = [&] {
     column_view gather_result = table_indices.front()->view();
     auto const indices_size   = gather_result.size();
-    // we can just use the lower-bound/gather data directly for UINT32 case
-    if (indices_type.id() == type_id::UINT32) {
+    // we can just use the lower-bound/gather data directly for INT32 case
+    if (indices_type.id() == type_id::INT32) {
       auto contents = table_indices.front()->release();
-      return std::make_unique(data_type{type_id::UINT32},
+      return std::make_unique(data_type{type_id::INT32},
                                       indices_size,
                                       std::move(*(contents.data.release())),
                                       rmm::device_buffer{0, stream, mr},
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index b3a8bb4cd20..0f17858094b 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -252,7 +252,7 @@ std::unique_ptr concatenate(host_span columns,
   std::transform(columns.begin(), columns.end(), indices_views.begin(), [](auto cv) {
     auto dict_view = dictionary_column_view(cv);
     if (dict_view.is_empty()) {
-      return column_view{data_type{type_id::UINT32}, 0, nullptr, nullptr, 0};
+      return column_view{data_type{type_id::INT32}, 0, nullptr, nullptr, 0};
     }
     return dict_view.get_indices_annotated();  // nicely includes validity mask and view offset
   });
diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu
index 3e0c98d36ea..9f81c852a30 100644
--- a/cpp/src/dictionary/dictionary_factories.cu
+++ b/cpp/src/dictionary/dictionary_factories.cu
@@ -33,7 +33,7 @@ struct dispatch_create_indices {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(std::is_unsigned(), "indices must be an unsigned type");
+    CUDF_EXPECTS(cudf::is_signed(), "indices must be a signed type");
     column_view indices_view{
       indices.type(), indices.size(), indices.data(), nullptr, 0, indices.offset()};
     return std::make_unique(indices_view, stream, mr);
@@ -83,7 +83,8 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu
 {
   CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls");
   CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls");
-  CUDF_EXPECTS(is_unsigned(indices_column->type()), "indices must be type unsigned integer");
+  CUDF_EXPECTS(is_signed(indices_column->type()) && is_index_type(indices_column->type()),
+               "indices must be type signed integer");
   auto count = indices_column->size();

   std::vector> children;
diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu
index c8ccb511e8f..5935b4f13e8 100644
--- a/cpp/src/dictionary/encode.cu
+++ b/cpp/src/dictionary/encode.cu
@@ -44,7 +44,8 @@ std::unique_ptr encode(column_view const& input_column,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer");
+  CUDF_EXPECTS(is_signed(indices_type) && is_index_type(indices_type),
+               "indices must be type signed integer");
   CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32,
                "cannot encode a dictionary from a dictionary");

@@ -63,10 +63,6 @@ std::unique_ptr encode(column_view const& input_column,
keys_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // remove the null-mask } - // the encode() returns INT32 for indices - if (indices_column->type().id() != indices_type.id()) - indices_column = cudf::detail::cast(indices_column->view(), indices_type, stream, mr); - // create column with keys_column and indices_column return make_dictionary_column(std::move(keys_column), std::move(indices_column), @@ -79,9 +76,9 @@ std::unique_ptr encode(column_view const& input_column, */ data_type get_indices_type_for_size(size_type keys_size) { - if (keys_size <= std::numeric_limits::max()) return data_type{type_id::UINT8}; - if (keys_size <= std::numeric_limits::max()) return data_type{type_id::UINT16}; - return data_type{type_id::UINT32}; + if (keys_size <= std::numeric_limits::max()) return data_type{type_id::INT8}; + if (keys_size <= std::numeric_limits::max()) return data_type{type_id::INT16}; + return data_type{type_id::INT32}; } } // namespace detail diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 119f43a4ae9..59c8453cf33 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -180,11 +180,11 @@ std::unique_ptr remove_unused_keys(dictionary_column_view const& diction // search the indices values with key indices to look for any holes auto const matches = [&] { // build keys index to verify against indices values - rmm::device_uvector keys_positions(keys_size, stream); + rmm::device_uvector keys_positions(keys_size, stream); thrust::sequence(rmm::exec_policy(stream), keys_positions.begin(), keys_positions.end()); // wrap the indices for comparison in contains() column_view keys_positions_view( - data_type{type_id::UINT32}, keys_size, keys_positions.data(), nullptr, 0); + data_type{type_id::INT32}, keys_size, keys_positions.data(), nullptr, 0); return cudf::detail::contains(indices_view, keys_positions_view, stream, mr); }(); auto d_matches = matches->view().data(); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 04e2c17635d..286b1a87df2 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -158,8 +158,9 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (dictionary.is_empty()) - return std::make_unique>(0, false, stream, mr); + if (dictionary.is_empty()) { + return std::make_unique>(0, false, stream, mr); + } return type_dispatcher( dictionary.keys().type(), find_index_fn(), dictionary, key, stream, mr); } @@ -169,8 +170,9 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (dictionary.is_empty()) - return std::make_unique>(0, false, stream, mr); + if (dictionary.is_empty()) { + return std::make_unique>(0, false, stream, mr); + } return type_dispatcher( dictionary.keys().type(), find_insert_index_fn(), dictionary, key, stream, mr); } diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 057e563c86e..cb3c4c55a61 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -194,19 +194,12 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( get_column(&keys_schema_view, input->dictionary, keys_type, true, stream, mr); auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to 
contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. + // cudf dictionary requires a signed type for the indices switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 2e9504a6726..b5d2427e288 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -267,19 +267,12 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()dictionary, keys_type, true, stream, mr); auto const dict_indices_type = [&schema]() -> data_type { - // cudf dictionary requires an unsigned type for the indices, - // since it is invalid for an arrow dictionary to contain negative - // indices, we can safely use the unsigned equivalent without having - // to modify the buffers. + // cudf dictionary requires a signed type for the indices switch (schema->storage_type) { - case NANOARROW_TYPE_INT8: - case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8); - case NANOARROW_TYPE_INT16: - case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16); - case NANOARROW_TYPE_INT32: - case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32); - case NANOARROW_TYPE_INT64: - case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64); + case NANOARROW_TYPE_INT8: return data_type(type_id::INT8); + case NANOARROW_TYPE_INT16: return data_type(type_id::INT16); + case NANOARROW_TYPE_INT32: return data_type(type_id::INT32); + case NANOARROW_TYPE_INT64: return data_type(type_id::INT64); default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error); } }(); diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp index 41ee4e960b6..86b4db02f54 100644 --- a/cpp/src/utilities/traits.cpp +++ b/cpp/src/utilities/traits.cpp @@ -127,6 +127,22 @@ struct is_index_type_impl { */ bool is_index_type(data_type type) { return cudf::type_dispatcher(type, is_index_type_impl{}); } +struct is_signed_impl { + template + constexpr bool operator()() + { + return is_signed(); + } +}; + +/** + * @brief Indicates whether `type` is a signed numeric `data_type`. 
+ * + * @param type The `data_type` to verify + * @return true `type` is signed numeric + */ +bool is_signed(data_type type) { return cudf::type_dispatcher(type, is_signed_impl{}); } + struct is_unsigned_impl { template constexpr bool operator()() diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index b2d64dac7c8..9e8525cd96b 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -132,7 +132,7 @@ TYPED_TEST_SUITE(DictionaryGetValueTest, cudf::test::FixedWidthTypesWithoutFixed TYPED_TEST(DictionaryGetValueTest, BasicGet) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices{0, 0, 1, 2, 1, 3, 3, 2}; + cudf::test::fixed_width_column_wrapper indices{0, 0, 1, 2, 1, 3, 3, 2}; auto col = cudf::make_dictionary_column(keys, indices); auto s = cudf::get_element(*col, 2); @@ -147,7 +147,7 @@ TYPED_TEST(DictionaryGetValueTest, BasicGet) TYPED_TEST(DictionaryGetValueTest, GetFromNullable) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices( + cudf::test::fixed_width_column_wrapper indices( {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false}); auto col = cudf::make_dictionary_column(keys, indices); @@ -163,7 +163,7 @@ TYPED_TEST(DictionaryGetValueTest, GetFromNullable) TYPED_TEST(DictionaryGetValueTest, GetNull) { cudf::test::fixed_width_column_wrapper keys({6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper indices( + cudf::test::fixed_width_column_wrapper indices( {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false}); auto col = cudf::make_dictionary_column(keys, indices); diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp index ebc8c11e86c..da8231fb8be 100644 --- a/cpp/tests/dictionary/add_keys_test.cpp +++ b/cpp/tests/dictionary/add_keys_test.cpp @@ -41,7 +41,7 @@ TEST_F(DictionaryAddKeysTest, StringsColumn) cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee", "fff"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0}); + cudf::test::fixed_width_column_wrapper indices_expected({5, 0, 3, 1, 2, 2, 2, 5, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected); } @@ -58,7 +58,7 @@ TEST_F(DictionaryAddKeysTest, FloatColumn) cudf::test::fixed_width_column_wrapper keys_expected{-11.75, 0.5, 4.25, 5.0, 7.125}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{2, 4, 1, 0, 4, 1}; + cudf::test::fixed_width_column_wrapper expected{2, 4, 1, 0, 4, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp index dfa3ede5d46..46319bb376d 100644 --- a/cpp/tests/dictionary/encode_test.cpp +++ b/cpp/tests/dictionary/encode_test.cpp @@ -34,7 +34,7 @@ TEST_F(DictionaryEncodeTest, EncodeStringColumn) cudf::test::strings_column_wrapper keys_expected({"aaa", "bbb", "ccc", "ddd", "eee"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0}); + cudf::test::fixed_width_column_wrapper indices_expected({4, 0, 3, 1, 2, 2, 2, 4, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), indices_expected); } @@ -48,7 +48,7 @@ TEST_F(DictionaryEncodeTest, 
EncodeFloat) cudf::test::fixed_width_column_wrapper keys_expected{-11.75, 0.5, 4.25, 7.125}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{2, 3, 1, 0, 3, 1}; + cudf::test::fixed_width_column_wrapper expected{2, 3, 1, 0, 3, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } @@ -64,7 +64,7 @@ TEST_F(DictionaryEncodeTest, EncodeWithNull) cudf::test::fixed_width_column_wrapper keys_expected{0, 111, 222, 333, 444}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); - cudf::test::fixed_width_column_wrapper expected{4, 0, 3, 1, 2, 5, 2, 4, 0}; + cudf::test::fixed_width_column_wrapper expected{4, 0, 3, 1, 2, 5, 2, 4, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), expected); } @@ -72,6 +72,6 @@ TEST_F(DictionaryEncodeTest, InvalidEncode) { cudf::test::fixed_width_column_wrapper input{0, 1, 2, 3, -1, -2, -3}; - EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::INT16}), + EXPECT_THROW(cudf::dictionary::encode(input, cudf::data_type{cudf::type_id::UINT16}), cudf::logic_error); } diff --git a/cpp/tests/dictionary/factories_test.cpp b/cpp/tests/dictionary/factories_test.cpp index 051ea45aed6..30e3984d66d 100644 --- a/cpp/tests/dictionary/factories_test.cpp +++ b/cpp/tests/dictionary/factories_test.cpp @@ -29,7 +29,7 @@ struct DictionaryFactoriesTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryFactoriesTest, CreateFromColumnViews) { cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); - cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; auto dictionary = cudf::make_dictionary_column(keys, values); cudf::dictionary_column_view view(dictionary->view()); @@ -41,8 +41,8 @@ TEST_F(DictionaryFactoriesTest, CreateFromColumnViews) TEST_F(DictionaryFactoriesTest, ColumnViewsWithNulls) { cudf::test::fixed_width_column_wrapper keys{-11.75, 4.25, 7.125, 0.5, 12.0}; - std::vector h_values{1, 3, 2, 0, 1, 4, 1}; - cudf::test::fixed_width_column_wrapper indices( + std::vector h_values{1, 3, 2, 0, 1, 4, 1}; + cudf::test::fixed_width_column_wrapper indices( h_values.begin(), h_values.end(), thrust::make_transform_iterator(h_values.begin(), [](auto v) { return v > 0; })); @@ -50,8 +50,7 @@ TEST_F(DictionaryFactoriesTest, ColumnViewsWithNulls) cudf::dictionary_column_view view(dictionary->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -59,16 +58,15 @@ TEST_F(DictionaryFactoriesTest, CreateFromColumns) { std::vector h_keys{"pear", "apple", "fruit", "macintosh"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{1, 2, 3, 1, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{1, 2, 3, 1, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column(keys.release(), values.release(), rmm::device_buffer{}, 0); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + 
cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -77,8 +75,8 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls) { std::vector h_keys{-1234567890, -987654321, 0, 19283714}; cudf::test::fixed_width_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{1, 2, 3, 1, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{1, 2, 3, 1, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto size = static_cast(h_values.size()); rmm::device_buffer null_mask = create_null_mask(size, cudf::mask_state::ALL_NULL); auto dictionary = @@ -88,8 +86,7 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls) EXPECT_EQ(size, view.null_count()); cudf::test::fixed_width_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), - h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -98,15 +95,15 @@ TEST_F(DictionaryFactoriesTest, KeysWithNulls) { cudf::test::fixed_width_column_wrapper keys{{0, 1, 2, 3, 4}, {true, true, true, false, true}}; - cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; + cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error); } TEST_F(DictionaryFactoriesTest, IndicesWithNulls) { cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 3, 4}; - cudf::test::fixed_width_column_wrapper indices{{5, 4, 3, 2, 1, 0}, - {true, true, true, false, true, false}}; + cudf::test::fixed_width_column_wrapper indices{{5, 4, 3, 2, 1, 0}, + {true, true, true, false, true, false}}; EXPECT_THROW( cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0), cudf::logic_error); @@ -115,7 +112,7 @@ TEST_F(DictionaryFactoriesTest, IndicesWithNulls) TEST_F(DictionaryFactoriesTest, InvalidIndices) { cudf::test::fixed_width_column_wrapper keys{0, 1, 2, 3, 4}; - cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; + cudf::test::fixed_width_column_wrapper indices{5, 4, 3, 2, 1, 0}; EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error); EXPECT_THROW( cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0), diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp index 2774173b80a..d5877f12184 100644 --- a/cpp/tests/dictionary/search_test.cpp +++ b/cpp/tests/dictionary/search_test.cpp @@ -31,8 +31,8 @@ TEST_F(DictionarySearchTest, StringsColumn) auto result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("ccc")); EXPECT_TRUE(result->is_valid()); - auto n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{3}, n_result->value()); + auto n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{3}, n_result->value()); result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("eee")); EXPECT_FALSE(result->is_valid()); @@ -40,8 +40,8 @@ TEST_F(DictionarySearchTest, StringsColumn) cudf::string_scalar("eee"), cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - n_result = dynamic_cast*>(result.get()); - 
EXPECT_EQ(uint32_t{5}, n_result->value()); + n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{5}, n_result->value()); } TEST_F(DictionarySearchTest, WithNulls) @@ -51,8 +51,8 @@ TEST_F(DictionarySearchTest, WithNulls) auto result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar(4)); EXPECT_TRUE(result->is_valid()); - auto n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{0}, n_result->value()); + auto n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{0}, n_result->value()); result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar(5)); EXPECT_FALSE(result->is_valid()); @@ -60,8 +60,8 @@ TEST_F(DictionarySearchTest, WithNulls) cudf::numeric_scalar(5), cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - n_result = dynamic_cast*>(result.get()); - EXPECT_EQ(uint32_t{1}, n_result->value()); + n_result = dynamic_cast*>(result.get()); + EXPECT_EQ(int32_t{1}, n_result->value()); } TEST_F(DictionarySearchTest, EmptyColumn) diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index d93ef28aab8..1ab11b374b6 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -460,19 +460,17 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) // test dictionary arrays with different index types // cudf asserts that the index type must be unsigned auto array1 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array2 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); auto array3 = - get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + get_nanoarrow_dict_array({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); // create equivalent cudf dictionary columns auto keys_col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 7}); - auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); - auto ind2_col = - cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); - auto ind3_col = - cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind1_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind2_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); + auto ind3_col = cudf::test::fixed_width_column_wrapper({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1}); vector_of_columns columns; columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col)); @@ -485,19 +483,19 @@ TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType) ArrowSchemaInit(input_schema.get()); NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16)); 
NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64)); - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64)); NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c")); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2])); NANOARROW_THROW_NOT_OK( diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 8be7e087b6d..b7b8202a3c2 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -200,17 +200,19 @@ struct nanoarrow_storage_type {}; static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \ } -DEFINE_NANOARROW_STORAGE(bool, BOOL); +DEFINE_NANOARROW_STORAGE(int8_t, INT8); +DEFINE_NANOARROW_STORAGE(int16_t, INT16); +DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(int64_t, INT64); +DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(uint16_t, UINT16); DEFINE_NANOARROW_STORAGE(uint64_t, UINT64); +DEFINE_NANOARROW_STORAGE(bool, BOOL); DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32); DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); -DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); -DEFINE_NANOARROW_STORAGE(int32_t, INT32); DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); #undef DEFINE_NANOARROW_STORAGE diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 29aa928c277..112b3e1d8e2 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -48,7 +48,6 @@ get_nanoarrow_cudf_table(cudf::size_type length) .release()); auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), @@ -103,7 +102,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) schema->children[1]->flags = 0; } - NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_UINT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(schema->children[2], NANOARROW_TYPE_INT32)); NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(schema->children[2])); NANOARROW_THROW_NOT_OK( ArrowSchemaInitFromType(schema->children[2]->dictionary, NANOARROW_TYPE_INT64)); @@ -181,7 +180,7 @@ get_nanoarrow_tables(cudf::size_type length) populate_from_col(arrow->children[0], table->get_column(0).view()); populate_from_col(arrow->children[1], table->get_column(1).view()); - populate_dict_from_col( + populate_dict_from_col( arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view())); populate_from_col(arrow->children[3], table->get_column(3).view()); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index faa07ba3311..28a80502f08 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp 
@@ -63,7 +63,6 @@ std::pair, std::shared_ptr> get_table auto validity_generator = []() { return rand() % 7 != 0; }; std::generate( list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator); - // cudf::size_type n = 0; std::generate( list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable { return (n++) * length_of_individual_list; @@ -87,7 +86,6 @@ std::pair, std::shared_ptr> get_table .release()); auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); - auto dict_col = cudf::dictionary::encode(col4); columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) @@ -120,11 +118,12 @@ std::pair, std::shared_ptr> get_table auto int64array = get_arrow_array(int64_data, validity); auto string_array = get_arrow_array(string_data, validity); + auto dict_col = cudf::dictionary::encode(col4); cudf::dictionary_column_view view(dict_col->view()); auto keys = cudf::test::to_host(view.keys()).first; - auto indices = cudf::test::to_host(view.indices()).first; + auto indices = cudf::test::to_host(view.indices()).first; auto dict_array = get_arrow_dict_array(std::vector(keys.begin(), keys.end()), - std::vector(indices.begin(), indices.end()), + std::vector(indices.begin(), indices.end()), validity); auto boolarray = get_arrow_array(bool_data, bool_validity); auto list_array = get_arrow_list_array( diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index 6519b0ed4ee..d82f512329f 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -1098,7 +1098,7 @@ TEST_F(LeadLagNonFixedWidthTest, Dictionary) auto expected_keys = cudf::test::strings_column_wrapper{input_strings}.release(); auto expected_values = - cudf::test::fixed_width_column_wrapper{ + cudf::test::fixed_width_column_wrapper{ {2, 3, 4, 5, 0, 0, 7, 8, 9, 10, 0, 0}, cudf::test::iterators::nulls_at(std::vector{4, 5, 10, 11})} .release(); @@ -1118,7 +1118,7 @@ TEST_F(LeadLagNonFixedWidthTest, Dictionary) auto expected_keys = cudf::test::strings_column_wrapper{input_strings}.release(); auto expected_values = - cudf::test::fixed_width_column_wrapper{ + cudf::test::fixed_width_column_wrapper{ {0, 0, 1, 2, 3, 4, 0, 6, 0, 7, 8, 9}, cudf::test::iterators::nulls_at(std::vector{0, 6})} .release(); auto expected_output = diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp index 03e4cf47470..498504ef212 100644 --- a/cpp/tests/streams/dictionary_test.cpp +++ b/cpp/tests/streams/dictionary_test.cpp @@ -29,7 +29,7 @@ class DictionaryTest : public cudf::test::BaseFixture {}; TEST_F(DictionaryTest, FactoryColumnViews) { cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); - cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; auto dictionary = cudf::make_dictionary_column(keys, values, cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); @@ -42,15 +42,15 @@ TEST_F(DictionaryTest, FactoryColumns) { std::vector h_keys{"aaa", "ccc", "ddd", "www"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 
2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column( keys.release(), values.release(), cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -59,15 +59,15 @@ TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) { std::vector h_keys{"aaa", "ccc", "ddd", "www"}; cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); - std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; - cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); auto dictionary = cudf::make_dictionary_column( keys.release(), values.release(), rmm::device_buffer{}, 0, cudf::test::get_default_stream()); cudf::dictionary_column_view view(dictionary->view()); cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); - cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); } @@ -75,7 +75,7 @@ TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) TEST_F(DictionaryTest, Encode) { cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); - cudf::data_type int32_type(cudf::type_id::UINT32); + cudf::data_type int32_type(cudf::type_id::INT32); cudf::column_view col_view = col; cudf::dictionary::encode(col_view, int32_type, cudf::test::get_default_stream()); } From 6440207ccea4bed0a0654186276de1e589acb0d9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 4 Dec 2024 10:46:04 -0600 Subject: [PATCH 18/78] Remove upper bounds on cuda-python to allow 12.6.2 and 11.8.5 (#17326) Now that some upstream bugs have been fixed, we can allow cuda-python 12.6.2 and 11.8.5. See https://github.com/NVIDIA/cuda-python/issues/226#issuecomment-2472355738 for more information. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17326 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/pylibcudf/meta.yaml | 4 ++-- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2be64b7cd70..87c40421be0 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0,<=11.8.3 +- cuda-python>=11.7.1,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 6b5ca04c015..0935de96d19 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0,<=12.6.0 +- cuda-python>=12.0,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 04904e95630..e52b8c5f2a0 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.7.1,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.0,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index ec3fcd59c62..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.7.1,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.0,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index 259d41b59fe..044c7d187b3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -679,10 +679,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0,<=12.6.0 + - cuda-python>=12.0,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0,<=11.8.3 + - cuda-python>=11.7.1,<12.0a0 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: @@ -705,10 +705,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0,<=12.6.0 + - cuda-python>=12.0,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0,<=11.8.3 + - cuda-python>=11.7.1,<12.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index df3e6b87991..80de9056a0a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0,<=11.8.3", + "cuda-python>=11.7.1,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index dc82eb363d0..a5e5704b8ed 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0,<=11.8.3", + "cuda-python>=11.7.1,<12.0a0", "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", From 38820ff0e8cd7cd54793fd5c49fb1566a24686b1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 4 Dec 2024 11:43:37 -0600 Subject: [PATCH 19/78] Update to CCCL 2.7.0-rc2. (#17233) This PR updates to CCCL 2.7.0-rc2. Do not merge until all of RAPIDS is ready to update. Depends on https://github.com/rapidsai/rapids-cmake/pull/710 and should be admin-merged immediately after that PR. Part of https://github.com/rapidsai/build-planning/issues/115. --------- Co-authored-by: Michael Schellenberger Costa --- .../thirdparty/patches/cccl_override.json | 5 -- .../patches/cccl_symbol_visibility.diff | 27 -------- .../thrust_disable_64bit_dispatching.diff | 66 ++++++++++++++----- .../thrust_faster_sort_compile_times.diff | 12 ++-- 4 files changed, 56 insertions(+), 54 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index dcf9c1139f9..2f29578f7ae 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/cccl_symbol_visibility.diff", - "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]", - "fixed_in" : "2.6" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff deleted file mode 100644 index f745d5fa314..00000000000 --- a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff +++ /dev/null @@ -1,27 +0,0 @@ -diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -index e7c62c031b..5db861853a 100644 ---- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config -+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t; - # define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) - # endif - --# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_HIDDEN -@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, - # define _LIBCUDACXX_WEAK __attribute__((__weak__)) - # endif - --// Redefine some macros for internal use 
--# if defined(__cuda_std__) --# undef _LIBCUDACXX_FUNC_VIS --# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY --# undef _LIBCUDACXX_TYPE_VIS --# define _LIBCUDACXX_TYPE_VIS --# endif // __cuda_std__ -- - // Thread API - # ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL - # if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index 6ae1e1c917b..291eabe25fd 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,59 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 +index 971b93d62..0d6b25b07 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ +@@ -36,16 +36,15 @@ + * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch + * interfaces, that always deduce the size type from the arguments. + */ +-#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ +- if (count <= thrust::detail::integer_traits::const_max) \ +- { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } \ +- else \ +- { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ ++#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ ++ if (count <= thrust::detail::integer_traits::const_max) \ ++ { \ ++ auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ ++ status = call arguments; \ ++ } \ ++ else \ ++ { \ + throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ } - + /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ +@@ -55,18 +54,16 @@ + * + * This version of the macro supports providing two count variables, which is necessary for set algorithms. 
+ */ +-#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ +- if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ +- { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ +- } \ +- else \ +- { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ ++ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ ++ { \ ++ auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ ++ auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ ++ status = call arguments; \ ++ } \ ++ else \ ++ { \ + throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ } + /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index cb0cc55f4d2..5f1981e9806 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,20 +1,20 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 +index 29510db5e..cf57e5786 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -374,7 +374,7 @@ public: // KeyT max_key = oob_default; - + -#pragma unroll +#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) @@ -27,7 +27,7 @@ index 7d9e8622f..da5627306 100644 @@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - + -#pragma unroll +#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) From 43fac3b64ee69427073adf76b4d6b11a3873fc10 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Wed, 4 Dec 2024 13:48:48 -0500 Subject: [PATCH 20/78] Expose stream-ordering in nvtext API (#17446) Adds stream parameter to ``` cudf::nvtext::byte_pair_encoding ``` Added stream gtests to verify correct stream forwarding. 
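As a usage illustration only (a minimal sketch, not part of this change; `input` and `mpt` are assumed pre-existing `std::unique_ptr<cudf::column>` strings columns):
```
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/byte_pair_encoding.hpp>
#include <rmm/cuda_stream.hpp>

// Sketch: run BPE on an explicit, user-owned stream instead of the default.
rmm::cuda_stream stream;
auto merge_pairs = nvtext::load_merge_pairs(
  cudf::strings_column_view(mpt->view()), stream.view());
auto encoded = nvtext::byte_pair_encoding(
  cudf::strings_column_view(input->view()),
  *merge_pairs,
  cudf::string_scalar(" ", true, stream.view()),  // separator built on the same stream
  stream.view());
```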
Reference: https://github.com/rapidsai/cudf/issues/13744 Authors: - Shruti Shivakumar (https://github.com/shrshi) - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17446 --- cpp/include/nvtext/byte_pair_encoding.hpp | 2 + cpp/src/text/bpe/byte_pair_encoding.cu | 3 +- cpp/src/text/bpe/load_merge_pairs.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/text/bpe_test.cpp | 59 +++++++++++++++++++++++ 5 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 cpp/tests/streams/text/bpe_test.cpp diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp index ab862df044d..71b68565e77 100644 --- a/cpp/include/nvtext/byte_pair_encoding.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -122,6 +122,7 @@ std::unique_ptr load_merge_pairs( * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. * Default is a space. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Memory resource to allocate any returned objects. * @return An encoded column of strings. */ @@ -129,6 +130,7 @@ std::unique_ptr byte_pair_encoding( cudf::strings_column_view const& input, bpe_merge_pairs const& merges_pairs, cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index f46f49ddc0e..0aacfd16f67 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -459,10 +459,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, bpe_merge_pairs const& merges_table, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); + return detail::byte_pair_encoding(input, merges_table, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index cd68566bdec..a13a435a271 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -103,7 +103,8 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto pairs = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr); + auto pairs = + cudf::strings::split_record(input, cudf::string_scalar(" ", true, stream, mr), 1, stream, mr); auto content = pairs->release(); return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8928d27a871..adf512811cc 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -742,6 +742,7 @@ ConfigureTest( ) ConfigureTest( STREAM_TEXT_TEST + streams/text/bpe_test.cpp streams/text/edit_distance_test.cpp streams/text/ngrams_test.cpp streams/text/replace_test.cpp diff --git a/cpp/tests/streams/text/bpe_test.cpp b/cpp/tests/streams/text/bpe_test.cpp new file mode 100644 index 
00000000000..0510edc122a --- /dev/null +++ b/cpp/tests/streams/text/bpe_test.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +#include + +struct TextBytePairEncoding : public cudf::test::BaseFixture {}; + +TEST_F(TextBytePairEncoding, BytePairEncoding) +{ + auto stream = cudf::test::get_default_stream(); + // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt + auto mpt = cudf::test::strings_column_wrapper({ + "e n", // 14 + "i t", // 16 + "i s", // 17 + "e s", // 20 + "en t", // 44 + "c e", // 90 + "es t", // 141 + "en ce", // 340 + "t h", // 146 + "h i", // 5049 + "th is", // 5407 + "t est", // 9034 + "s i", // 13142 + "s ent" // 33832 + }); + + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt), stream); + + auto validity = cudf::test::iterators::null_at(4); + cudf::test::strings_column_wrapper input( + {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""}, + validity); + auto sv = cudf::strings_column_view(input); + + auto results = + nvtext::byte_pair_encoding(sv, *merge_pairs, cudf::string_scalar(" ", true, stream), stream); +} From 4505c5399a7aea119e07dded7b54084be713e985 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 13:52:38 -0500 Subject: [PATCH 21/78] Return empty result for segmented_reduce if input and offsets are both empty (#17437) Changes the behavior of `cudf::segmented_reduce` to return an empty column if both the input and the offsets parameter are empty. Closes #17433 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17437 --- cpp/src/reductions/segmented/reductions.cpp | 6 ++++++ .../reductions/segmented_reduction_tests.cpp | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index dedfc4b0734..1c3a2b0c0f3 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include #include #include #include @@ -120,6 +121,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, CUDF_FAIL( "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); } + + if (segmented_values.is_empty() && offsets.empty()) { + return cudf::make_empty_column(output_dtype); + } + CUDF_EXPECTS(offsets.size() > 0, "`offsets` should have at least 1 element."); return cudf::detail::aggregation_dispatcher( diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index bc0321bd40a..2281a517aa6 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1122,6 +1122,26 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect_bool); } +TEST_F(SegmentedReductionTestUntyped, EmptyInputEmptyOffsets) +{ + auto const str_empty = cudf::test::strings_column_wrapper{}; + auto const int_empty = cudf::test::fixed_width_column_wrapper{}; + auto result = + cudf::segmented_reduce(str_empty, + cudf::column_view{int_empty}, + *cudf::make_max_aggregation(), + cudf::data_type{cudf::type_id::STRING}, + cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, str_empty); + + result = cudf::segmented_reduce(int_empty, + cudf::column_view{int_empty}, + *cudf::make_min_aggregation(), + cudf::data_type{cudf::type_id::INT32}, + cudf::null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, int_empty); +} + template struct SegmentedReductionFixedPointTest : public cudf::test::BaseFixture {}; From 351ece53a3f1b5269c0b15f7254e67cd06535740 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:11:37 -0800 Subject: [PATCH 22/78] Remove cudf._lib.binops in favor of inlining pylibcudf (#17468) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17468 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/binaryop.pyx | 61 -------------------- python/cudf/cudf/core/_internals/binaryop.py | 60 +++++++++++++++++++ python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/datetime.py | 16 ++--- python/cudf/cudf/core/column/decimal.py | 13 +++-- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/numerical.py | 10 ++-- python/cudf/cudf/core/column/string.py | 10 ++-- python/cudf/cudf/core/column/timedelta.py | 13 ++--- python/cudf/cudf/utils/applyutils.py | 4 +- 12 files changed, 94 insertions(+), 101 deletions(-) delete mode 100644 python/cudf/cudf/_lib/binaryop.pyx create mode 100644 python/cudf/cudf/core/_internals/binaryop.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e69a2672163..dd27aae7133 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -14,7 +14,6 @@ set(cython_sources aggregation.pyx - binaryop.pyx column.pyx copying.pyx csv.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index ec32386b2ce..cdf7cbe13c4 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -2,7 +2,6 @@ import numpy as np from . 
import ( - binaryop, copying, csv, groupby, diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx deleted file mode 100644 index e2547476849..00000000000 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar -from cudf.core.buffer import acquire_spill_lock - -# Map pandas operation names to pylibcudf operation names. -_op_map = { - "TRUEDIV": "TRUE_DIV", - "FLOORDIV": "FLOOR_DIV", - "MOD": "PYMOD", - "EQ": "EQUAL", - "NE": "NOT_EQUAL", - "LT": "LESS", - "GT": "GREATER", - "LE": "LESS_EQUAL", - "GE": "GREATER_EQUAL", - "AND": "BITWISE_AND", - "OR": "BITWISE_OR", - "XOR": "BITWISE_XOR", - "L_AND": "LOGICAL_AND", - "L_OR": "LOGICAL_OR", -} - - -@acquire_spill_lock() -def binaryop(lhs, rhs, op, dtype): - """ - Dispatches a binary op call to the appropriate libcudf function: - """ - # TODO: Shouldn't have to keep special-casing. We need to define a separate - # pipeline for libcudf binops that don't map to Python binops. - if op not in {"INT_POW", "NULL_EQUALS", "NULL_NOT_EQUALS"}: - op = op[2:-2] - op = op.upper() - op = _op_map.get(op, op) - - return Column.from_pylibcudf( - # Check if the dtype args are desirable here. - pylibcudf.binaryop.binary_operation( - lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) - else ( - as_device_scalar( - lhs, dtype=rhs.dtype if lhs is None else None - ) - ).c_value, - rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) - else ( - as_device_scalar( - rhs, dtype=lhs.dtype if rhs is None else None - ) - ).c_value, - pylibcudf.binaryop.BinaryOperator[op], - dtype_to_pylibcudf_type(dtype), - ) - ) diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py new file mode 100644 index 00000000000..212150f505e --- /dev/null +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from cudf._typing import Dtype + from cudf.core.column import ColumnBase + from cudf.core.scalar import Scalar + + +@acquire_spill_lock() +def binaryop( + lhs: ColumnBase | Scalar, rhs: ColumnBase | Scalar, op: str, dtype: Dtype +) -> ColumnBase: + """ + Dispatches a binary op call to the appropriate libcudf function: + """ + # TODO: Shouldn't have to keep special-casing. We need to define a separate + # pipeline for libcudf binops that don't map to Python binops. + if op not in {"INT_POW", "NULL_EQUALS", "NULL_NOT_EQUALS"}: + op = op[2:-2] + # Map pandas operation names to pylibcudf operation names. 
+ _op_map = { + "TRUEDIV": "TRUE_DIV", + "FLOORDIV": "FLOOR_DIV", + "MOD": "PYMOD", + "EQ": "EQUAL", + "NE": "NOT_EQUAL", + "LT": "LESS", + "GT": "GREATER", + "LE": "LESS_EQUAL", + "GE": "GREATER_EQUAL", + "AND": "BITWISE_AND", + "OR": "BITWISE_OR", + "XOR": "BITWISE_XOR", + "L_AND": "LOGICAL_AND", + "L_OR": "LOGICAL_OR", + } + op = op.upper() + op = _op_map.get(op, op) + + return Column.from_pylibcudf( + plc.binaryop.binary_operation( + lhs.to_pylibcudf(mode="read") + if isinstance(lhs, Column) + else lhs.device_value.c_value, + rhs.to_pylibcudf(mode="read") + if isinstance(rhs, Column) + else rhs.device_value.c_value, + plc.binaryop.BinaryOperator[op], + dtype_to_pylibcudf_type(dtype), + ) + ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c8cd80f45f4..1ddc79e8970 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1366,7 +1366,7 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> ColumnBase | ScalarLike: + ) -> ColumnBase | cudf.Scalar: raise NotImplementedError def _reduce( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c9be3f239f9..b526a6efa51 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -22,7 +22,7 @@ import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core._internals.search import search_sorted from cudf.core._internals.timezones import ( check_ambiguous_and_nonexistent, @@ -509,7 +509,9 @@ def isocalendar(self) -> dict[str, ColumnBase]: ) } - def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: + def normalize_binop_value( # type: ignore[override] + self, other: DatetimeLikeScalar + ) -> cudf.Scalar | cudf.DateOffset | ColumnBase: if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): return other @@ -789,12 +791,12 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if out_dtype is None: return NotImplemented - result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if out_dtype != cudf.dtype(np.bool_) and op == "__add__": + result_col = binaryop.binaryop(lhs, rhs, op, out_dtype) + if out_dtype.kind != "b" and op == "__add__": return result_col - elif cudf.get_option( - "mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): + elif ( + cudf.get_option("mode.pandas_compatible") and out_dtype.kind == "b" + ): return result_col.fillna(op == "__ne__") else: return result_col diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index ac9a2caad50..2c22724d3d7 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -11,12 +11,11 @@ import pyarrow as pa import cudf -from cudf import _lib as libcudf from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) from cudf.api.types import is_scalar -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core.buffer import as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn @@ -30,6 +29,8 @@ from cudf.utils.utils import pa_mask_buffer_to_mask if TYPE_CHECKING: + from typing_extensions import Self + from 
cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -141,7 +142,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): rhs = rhs.astype( type(output_type)(rhs.dtype.precision, rhs.dtype.scale) ) - result = libcudf.binaryop.binaryop(lhs, rhs, op, output_type) + result = binaryop.binaryop(lhs, rhs, op, output_type) # libcudf doesn't support precision, so result.dtype doesn't # maintain output_type.precision result.dtype.precision = output_type.precision @@ -153,7 +154,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): "__le__", "__ge__", }: - result = libcudf.binaryop.binaryop(lhs, rhs, op, bool) + result = binaryop.binaryop(lhs, rhs, op, bool) else: raise TypeError( f"{op} not supported for the following dtypes: " @@ -177,7 +178,7 @@ def _validate_fillna_value( "integer values" ) - def normalize_binop_value(self, other): + def normalize_binop_value(self, other) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): if other.dtype.kind not in "iu": @@ -209,7 +210,7 @@ def normalize_binop_value(self, other): other = Decimal(other) metadata = other.as_tuple() precision = max(len(metadata.digits), metadata.exponent) - scale = -metadata.exponent + scale = -cast(int, metadata.exponent) return cudf.Scalar( other, dtype=self.dtype.__class__(precision, scale) ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 789c4a7f3cb..ea384888388 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -188,8 +188,8 @@ def __cuda_array_interface__(self): "Lists are not yet supported via `__cuda_array_interface__`" ) - def normalize_binop_value(self, other): - if not isinstance(other, ListColumn): + def normalize_binop_value(self, other) -> Self: + if not isinstance(other, type(self)): return NotImplemented return other diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8ca42debb72..9514aaeab50 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -16,7 +16,7 @@ import cudf.core.column.string as string from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype @@ -292,7 +292,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) + return binaryop.binaryop(lhs, rhs, op, out_dtype) def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. 
@@ -301,11 +301,9 @@ def nans_to_nulls(self: Self) -> Self: newmask = libcudf.transform.nans_to_nulls(self) return self.set_mask(newmask) - def normalize_binop_value( - self, other: ScalarLike - ) -> ColumnBase | cudf.Scalar: + def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): - if not isinstance(other, NumericalColumn): + if not isinstance(other, type(self)): return NotImplemented return other if isinstance(other, cudf.Scalar): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 76d67585609..6b45828568c 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,11 +19,11 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods @@ -6200,7 +6200,7 @@ def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: def _binaryop( self, other: ColumnBinaryOperand, op: str - ) -> "column.ColumnBase": + ) -> column.ColumnBase: reflect, op = self._check_reflected_op(op) # Due to https://github.com/pandas-dev/pandas/issues/46332 we need to # support binary operations between empty or all null string columns @@ -6229,7 +6229,7 @@ def _binaryop( if other is NotImplemented: return NotImplemented - if isinstance(other, (StringColumn, str, cudf.Scalar)): + if isinstance(other, (StringColumn, cudf.Scalar)): if isinstance(other, cudf.Scalar) and other.dtype != "O": if op in { "__eq__", @@ -6279,9 +6279,7 @@ def _binaryop( "NULL_NOT_EQUALS", }: lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop( - lhs=lhs, rhs=rhs, op=op, dtype="bool" - ) + return binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype="bool") return NotImplemented @copy_docstring(column.ColumnBase.view) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index ccc9ef2b3f6..f3a7916aa35 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -13,9 +13,8 @@ import cudf import cudf.core.column.column as column import cudf.core.column.string as string -from cudf import _lib as libcudf from cudf.api.types import is_scalar -from cudf.core._internals import unary +from cudf.core._internals import binaryop, unary from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.utils.dtypes import np_to_pa_dtype @@ -188,8 +187,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: this = self.astype(common_dtype).astype(out_dtype) if isinstance(other, cudf.Scalar): if other.is_valid(): - other = other.value.astype(common_dtype).astype( - out_dtype + other = cudf.Scalar( + other.value.astype(common_dtype).astype(out_dtype) ) else: other = cudf.Scalar(None, out_dtype) @@ -219,10 +218,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: lhs, rhs = (other, this) if reflect else (this, other) - result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if cudf.get_option( - 
"mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): + result = binaryop.binaryop(lhs, rhs, op, out_dtype) + if cudf.get_option("mode.pandas_compatible") and out_dtype.kind == "b": result = result.fillna(op == "__ne__") return result diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index cd7fe5ee023..4d6f4ea73a8 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -9,7 +9,7 @@ from numba.core.utils import pysignature import cudf -from cudf import _lib as libcudf +from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column import column from cudf.utils import utils @@ -121,7 +121,7 @@ def make_aggregate_nullmask(df, columns=None, op="__and__"): nullmask.copy(), dtype=utils.mask_dtype ) else: - out_mask = libcudf.binaryop.binaryop( + out_mask = binaryop.binaryop( nullmask, out_mask, op, out_mask.dtype ) From cd3e352be06795b825828156da10ba83e1e8939f Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:38:35 -0500 Subject: [PATCH 23/78] Migrate `cudf::io::merge_row_group_metadata` to pylibcudf (#17491) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17491 --- python/cudf/cudf/_lib/parquet.pyx | 22 ++++---------- python/pylibcudf/pylibcudf/io/parquet.pxd | 2 ++ python/pylibcudf/pylibcudf/io/parquet.pyi | 1 + python/pylibcudf/pylibcudf/io/parquet.pyx | 36 +++++++++++++++++++++-- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d4bd0cd306c..6c80120ad6e 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -14,8 +14,6 @@ except ImportError: import numpy as np -from cython.operator cimport dereference - from cudf.api.types import is_list_like from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io @@ -25,7 +23,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector @@ -35,7 +33,6 @@ from pylibcudf.io.parquet cimport ChunkedParquetReader from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_writer_options, - merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_writer_options, write_parquet as parquet_writer, @@ -64,6 +61,7 @@ import pylibcudf as plc from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT +from cython.operator cimport dereference cdef class BufferArrayFromVector: @@ -808,19 +806,9 @@ cpdef merge_filemetadata(object filemetadata_list): -------- cudf.io.parquet.merge_row_group_metadata """ - cdef vector[unique_ptr[vector[uint8_t]]] list_c - cdef vector[uint8_t] blob_c - cdef unique_ptr[vector[uint8_t]] output_c - - for blob_py in filemetadata_list: - blob_c = blob_py - list_c.push_back(move(make_unique[vector[uint8_t]](blob_c))) - - with nogil: - output_c = move(parquet_merge_metadata(list_c)) - - out_metadata_py = 
BufferArrayFromVector.from_unique_ptr(move(output_c)) - return np.asarray(out_metadata_py) + return np.asarray( + plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj + ) cdef statistics_freq _get_stat_freq(str statistics): diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 1a61c20d783..79080fa7243 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -91,3 +91,5 @@ cdef class ParquetWriterOptionsBuilder: cpdef ParquetWriterOptions build(self) cpdef memoryview write_parquet(ParquetWriterOptions options) + +cpdef memoryview merge_row_group_metadata(list metdata_list) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index eb2ca68109b..3eb3d7c3a92 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -78,3 +78,4 @@ class ParquetWriterOptionsBuilder: def build(self) -> ParquetWriterOptions: ... def write_parquet(options: ParquetWriterOptions) -> memoryview: ... +def merge_row_group_metadata(metdata_list: list) -> memoryview: ... diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index b95b1f39de1..93843c932ad 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -2,7 +2,7 @@ from cython.operator cimport dereference from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport unique_ptr, make_unique from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector @@ -22,6 +22,7 @@ from pylibcudf.libcudf.io.parquet cimport ( read_parquet as cpp_read_parquet, write_parquet as cpp_write_parquet, parquet_writer_options, + merge_row_group_metadata as cpp_merge_row_group_metadata, ) from pylibcudf.libcudf.io.types cimport ( compression_type, @@ -38,10 +39,10 @@ __all__ = [ "ParquetWriterOptions", "ParquetWriterOptionsBuilder", "read_parquet", - "write_parquet" + "write_parquet", + "merge_row_group_metadata", ] - cdef parquet_reader_options _setup_parquet_reader_options( SourceInfo source_info, list columns = None, @@ -577,3 +578,32 @@ cpdef memoryview write_parquet(ParquetWriterOptions options): c_result = cpp_write_parquet(c_options) return memoryview(HostBuffer.from_unique_ptr(move(c_result))) + + +cpdef memoryview merge_row_group_metadata(list metdata_list): + """ + Merges multiple raw metadata blobs that were previously + created by write_parquet into a single metadata blob. + + For details, see :cpp:func:`merge_row_group_metadata`. 
+ + Parameters + ---------- + metdata_list : list + List of input file metadata + + Returns + ------- + memoryview + A parquet-compatible blob that contains the data for all row groups in the list + """ + cdef vector[unique_ptr[vector[uint8_t]]] list_c + cdef unique_ptr[vector[uint8_t]] output_c + + for blob in metdata_list: + list_c.push_back(move(make_unique[vector[uint8_t]]( blob))) + + with nogil: + output_c = move(cpp_merge_row_group_metadata(list_c)) + + return memoryview(HostBuffer.from_unique_ptr(move(output_c))) From 47e49d04281da3f488bc0d954b366b272c08d316 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Dec 2024 12:55:18 -0800 Subject: [PATCH 24/78] Fix groupby(as_index=False).size not resetting index (#17499) closes #17478 Also fixes a bug where the `Series.name` attribute wasn't preserved with `size` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17499 --- python/cudf/cudf/core/groupby/groupby.py | 7 +++++-- python/cudf/cudf/tests/test_groupby.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 29ab3b60d9d..0f12f266a95 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -497,11 +497,14 @@ def size(self): col = cudf.core.column.column_empty( len(self.obj), "int8", masked=False ) - return ( - cudf.Series._from_column(col) + result = ( + cudf.Series._from_column(col, name=getattr(self.obj, "name", None)) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) + if not self._as_index: + result = result.rename("size").reset_index() + return result @_performance_tracking def cumcount(self, ascending: bool = True): diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index eae0fd23ef8..d8a2528230e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -4074,3 +4074,17 @@ def test_get_group_list_like(): with pytest.raises(KeyError): df.groupby(["a"]).get_group([1]) + + +def test_size_as_index_false(): + df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 2, 3]}, columns=["a", "b"]) + expected = df.groupby("a", as_index=False).size() + result = cudf.from_pandas(df).groupby("a", as_index=False).size() + assert_eq(result, expected) + + +def test_size_series_with_name(): + ser = pd.Series(range(3), name="foo") + expected = ser.groupby(ser).size() + result = cudf.from_pandas(ser).groupby(ser).size() + assert_eq(result, expected) From 1b82963df736f3ad71b003443a4de1414f3ce2e5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 4 Dec 2024 16:33:24 -0500 Subject: [PATCH 25/78] Fix libcudf compile error when logging is disabled (#17512) Adds `[[maybe_unused]]` to the `compression_type_name` function to prevent the warning/error.
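The failure mode, sketched with illustrative names (this is not the actual libcudf code): a helper referenced only from logging calls becomes unreferenced when logging is compiled out, and the unused-function warning then fails the build under -Werror unless the helper is annotated.
```
#include <cstdio>
#include <string>

// Sketch: when LOGGING is not defined, the only call site of name_of() is
// compiled out, so -Wunused-function (an error under -Werror) would fire
// without the [[maybe_unused]] annotation.
[[maybe_unused]] static std::string name_of(int code)
{
  return "compression-" + std::to_string(code);
}

void report(int code)
{
#ifdef LOGGING
  std::fprintf(stderr, "%s\n", name_of(code).c_str());
#else
  (void)code;  // parameter is likewise unused when logging is off
#endif
}
```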
Error/warning introduced in #17431 Closes #17510 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - MithunR (https://github.com/mythrocks) - Vukasin Milovanovic (https://github.com/vuule) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17512 --- cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index b8bf8be6d2d..9d3cf75a13f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -70,7 +70,7 @@ auto batched_decompress_async(compression_type compression, Args&&... args) } } -std::string compression_type_name(compression_type compression) +[[maybe_unused]] std::string compression_type_name(compression_type compression) { switch (compression) { case compression_type::SNAPPY: return "Snappy"; From fbc32563809f509c0186081e6012f72a0e83ebcd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 5 Dec 2024 12:08:37 -0600 Subject: [PATCH 26/78] Force Thrust to use 32-bit offset type. (#17523) This fixes the patch we use for Thrust to always get a 32-bit offset type. The net effect of this patch is that we are behaving as if `THRUST_FORCE_32_BIT_OFFSET_TYPE` is set. This replaces a previous patch which I mistakenly did not update between CCCL 2.6.x testing and 2.7.0-rc2 testing. In the future we hope to configure this with CMake and drop the patches, but that will require us to use features from https://github.com/NVIDIA/cccl/pull/2844 (which is not available in 2.7.0-rc2). Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17523 --- .../thrust_disable_64bit_dispatching.diff | 75 +++++-------------- 1 file changed, 19 insertions(+), 56 deletions(-) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index 291eabe25fd..9f68d85e7db 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,59 +1,22 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 971b93d62..0d6b25b07 100644 +index 3d004aa55..71ce86bea 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -36,16 +36,15 @@ - * that callables for both branches consist of the same tokens, and is intended to be used with Thrust-style dispatch - * interfaces, that always deduce the size type from the arguments. 
- */ --#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ -- if (count <= thrust::detail::integer_traits::const_max) \ -- { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -- } \ -- else \ -- { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ -+ if (count <= thrust::detail::integer_traits::const_max) \ -+ { \ -+ auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -+ status = call arguments; \ -+ } \ -+ else \ -+ { \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -55,18 +54,16 @@ - * - * This version of the macro supports providing two count variables, which is necessary for set algorithms. - */ --#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ -- if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ -- { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -- } \ -- else \ -- { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \ -+ if (count1 + count2 <= thrust::detail::integer_traits::const_max) \ -+ { \ -+ auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -+ auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -+ status = call arguments; \ -+ } \ -+ else \ -+ { \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** +@@ -63,7 +63,7 @@ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) + +-#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) ++#if 0 + //! @brief Always dispatches to 64 bit offset version of an algorithm + # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ +@@ -89,7 +89,7 @@ + _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ + _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) + +-#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) ++#elif 1 + + //! 
@brief Ensures that the size of the input does not overflow the offset type + # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ From 06e937b7be83c69e94e27e1dc50e98755d341d2c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 15:43:17 -0800 Subject: [PATCH 27/78] Remove cudf._lib.merge in favor of inlining pylibcudf (#17370) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17370 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/merge.pyx | 47 -------------------------- python/cudf/cudf/core/reshape.py | 50 ++++++++++++++++++++++------ 4 files changed, 39 insertions(+), 60 deletions(-) delete mode 100644 python/cudf/cudf/_lib/merge.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index dd27aae7133..e3d9a48e2ba 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -19,7 +19,6 @@ set(cython_sources csv.pyx groupby.pyx interop.pyx - merge.pyx orc.pyx parquet.pyx reduce.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cdf7cbe13c4..cb2d0501fea 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -6,7 +6,6 @@ csv, groupby, interop, - merge, nvtext, orc, parquet, diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx deleted file mode 100644 index 9372acdab44..00000000000 --- a/python/cudf/cudf/_lib/merge.pyx +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -def merge_sorted( - list input_columns, - list key_columns_indices, - bool ascending=True, - str na_position="last", -): - """Merge multiple lists of lexicographically sorted columns into one list - of sorted columns. `input_columns` is a list of lists of columns to be - merged. 
- """ - c_input_tables = [ - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ) for source_columns in input_columns - ] - - num_keys = len(key_columns_indices) - - column_order = ( - pylibcudf.types.Order.ASCENDING if ascending - else pylibcudf.types.Order.DESCENDING - ) - - if not ascending: - na_position = "last" if na_position == "first" else "first" - null_precedence = ( - pylibcudf.types.NullOrder.BEFORE if na_position == "first" - else pylibcudf.types.NullOrder.AFTER - ) - - return columns_from_pylibcudf_table( - pylibcudf.merge.merge( - c_input_tables, - key_columns_indices, - [column_order] * num_keys, - [null_precedence] * num_keys, - ) - ) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a6815da62c6..84c653c5b3f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -8,7 +8,10 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf +from cudf._lib.column import Column from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default @@ -941,21 +944,46 @@ def _merge_sorted( idx + objs[0].index.nlevels for idx in key_columns_indices ] - columns = [ - [ - *(obj.index._columns if not ignore_index else ()), - *obj._columns, - ] + columns = ( + itertools.chain(obj.index._columns, obj._columns) + if not ignore_index + else obj._columns for obj in objs + ) + + input_tables = [ + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + for source_columns in columns + ] + + num_keys = len(key_columns_indices) + + column_order = ( + plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING + ) + + if not ascending: + na_position = "last" if na_position == "first" else "first" + + null_precedence = ( + plc.types.NullOrder.BEFORE + if na_position == "first" + else plc.types.NullOrder.AFTER + ) + + plc_table = plc.merge.merge( + input_tables, + key_columns_indices, + [column_order] * num_keys, + [null_precedence] * num_keys, + ) + + result_columns = [ + Column.from_pylibcudf(col) for col in plc_table.columns() ] return objs[0]._from_columns_like_self( - cudf._lib.merge.merge_sorted( - input_columns=columns, - key_columns_indices=key_columns_indices, - ascending=ascending, - na_position=na_position, - ), + result_columns, column_names=objs[0]._column_names, index_names=None if ignore_index else objs[0]._index_names, ) From c0a4c6ca47515ac368b62582ecd2a7af241b0238 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:26:59 -0800 Subject: [PATCH 28/78] Move cudf._lib.aggregation to cudf.core._internals (#17516) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17516 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/aggregation.pyx | 245 --------------- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/_lib/reduce.pyx | 2 +- .../cudf/cudf/core/_internals/aggregation.py | 288 ++++++++++++++++++ python/cudf/cudf/core/window/rolling.py | 2 +- 6 files changed, 291 insertions(+), 249 deletions(-) delete mode 100644 python/cudf/cudf/_lib/aggregation.pyx create mode 100644 python/cudf/cudf/core/_internals/aggregation.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 
e3d9a48e2ba..2f05101e8e3 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,7 +13,6 @@ # ============================================================================= set(cython_sources - aggregation.pyx column.pyx copying.pyx csv.pyx diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx deleted file mode 100644 index 3c96b90f0a1..00000000000 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pandas as pd -from numba.np import numpy_support - -import pylibcudf - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf.utils import cudautils - -_agg_name_map = { - "COUNT_VALID": "COUNT", - "COUNT_ALL": "SIZE", - "VARIANCE": "VAR", - "NTH_ELEMENT": "NTH", - "COLLECT_LIST": "COLLECT", - "COLLECT_SET": "UNIQUE", -} - - -class Aggregation: - def __init__(self, agg): - self.c_obj = agg - - @property - def kind(self): - name = self.c_obj.kind().name - return _agg_name_map.get(name, name) - - @classmethod - def sum(cls): - return cls(pylibcudf.aggregation.sum()) - - @classmethod - def min(cls): - return cls(pylibcudf.aggregation.min()) - - @classmethod - def max(cls): - return cls(pylibcudf.aggregation.max()) - - @classmethod - def idxmin(cls): - return cls(pylibcudf.aggregation.argmin()) - - @classmethod - def idxmax(cls): - return cls(pylibcudf.aggregation.argmax()) - - @classmethod - def mean(cls): - return cls(pylibcudf.aggregation.mean()) - - @classmethod - def count(cls, dropna=True): - return cls(pylibcudf.aggregation.count( - pylibcudf.types.NullPolicy.EXCLUDE - if dropna else pylibcudf.types.NullPolicy.INCLUDE - )) - - @classmethod - def ewma(cls, com=1.0, adjust=True): - return cls(pylibcudf.aggregation.ewma( - com, - pylibcudf.aggregation.EWMHistory.INFINITE - if adjust else pylibcudf.aggregation.EWMHistory.FINITE - )) - - @classmethod - def size(cls): - return cls(pylibcudf.aggregation.count(pylibcudf.types.NullPolicy.INCLUDE)) - - @classmethod - def collect(cls): - return cls( - pylibcudf.aggregation.collect_list(pylibcudf.types.NullPolicy.INCLUDE) - ) - - @classmethod - def nunique(cls, dropna=True): - return cls(pylibcudf.aggregation.nunique( - pylibcudf.types.NullPolicy.EXCLUDE - if dropna else pylibcudf.types.NullPolicy.INCLUDE - )) - - @classmethod - def nth(cls, size): - return cls(pylibcudf.aggregation.nth_element(size)) - - @classmethod - def product(cls): - return cls(pylibcudf.aggregation.product()) - prod = product - - @classmethod - def sum_of_squares(cls): - return cls(pylibcudf.aggregation.sum_of_squares()) - - @classmethod - def var(cls, ddof=1): - return cls(pylibcudf.aggregation.variance(ddof)) - - @classmethod - def std(cls, ddof=1): - return cls(pylibcudf.aggregation.std(ddof)) - - @classmethod - def median(cls): - return cls(pylibcudf.aggregation.median()) - - @classmethod - def quantile(cls, q=0.5, interpolation="linear"): - if not pd.api.types.is_list_like(q): - q = [q] - - return cls(pylibcudf.aggregation.quantile( - q, pylibcudf.types.Interpolation[interpolation.upper()] - )) - - @classmethod - def unique(cls): - return cls(pylibcudf.aggregation.collect_set( - pylibcudf.types.NullPolicy.INCLUDE, - pylibcudf.types.NullEquality.EQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL, - - )) - - @classmethod - def first(cls): - return cls( - pylibcudf.aggregation.nth_element(0, pylibcudf.types.NullPolicy.EXCLUDE) - ) - - @classmethod - def last(cls): - return cls( - 
pylibcudf.aggregation.nth_element(-1, pylibcudf.types.NullPolicy.EXCLUDE) - ) - - @classmethod - def corr(cls, method, min_periods): - return cls(pylibcudf.aggregation.correlation( - pylibcudf.aggregation.CorrelationType[method.upper()], - min_periods - - )) - - @classmethod - def cov(cls, min_periods, ddof=1): - return cls(pylibcudf.aggregation.covariance( - min_periods, - ddof - )) - - # scan aggregations - @classmethod - def cumcount(cls): - return cls.count(False) - - cumsum = sum - cummin = min - cummax = max - cumprod = product - - @classmethod - def rank(cls, method, ascending, na_option, pct): - return cls(pylibcudf.aggregation.rank( - pylibcudf.aggregation.RankMethod[method.upper()], - (pylibcudf.types.Order.ASCENDING if ascending else - pylibcudf.types.Order.DESCENDING), - (pylibcudf.types.NullPolicy.EXCLUDE if na_option == "keep" else - pylibcudf.types.NullPolicy.INCLUDE), - (pylibcudf.types.NullOrder.BEFORE - if (na_option == "top") == ascending else - pylibcudf.types.NullOrder.AFTER), - (pylibcudf.aggregation.RankPercentage.ZERO_NORMALIZED - if pct else - pylibcudf.aggregation.RankPercentage.NONE) - - )) - - # Reduce aggregations - @classmethod - def any(cls): - return cls(pylibcudf.aggregation.any()) - - @classmethod - def all(cls): - return cls(pylibcudf.aggregation.all()) - - # Rolling aggregations - @classmethod - def from_udf(cls, op, *args, **kwargs): - # Handling UDF type - nb_type = numpy_support.from_dtype(kwargs['dtype']) - type_signature = (nb_type[:],) - ptx_code, output_dtype = cudautils.compile_udf(op, type_signature) - output_np_dtype = cudf.dtype(output_dtype) - if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: - raise TypeError(f"Result of window function has unsupported dtype {op[1]}") - - return cls( - pylibcudf.aggregation.udf( - ptx_code, - pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype]), - ) - ) - - -def make_aggregation(op, kwargs=None): - r""" - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. - \*\*kwargs : dict, optional - Any keyword arguments to be passed to the op. - - Returns - ------- - Aggregation - """ - if kwargs is None: - kwargs = {} - - if isinstance(op, str): - return getattr(Aggregation, op)(**kwargs) - elif callable(op): - if op is list: - return Aggregation.collect() - elif "dtype" in kwargs: - return Aggregation.from_udf(op, **kwargs) - else: - return op(Aggregation) - raise TypeError(f"Unknown aggregation {op}") diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 4e712be6738..80a77ef2267 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -20,7 +20,7 @@ from cudf._lib.scalar import as_device_scalar import pylibcudf -from cudf._lib.aggregation import make_aggregation +from cudf.core._internals.aggregation import make_aggregation # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. 
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 944753d28b8..2850cab93a1 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -10,7 +10,7 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id import pylibcudf -from cudf._lib.aggregation import make_aggregation +from cudf.core._internals.aggregation import make_aggregation @acquire_spill_lock() diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py new file mode 100644 index 00000000000..fe8ea5a947a --- /dev/null +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -0,0 +1,288 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +from numba.np import numpy_support + +import pylibcudf as plc + +import cudf +from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES +from cudf.api.types import is_scalar +from cudf.utils import cudautils + +if TYPE_CHECKING: + from collections.abc import Callable + + from typing_extensions import Self + +_agg_name_map = { + "COUNT_VALID": "COUNT", + "COUNT_ALL": "SIZE", + "VARIANCE": "VAR", + "NTH_ELEMENT": "NTH", + "COLLECT_LIST": "COLLECT", + "COLLECT_SET": "UNIQUE", +} + + +class Aggregation: + def __init__(self, agg: plc.aggregation.Aggregation) -> None: + self.c_obj = agg + + @property + def kind(self) -> str: + name = self.c_obj.kind().name + return _agg_name_map.get(name, name) + + @classmethod + def sum(cls) -> Self: + return cls(plc.aggregation.sum()) + + @classmethod + def min(cls) -> Self: + return cls(plc.aggregation.min()) + + @classmethod + def max(cls) -> Self: + return cls(plc.aggregation.max()) + + @classmethod + def idxmin(cls) -> Self: + return cls(plc.aggregation.argmin()) + + @classmethod + def idxmax(cls) -> Self: + return cls(plc.aggregation.argmax()) + + @classmethod + def mean(cls) -> Self: + return cls(plc.aggregation.mean()) + + @classmethod + def count(cls, dropna: bool = True) -> Self: + return cls( + plc.aggregation.count( + plc.types.NullPolicy.EXCLUDE + if dropna + else plc.types.NullPolicy.INCLUDE + ) + ) + + @classmethod + def ewma(cls, com: float = 1.0, adjust: bool = True) -> Self: + return cls( + plc.aggregation.ewma( + com, + plc.aggregation.EWMHistory.INFINITE + if adjust + else plc.aggregation.EWMHistory.FINITE, + ) + ) + + @classmethod + def size(cls) -> Self: + return cls(plc.aggregation.count(plc.types.NullPolicy.INCLUDE)) + + @classmethod + def collect(cls) -> Self: + return cls(plc.aggregation.collect_list(plc.types.NullPolicy.INCLUDE)) + + @classmethod + def nunique(cls, dropna: bool = True) -> Self: + return cls( + plc.aggregation.nunique( + plc.types.NullPolicy.EXCLUDE + if dropna + else plc.types.NullPolicy.INCLUDE + ) + ) + + @classmethod + def nth(cls, size: int) -> Self: + return cls(plc.aggregation.nth_element(size)) + + @classmethod + def product(cls) -> Self: + return cls(plc.aggregation.product()) + + prod = product + + @classmethod + def sum_of_squares(cls) -> Self: + return cls(plc.aggregation.sum_of_squares()) + + @classmethod + def var(cls, ddof: int = 1) -> Self: + return cls(plc.aggregation.variance(ddof)) + + @classmethod + def std(cls, ddof: int = 1) -> Self: + return cls(plc.aggregation.std(ddof)) + + @classmethod + def median(cls) -> Self: + return cls(plc.aggregation.median()) + + @classmethod + def quantile( + cls, + q: float | list[float] = 0.5, + interpolation: Literal[ + "linear", "lower", "higher", 
"midpoint", "nearest" + ] = "linear", + ) -> Self: + return cls( + plc.aggregation.quantile( + [q] if is_scalar(q) else q, + plc.types.Interpolation[interpolation.upper()], + ) + ) + + @classmethod + def unique(cls) -> Self: + return cls( + plc.aggregation.collect_set( + plc.types.NullPolicy.INCLUDE, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + ) + + @classmethod + def first(cls) -> Self: + return cls( + plc.aggregation.nth_element(0, plc.types.NullPolicy.EXCLUDE) + ) + + @classmethod + def last(cls) -> Self: + return cls( + plc.aggregation.nth_element(-1, plc.types.NullPolicy.EXCLUDE) + ) + + @classmethod + def corr(cls, method, min_periods) -> Self: + return cls( + plc.aggregation.correlation( + plc.aggregation.CorrelationType[method.upper()], min_periods + ) + ) + + @classmethod + def cov(cls, min_periods: int, ddof: int = 1) -> Self: + return cls(plc.aggregation.covariance(min_periods, ddof)) + + # scan aggregations + @classmethod + def cumcount(cls) -> Self: + return cls.count(False) + + cumsum = sum + cummin = min + cummax = max + cumprod = product + + @classmethod + def rank( + cls, + method: Literal["first", "average", "min", "max", "dense"], + ascending: bool, + na_option: Literal["keep", "top", "bottom"], + pct: bool, + ) -> Self: + return cls( + plc.aggregation.rank( + plc.aggregation.RankMethod[method.upper()], + ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING + ), + ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ), + ( + plc.types.NullOrder.BEFORE + if (na_option == "top") == ascending + else plc.types.NullOrder.AFTER + ), + ( + plc.aggregation.RankPercentage.ZERO_NORMALIZED + if pct + else plc.aggregation.RankPercentage.NONE + ), + ) + ) + + # Reduce aggregations + @classmethod + def any(cls) -> Self: + return cls(plc.aggregation.any()) + + @classmethod + def all(cls) -> Self: + return cls(plc.aggregation.all()) + + # Rolling aggregations + @classmethod + def from_udf(cls, op, *args, **kwargs) -> Self: + # Handling UDF type + nb_type = numpy_support.from_dtype(kwargs["dtype"]) + type_signature = (nb_type[:],) + ptx_code, output_dtype = cudautils.compile_udf(op, type_signature) + output_np_dtype = cudf.dtype(output_dtype) + if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + raise TypeError( + f"Result of window function has unsupported dtype {op[1]}" + ) + + return cls( + plc.aggregation.udf( + ptx_code, + plc.DataType( + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype] + ), + ) + ) + + +def make_aggregation( + op: str | Callable, kwargs: dict | None = None +) -> Aggregation: + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. 
+ + Returns + ------- + Aggregation + """ + if kwargs is None: + kwargs = {} + + if isinstance(op, str): + return getattr(Aggregation, op)(**kwargs) + elif callable(op): + if op is list: + return Aggregation.collect() + elif "dtype" in kwargs: + return Aggregation.from_udf(op, **kwargs) + else: + return op(Aggregation) + raise TypeError(f"Unknown aggregation {op}") diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d2cb5e8c190..a580c35ccbf 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -12,8 +12,8 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.aggregation import make_aggregation from cudf.api.types import is_integer, is_number +from cudf.core._internals.aggregation import make_aggregation from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column from cudf.core.mixins import Reducible From 84690b5fe5f995937214552826d3541041cb37ab Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 5 Dec 2024 19:35:53 -0500 Subject: [PATCH 29/78] Migrate copy_column and Column.from_scalar to pylibcudf (#17513) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17513 --- python/cudf/cudf/_lib/column.pyx | 14 ++++++-------- python/cudf/cudf/_lib/copying.pyx | 15 +++------------ 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9cbe11d61ac..245a5d03981 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -42,8 +42,7 @@ cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary from pylibcudf.libcudf.column.column cimport column, column_contents from pylibcudf.libcudf.column.column_factories cimport ( - make_column_from_scalar as cpp_make_column_from_scalar, - make_numeric_column, + make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count @@ -840,9 +839,8 @@ cdef class Column: @staticmethod def from_scalar(py_val, size_type size): - cdef DeviceScalar val = py_val.device_value - cdef const scalar* c_val = val.get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move(cpp_make_column_from_scalar(c_val[0], size)) - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + pylibcudf.Column.from_scalar( + py_val.device_value.c_value, size + ) + ) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4dfb12d8ab3..1f3f03f4be1 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -3,8 +3,6 @@ import pickle from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move import pylibcudf import cudf @@ -18,10 +16,6 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.reduce import minmax -from libcpp.memory cimport make_unique - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table @@ -59,12 +53,9 @@ def copy_column(Column input_column): ------- Deep copied column """ - cdef unique_ptr[column] c_result 
- cdef column_view input_column_view = input_column.view() - with nogil: - c_result = move(make_unique[column](input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + input_column.to_pylibcudf(mode="read").copy() + ) @acquire_spill_lock() From 169a45a751862cccaf9898d6d83eb695c4d7b9bf Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 6 Dec 2024 07:38:31 -0500 Subject: [PATCH 30/78] Plumb pylibcudf.io.parquet options classes through cudf python (#17506) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17506 --- python/cudf/cudf/_lib/parquet.pyx | 266 ++++++++------------ python/pylibcudf/pylibcudf/io/parquet.pxd | 47 ++++ python/pylibcudf/pylibcudf/io/parquet.pyi | 30 +++ python/pylibcudf/pylibcudf/io/parquet.pyx | 289 +++++++++++++++++++++- 4 files changed, 464 insertions(+), 168 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 6c80120ad6e..c77c9875342 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -3,7 +3,7 @@ import io import pyarrow as pa - +import itertools import cudf from cudf.core.buffer import acquire_spill_lock @@ -22,45 +22,31 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool -from libcpp.map cimport map from libcpp.memory cimport unique_ptr -from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.parquet cimport ( - chunked_parquet_writer_options, - parquet_chunked_writer as cpp_parquet_chunked_writer, - parquet_writer_options, - write_parquet as parquet_writer, -) from pylibcudf.libcudf.io.types cimport ( - sink_info, - column_in_metadata, - table_input_metadata, - partition_info, statistics_freq, compression_type, dictionary_policy, ) -from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( add_df_col_struct_names, - make_sinks_info, ) -from cudf._lib.utils cimport table_view_from_table import pylibcudf as plc from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT +from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata +from pylibcudf.io.parquet cimport ParquetChunkedWriter from cython.operator cimport dereference @@ -440,44 +426,34 @@ def write_parquet( -------- cudf.io.parquet.write_parquet """ - - # Create the write options - cdef table_input_metadata tbl_meta - - cdef vector[map[string, string]] user_data - cdef table_view tv - cdef vector[unique_ptr[data_sink]] _data_sinks - cdef sink_info sink = make_sinks_info( - filepaths_or_buffers, _data_sinks - ) - if index is True or ( index is None and not isinstance(table._index, cudf.RangeIndex) ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) + columns = [*table.index._columns, *table._columns] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) + tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): 
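+            # Record each pandas index level's name in the parquet column metadata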
tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) + _index_level_name(idx_name, level, table._column_names) ) num_index_cols_meta = len(table._index.names) else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = TableInputMetadata(plc_table) num_index_cols_meta = 0 for i, name in enumerate(table._column_names, num_index_cols_meta): if not isinstance(name, str): if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name).encode()) + tbl_meta.column_metadata[i].set_name(str(name)) else: raise ValueError( "Writing a Parquet file requires string column names" ) else: - tbl_meta.column_metadata[i].set_name(name.encode()) + tbl_meta.column_metadata[i].set_name(name) _set_col_metadata( table[name]._column, @@ -489,21 +465,16 @@ def write_parquet( column_type_length, output_as_binary ) - - cdef map[string, string] tmp_user_data if partitions_info is not None: - for start_row, num_row in partitions_info: - partitioned_df = table.iloc[start_row: start_row + num_row].copy( - deep=False - ) - pandas_metadata = generate_pandas_metadata(partitioned_df, index) - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - user_data.push_back(tmp_user_data) - tmp_user_data.clear() + user_data = [ + {"pandas": generate_pandas_metadata( + table.iloc[start_row:start_row + num_row].copy(deep=False), + index + )} + for start_row, num_row in partitions_info + ] else: - pandas_metadata = generate_pandas_metadata(table, index) - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - user_data.push_back(tmp_user_data) + user_data = [{"pandas": generate_pandas_metadata(table, index)}] if header_version not in ("1.0", "2.0"): raise ValueError( @@ -519,20 +490,15 @@ def write_parquet( comp_type = _get_comp_type(compression) stat_freq = _get_stat_freq(statistics) - - cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef vector[string] c_column_chunks_file_paths - cdef bool _int96_timestamps = int96_timestamps - cdef vector[partition_info] partitions - - # Perform write - cdef parquet_writer_options args = move( - parquet_writer_options.builder(sink, tv) + options = ( + plc.io.parquet.ParquetWriterOptions.builder( + plc.io.SinkInfo(filepaths_or_buffers), plc_table + ) .metadata(tbl_meta) - .key_value_metadata(move(user_data)) + .key_value_metadata(user_data) .compression(comp_type) .stats_level(stat_freq) - .int96_timestamps(_int96_timestamps) + .int96_timestamps(int96_timestamps) .write_v2_headers(header_version == "2.0") .dictionary_policy(dict_policy) .utc_timestamps(False) @@ -540,40 +506,27 @@ def write_parquet( .build() ) if partitions_info is not None: - partitions.reserve(len(partitions_info)) - for part in partitions_info: - partitions.push_back( - partition_info(part[0], part[1]) - ) - args.set_partitions(move(partitions)) + options.set_partitions( + [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info] + ) if metadata_file_path is not None: if is_list_like(metadata_file_path): - for path in metadata_file_path: - c_column_chunks_file_paths.push_back(str.encode(path)) + options.set_column_chunks_file_paths(metadata_file_path) else: - c_column_chunks_file_paths.push_back( - str.encode(metadata_file_path) - ) - args.set_column_chunks_file_paths(move(c_column_chunks_file_paths)) + 
options.set_column_chunks_file_paths([metadata_file_path]) if row_group_size_bytes is not None: - args.set_row_group_size_bytes(row_group_size_bytes) + options.set_row_group_size_bytes(row_group_size_bytes) if row_group_size_rows is not None: - args.set_row_group_size_rows(row_group_size_rows) + options.set_row_group_size_rows(row_group_size_rows) if max_page_size_bytes is not None: - args.set_max_page_size_bytes(max_page_size_bytes) + options.set_max_page_size_bytes(max_page_size_bytes) if max_page_size_rows is not None: - args.set_max_page_size_rows(max_page_size_rows) + options.set_max_page_size_rows(max_page_size_rows) if max_dictionary_size is not None: - args.set_max_dictionary_size(max_dictionary_size) - - with nogil: - out_metadata_c = move(parquet_writer(args)) - + options.set_max_dictionary_size(max_dictionary_size) + blob = plc.io.parquet.write_parquet(options) if metadata_file_path is not None: - out_metadata_py = BufferArrayFromVector.from_unique_ptr( - move(out_metadata_c) - ) - return np.asarray(out_metadata_py) + return np.asarray(blob.obj) else: return None @@ -624,10 +577,9 @@ cdef class ParquetWriter: cudf.io.parquet.write_parquet """ cdef bool initialized - cdef unique_ptr[cpp_parquet_chunked_writer] writer - cdef table_input_metadata tbl_meta - cdef sink_info sink - cdef vector[unique_ptr[data_sink]] _data_sink + cdef ParquetChunkedWriter writer + cdef SinkInfo sink + cdef TableInputMetadata tbl_meta cdef str statistics cdef object compression cdef object index @@ -653,7 +605,7 @@ cdef class ParquetWriter: if is_list_like(filepath_or_buffer) else [filepath_or_buffer] ) - self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) + self.sink = plc.io.SinkInfo(filepaths_or_buffers) self.statistics = statistics self.compression = compression self.index = index @@ -673,52 +625,29 @@ cdef class ParquetWriter: table, num_partitions=len(partitions_info) if partitions_info else 1 ) - - cdef table_view tv if self.index is not False and ( table._index.name is not None or isinstance(table._index, cudf.core.multiindex.MultiIndex)): - tv = table_view_from_table(table) + columns = [*table.index._columns, *table._columns] + plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) else: - tv = table_view_from_table(table, ignore_index=True) - - cdef vector[partition_info] partitions - if partitions_info is not None: - for part in partitions_info: - partitions.push_back( - partition_info(part[0], part[1]) - ) - - with nogil: - self.writer.get()[0].write(tv, partitions) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.writer.write(plc_table, partitions_info) def close(self, object metadata_file_path=None): - cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef vector[string] column_chunks_file_paths - if not self.initialized: return None - - # Update metadata-collection options + column_chunks_file_paths=[] if metadata_file_path is not None: if is_list_like(metadata_file_path): - for path in metadata_file_path: - column_chunks_file_paths.push_back(str.encode(path)) + column_chunks_file_paths = list(metadata_file_path) else: - column_chunks_file_paths.push_back( - str.encode(metadata_file_path) - ) - - with nogil: - out_metadata_c = move( - self.writer.get()[0].close(column_chunks_file_paths) - ) - + column_chunks_file_paths = [metadata_file_path] + blob = self.writer.close(column_chunks_file_paths) if metadata_file_path is not None: - out_metadata_py = BufferArrayFromVector.from_unique_ptr( - move(out_metadata_c) 
- ) - return np.asarray(out_metadata_py) + return np.asarray(blob.obj) return None def __enter__(self): @@ -730,32 +659,44 @@ cdef class ParquetWriter: def _initialize_chunked_state(self, table, num_partitions=1): """ Prepares all the values required to build the chunked_parquet_writer_options and creates a writer""" - cdef table_view tv # Set the table_metadata num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True)) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in table._columns + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain(table.index._columns, table._columns) + ] + ) + self.tbl_meta = TableInputMetadata(plc_table) for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) - ) + self.tbl_meta.column_metadata[level].set_name(idx_name) num_index_cols_meta = len(table._index.names) else: if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - self.tbl_meta.column_metadata[0].set_name( - str.encode(table._index.name) + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] ) + self.tbl_meta = TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table._index.name) num_index_cols_meta = 1 for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) + self.tbl_meta.column_metadata[i].set_name(name) _set_col_metadata( table[name]._column, self.tbl_meta.column_metadata[i], @@ -764,13 +705,7 @@ cdef class ParquetWriter: index = ( False if isinstance(table._index, cudf.RangeIndex) else self.index ) - pandas_metadata = generate_pandas_metadata(table, index) - cdef map[string, string] tmp_user_data - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef vector[map[string, string]] user_data - user_data = vector[map[string, string]](num_partitions, tmp_user_data) - - cdef chunked_parquet_writer_options args + user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions cdef compression_type comp_type = _get_comp_type(self.compression) cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) cdef dictionary_policy dict_policy = ( @@ -778,23 +713,22 @@ cdef class ParquetWriter: if self.use_dictionary else plc.io.types.DictionaryPolicy.NEVER ) - with nogil: - args = move( - chunked_parquet_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(comp_type) - .stats_level(stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - args.set_dictionary_policy(dict_policy) - self.writer.reset(new cpp_parquet_chunked_writer(args)) + options = ( + plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + 
.stats_level(stat_freq) + .row_group_size_bytes(self.row_group_size_bytes) + .row_group_size_rows(self.row_group_size_rows) + .max_page_size_bytes(self.max_page_size_bytes) + .max_page_size_rows(self.max_page_size_rows) + .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) + .build() + ) + options.set_dictionary_policy(dict_policy) + self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) self.initialized = True @@ -837,7 +771,7 @@ cdef compression_type _get_comp_type(object compression): cdef _set_col_metadata( Column col, - column_in_metadata& col_meta, + ColumnInMetadata col_meta, bool force_nullable_schema=False, str path=None, object skip_compression=None, @@ -847,7 +781,7 @@ cdef _set_col_metadata( ): need_path = (skip_compression is not None or column_encoding is not None or column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name().decode('UTF-8') if need_path else None + name = col_meta.get_name() if need_path else None full_path = path + "." + name if path is not None else name if force_nullable_schema: @@ -880,7 +814,7 @@ cdef _set_col_metadata( for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): - col_meta.child(i).set_name(name.encode()) + col_meta.child(i).set_name(name) _set_col_metadata( child_col, col_meta.child(i), @@ -894,7 +828,7 @@ cdef _set_col_metadata( elif isinstance(col.dtype, cudf.ListDtype): if full_path is not None: full_path = full_path + ".list" - col_meta.child(1).set_name("element".encode()) + col_meta.child(1).set_name("element") _set_col_metadata( col.children[1], col_meta.child(1), diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 79080fa7243..7bd6ba91ca9 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -15,9 +15,12 @@ from pylibcudf.io.types cimport ( TableWithMetadata, ) from pylibcudf.libcudf.io.parquet cimport ( + parquet_chunked_writer as cpp_parquet_chunked_writer, chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_writer_options, parquet_writer_options_builder, + chunked_parquet_writer_options, + chunked_parquet_writer_options_builder, ) from pylibcudf.libcudf.types cimport size_type from pylibcudf.table cimport Table @@ -46,6 +49,50 @@ cpdef read_parquet( # DataType timestamp_type = * ) + +cdef class ParquetChunkedWriter: + cdef unique_ptr[cpp_parquet_chunked_writer] c_obj + cpdef memoryview close(self, list column_chunks_file_paths) + cpdef void write(self, Table table, object partitions_info=*) + + +cdef class ChunkedParquetWriterOptions: + cdef chunked_parquet_writer_options c_obj + cdef SinkInfo sink + + cpdef void set_dictionary_policy(self, dictionary_policy policy) + + +cdef class ChunkedParquetWriterOptionsBuilder: + cdef chunked_parquet_writer_options_builder c_obj + cdef SinkInfo sink + + cpdef ChunkedParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata) + + cpdef ChunkedParquetWriterOptionsBuilder key_value_metadata(self, list metadata) + + cpdef ChunkedParquetWriterOptionsBuilder compression( + self, + compression_type compression + ) + + cpdef ChunkedParquetWriterOptionsBuilder stats_level(self, statistics_freq sf) + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_bytes(self, size_t val) + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_rows(self, size_type val) + + cpdef ChunkedParquetWriterOptionsBuilder max_page_size_bytes(self, size_t val) + + cpdef 
ChunkedParquetWriterOptionsBuilder max_page_size_rows(self, size_type val)
+
+    cpdef ChunkedParquetWriterOptionsBuilder max_dictionary_size(self, size_t val)
+
+    cpdef ChunkedParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled)
+
+    cpdef ChunkedParquetWriterOptions build(self)
+
+
 cdef class ParquetWriterOptions:
     cdef parquet_writer_options c_obj
     cdef Table table_ref
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
index 3eb3d7c3a92..22bea1abd8e 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyi
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -78,4 +78,34 @@ class ParquetWriterOptionsBuilder:
     def build(self) -> ParquetWriterOptions: ...

 def write_parquet(options: ParquetWriterOptions) -> memoryview: ...
+
+class ParquetChunkedWriter:
+    def __init__(self): ...
+    def close(self, metadata_file_path: list) -> memoryview: ...
+    def write(self, table: Table) -> None: ...
+    @staticmethod
+    def from_options(options: ChunkedParquetWriterOptions) -> Self: ...
+
+class ChunkedParquetWriterOptions:
+    def __init__(self): ...
+    def set_dictionary_policy(self, policy: DictionaryPolicy) -> None: ...
+    @staticmethod
+    def builder(sink: SinkInfo) -> ChunkedParquetWriterOptionsBuilder: ...
+
+class ChunkedParquetWriterOptionsBuilder:
+    def __init__(self): ...
+    def metadata(self, metadata: TableInputMetadata) -> Self: ...
+    def key_value_metadata(
+        self, metadata: list[Mapping[str, str]]
+    ) -> Self: ...
+    def compression(self, compression: CompressionType) -> Self: ...
+    def stats_level(self, sf: StatisticsFreq) -> Self: ...
+    def row_group_size_bytes(self, val: int) -> Self: ...
+    def row_group_size_rows(self, val: int) -> Self: ...
+    def max_page_size_bytes(self, val: int) -> Self: ...
+    def max_page_size_rows(self, val: int) -> Self: ...
+    def max_dictionary_size(self, val: int) -> Self: ...
+    def write_arrow_schema(self, enabled: bool) -> Self: ...
+    def build(self) -> ChunkedParquetWriterOptions: ...
+
 def merge_row_group_metadata(metadata_list: list) -> memoryview: ...
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 93843c932ad..9bdf849a30c 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -22,6 +22,8 @@ from pylibcudf.libcudf.io.parquet cimport (
     read_parquet as cpp_read_parquet,
     write_parquet as cpp_write_parquet,
     parquet_writer_options,
+    parquet_chunked_writer as cpp_parquet_chunked_writer,
+    chunked_parquet_writer_options,
     merge_row_group_metadata as cpp_merge_row_group_metadata,
 )
 from pylibcudf.libcudf.io.types cimport (
@@ -40,6 +42,8 @@ __all__ = [
     "ParquetWriterOptionsBuilder",
     "read_parquet",
     "write_parquet",
+    "ChunkedParquetWriterOptions",
+    "ChunkedParquetWriterOptionsBuilder",
     "merge_row_group_metadata",
 ]

@@ -247,6 +251,288 @@ cpdef read_parquet(
     return TableWithMetadata.from_libcudf(c_result)


+cdef class ParquetChunkedWriter:
+    cpdef memoryview close(self, list metadata_file_path):
+        """
+        Closes the chunked Parquet writer.
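+
+        Call this once, after the final ``write``; the writer accepts no
+        further tables afterwards.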
+ + Parameters + ---------- + metadata_file_path: list + Column chunks file path to be set in the raw output metadata + + Returns + ------- + None + """ + cdef vector[string] column_chunks_file_paths + cdef unique_ptr[vector[uint8_t]] out_metadata_c + if metadata_file_path: + for path in metadata_file_path: + column_chunks_file_paths.push_back(path.encode()) + with nogil: + out_metadata_c = move(self.c_obj.get()[0].close(column_chunks_file_paths)) + return memoryview(HostBuffer.from_unique_ptr(move(out_metadata_c))) + + cpdef void write(self, Table table, object partitions_info=None): + """ + Writes table to output. + + Parameters + ---------- + table: Table + Table that needs to be written + partitions_info: object, default None + Optional partitions to divide the table into. + If specified, must be same size as number of sinks. + + Returns + ------- + None + """ + if partitions_info is None: + with nogil: + self.c_obj.get()[0].write(table.view()) + return + cdef vector[partition_info] partitions + for part in partitions_info: + partitions.push_back( + partition_info(part[0], part[1]) + ) + with nogil: + self.c_obj.get()[0].write(table.view(), partitions) + + @staticmethod + def from_options(ChunkedParquetWriterOptions options): + """ + Creates a chunked Parquet writer from options + + Parameters + ---------- + options: ChunkedParquetWriterOptions + Settings for controlling writing behavior + + Returns + ------- + ParquetChunkedWriter + """ + cdef ParquetChunkedWriter parquet_writer = ParquetChunkedWriter.__new__( + ParquetChunkedWriter + ) + parquet_writer.c_obj.reset(new cpp_parquet_chunked_writer(options.c_obj)) + return parquet_writer + + +cdef class ChunkedParquetWriterOptions: + @staticmethod + def builder(SinkInfo sink): + """ + Create builder to create ChunkedParquetWriterOptions. + + Parameters + ---------- + sink: SinkInfo + The sink used for writer output + + Returns + ------- + ChunkedParquetWriterOptionsBuilder + """ + cdef ChunkedParquetWriterOptionsBuilder parquet_builder = ( + ChunkedParquetWriterOptionsBuilder.__new__( + ChunkedParquetWriterOptionsBuilder + ) + ) + parquet_builder.c_obj = chunked_parquet_writer_options.builder(sink.c_obj) + parquet_builder.sink = sink + return parquet_builder + + cpdef void set_dictionary_policy(self, dictionary_policy_t policy): + """ + Sets the policy for dictionary use. + + Parameters + ---------- + policy : DictionaryPolicy + Policy for dictionary use + + Returns + ------- + None + """ + self.c_obj.set_dictionary_policy(policy) + + +cdef class ChunkedParquetWriterOptionsBuilder: + cpdef ChunkedParquetWriterOptionsBuilder metadata( + self, + TableInputMetadata metadata + ): + self.c_obj.metadata(metadata.c_obj) + return self + + cpdef ChunkedParquetWriterOptionsBuilder key_value_metadata(self, list metadata): + """ + Sets Key-Value footer metadata. + + Parameters + ---------- + metadata : list[dict[str, str]] + Key-Value footer metadata + + Returns + ------- + Self + """ + self.c_obj.key_value_metadata( + [ + {key.encode(): value.encode() for key, value in mapping.items()} + for mapping in metadata + ] + ) + return self + + cpdef ChunkedParquetWriterOptionsBuilder compression( + self, + compression_type compression + ): + """ + Sets compression type. 
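+
+        Typically chained on the builder, e.g.
+        ``.compression(CompressionType.SNAPPY)`` (illustrative; any
+        supported ``CompressionType`` value works).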
+ + Parameters + ---------- + compression : CompressionType + The compression type to use + + Returns + ------- + Self + """ + self.c_obj.compression(compression) + return self + + cpdef ChunkedParquetWriterOptionsBuilder stats_level(self, statistics_freq sf): + """ + Sets the level of statistics. + + Parameters + ---------- + sf : StatisticsFreq + Level of statistics requested in the output file + + Returns + ------- + Self + """ + self.c_obj.stats_level(sf) + return self + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_bytes(self, size_t val): + """ + Sets the maximum row group size, in bytes. + + Parameters + ---------- + val : size_t + Maximum row group size, in bytes to set + + Returns + ------- + Self + """ + self.c_obj.row_group_size_bytes(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder row_group_size_rows(self, size_type val): + """ + Sets the maximum row group size, in rows. + + Parameters + ---------- + val : size_type + Maximum row group size, in rows to set + + Returns + ------- + Self + """ + self.c_obj.row_group_size_rows(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder max_page_size_bytes(self, size_t val): + """ + Sets the maximum uncompressed page size, in bytes. + + Parameters + ---------- + val : size_t + Maximum uncompressed page size, in bytes to set + + Returns + ------- + Self + """ + self.c_obj.max_page_size_bytes(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder max_page_size_rows(self, size_type val): + """ + Sets the maximum page size, in rows. + + Parameters + ---------- + val : size_type + Maximum page size, in rows to set. + + Returns + ------- + Self + """ + self.c_obj.max_page_size_rows(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder max_dictionary_size(self, size_t val): + """ + Sets the maximum dictionary size, in bytes. + + Parameters + ---------- + val : size_t + Sets the maximum dictionary size, in bytes. + + Returns + ------- + Self + """ + self.c_obj.max_dictionary_size(val) + return self + + cpdef ChunkedParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled): + """ + Set to true if arrow schema is to be written. + + Parameters + ---------- + enabled : bool + Boolean value to enable/disable writing of arrow schema. + + Returns + ------- + Self + """ + self.c_obj.write_arrow_schema(enabled) + return self + + cpdef ChunkedParquetWriterOptions build(self): + """Create a ChunkedParquetWriterOptions object""" + cdef ChunkedParquetWriterOptions parquet_options = ( + ChunkedParquetWriterOptions.__new__(ChunkedParquetWriterOptions) + ) + parquet_options.c_obj = move(self.c_obj.build()) + parquet_options.sink = self.sink + return parquet_options + + cdef class ParquetWriterOptions: @staticmethod @@ -571,11 +857,10 @@ cpdef memoryview write_parquet(ParquetWriterOptions options): (parquet FileMetadata thrift message) if requested in parquet_writer_options (empty blob otherwise). 
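+
+    Examples
+    --------
+    A minimal sketch, assuming an in-memory sink and an existing
+    pylibcudf ``Table`` named ``tbl``:
+
+    >>> import io
+    >>> import pylibcudf as plc
+    >>> sink = plc.io.SinkInfo([io.BytesIO()])
+    >>> options = plc.io.parquet.ParquetWriterOptions.builder(sink, tbl).build()
+    >>> footer = plc.io.parquet.write_parquet(options)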
""" - cdef parquet_writer_options c_options = options.c_obj cdef unique_ptr[vector[uint8_t]] c_result with nogil: - c_result = cpp_write_parquet(c_options) + c_result = cpp_write_parquet(move(options.c_obj)) return memoryview(HostBuffer.from_unique_ptr(move(c_result))) From 38261f8509245f88bdeab193a1357d9c73d765f0 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 6 Dec 2024 08:32:41 -0500 Subject: [PATCH 31/78] Improve strings contains/find performance for smaller strings (#17330) Replaces usage of `cudf::string_view::find()` with loop and call to `cudf::string_view::compare()` where possible. This showed significant performance improvement. This was also slightly faster than a KMP prototype implementation. Also updates the find/contains benchmarks to remove the 2GB limit and include column versions of the find APIs. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17330 --- cpp/benchmarks/string/find.cpp | 59 ++++++++++++++++-------- cpp/include/cudf/strings/string_view.cuh | 17 ++++--- cpp/src/strings/search/find.cu | 24 ++++++---- 3 files changed, 61 insertions(+), 39 deletions(-) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 3ea3ff13a2f..2ba793e998e 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -28,21 +28,19 @@ static void bench_find_string(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); auto const api = state.get_string("api"); - - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const tgt_type = state.get_string("target"); auto const stream = cudf::get_default_stream(); - auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(num_rows, max_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - cudf::string_scalar target("0987 5W43"); + auto target = cudf::string_scalar("0987 5W43"); + auto targets_col = cudf::make_column_from_scalar(target, num_rows); + auto const targets = cudf::strings_column_view(targets_col->view()); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); @@ -55,23 +53,44 @@ static void bench_find_string(nvbench::state& state) } if (api == "find") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, targets); }); + } } else if (api == "contains") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + 
} else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, targets); }); + } } else if (api == "starts_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, targets); }); + } } else if (api == "ends_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, targets); }); + } } } NVBENCH_BENCH(bench_find_string) .set_name("find_string") + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}) // percentage .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) - .add_int64_axis("hit_rate", {20, 80}); // percentage + .add_string_axis("target", {"scalar", "column"}); diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 34ed3c5618e..1ae4c3703b2 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -373,24 +373,23 @@ __device__ inline size_type string_view::find_impl(char const* str, size_type pos, size_type count) const { - auto const nchars = length(); - if (!str || pos < 0 || pos > nchars) return npos; - if (count < 0) count = nchars; + if (!str || pos < 0) { return npos; } + if (pos > 0 && pos > length()) { return npos; } // use iterator to help reduce character/byte counting - auto itr = begin() + pos; + auto const itr = begin() + pos; auto const spos = itr.byte_offset(); - auto const epos = ((pos + count) < nchars) ? (itr + count).byte_offset() : size_bytes(); + auto const epos = + (count >= 0) && ((pos + count) < length()) ? (itr + count).byte_offset() : size_bytes(); auto const find_length = (epos - spos) - bytes + 1; + auto const d_target = string_view{str, bytes}; auto ptr = data() + (forward ? spos : (epos - bytes)); for (size_type idx = 0; idx < find_length; ++idx) { - bool match = true; - for (size_type jdx = 0; match && (jdx < bytes); ++jdx) { - match = (ptr[jdx] == str[jdx]); + if (d_target.compare(ptr, bytes) == 0) { + return forward ? pos : character_offset(epos - bytes - idx); } - if (match) { return forward ? pos : character_offset(epos - bytes - idx); } // use pos to record the current find position pos += strings::detail::is_begin_utf8_char(*ptr); forward ? 
++ptr : --ptr; diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 3cf4970d36e..0f33fcb6fe1 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -70,13 +70,11 @@ struct finder_fn { if (d_strings.is_null(idx)) { return -1; } auto const d_str = d_strings.element(idx); if (d_str.empty() && (start > 0)) { return -1; } + if (stop >= 0 && start > stop) { return -1; } auto const d_target = d_targets[idx]; - auto const length = d_str.length(); - auto const begin = (start > length) ? length : start; - auto const end = (stop < 0) || (stop > length) ? length : stop; - return forward ? d_str.find(d_target, begin, end - begin) - : d_str.rfind(d_target, begin, end - begin); + auto const count = (stop < 0) ? stop : (stop - start); + return forward ? d_str.find(d_target, start, count) : d_str.rfind(d_target, start, count); } }; @@ -367,7 +365,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, i += cudf::detail::warp_size * bytes_per_warp) { // check the target matches this part of the d_str data // this is definitely faster for very long strings > 128B - for (auto j = 0; j < bytes_per_warp; j++) { + for (auto j = 0; !found && (j < bytes_per_warp); j++) { if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) && d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) { found = true; @@ -531,7 +529,6 @@ std::unique_ptr contains_fn(strings_column_view const& strings, results->set_null_count(strings.null_count()); return results; } - } // namespace std::unique_ptr contains(strings_column_view const& input, @@ -541,13 +538,17 @@ std::unique_ptr contains(strings_column_view const& input, { // use warp parallel when the average string width is greater than the threshold if ((input.null_count() < input.size()) && - ((input.chars_size(stream) / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / (input.size() - input.null_count())) > + AVG_CHAR_BYTES_THRESHOLD)) { return contains_warp_parallel(input, target, stream, mr); } // benchmark measurements showed this to be faster for smaller strings auto pfn = [] __device__(string_view d_string, string_view d_target) { - return d_string.find(d_target) != string_view::npos; + for (size_type i = 0; i <= (d_string.size_bytes() - d_target.size_bytes()); ++i) { + if (d_target.compare(d_string.data() + i, d_target.size_bytes()) == 0) { return true; } + } + return false; }; return contains_fn(input, target, pfn, stream, mr); } @@ -558,7 +559,10 @@ std::unique_ptr contains(strings_column_view const& strings, rmm::device_async_resource_ref mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { - return d_string.find(d_target) != string_view::npos; + for (size_type i = 0; i <= (d_string.size_bytes() - d_target.size_bytes()); ++i) { + if (d_target.compare(d_string.data() + i, d_target.size_bytes()) == 0) { return true; } + } + return false; }; return contains_fn(strings, targets, pfn, stream, mr); } From c791f8044d0d11f55042afd7a66698d8ce2e1973 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:20:06 -0800 Subject: [PATCH 32/78] Remove cudf._lib.text in favor of inlining pylibcudf (#17408) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17408 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - 
python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/text.pyx | 53 ---------------------------- python/cudf/cudf/io/text.py | 45 +++++++++++++++++------ 4 files changed, 34 insertions(+), 66 deletions(-) delete mode 100644 python/cudf/cudf/_lib/text.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 2f05101e8e3..4e1bf860872 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -27,7 +27,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - text.pyx transform.pyx types.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cb2d0501fea..c79d5100622 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -16,7 +16,6 @@ string_casting, strings, strings_udf, - text, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx deleted file mode 100644 index 7942d067c2b..00000000000 --- a/python/cudf/cudf/_lib/text.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from io import TextIOBase - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def read_text(object filepaths_or_buffers, - str delimiter, - object byte_range, - bool strip_delimiters, - object compression, - object compression_offsets): - """ - Cython function to call into libcudf API, see `multibyte_split`. - - See Also - -------- - cudf.io.text.read_text - """ - if compression is None: - if isinstance(filepaths_or_buffers, TextIOBase): - datasource = plc.io.text.make_source(filepaths_or_buffers.read()) - else: - datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) - elif compression == "bgzip": - if isinstance(filepaths_or_buffers, TextIOBase): - raise ValueError("bgzip compression requires a file path") - if compression_offsets is not None: - if len(compression_offsets) != 2: - raise ValueError( - "compression offsets need to consist of two elements") - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - compression_offsets[0], - compression_offsets[1] - ) - else: - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - ) - else: - raise ValueError("Only bgzip compression is supported at the moment") - - options = plc.io.text.ParseOptions( - byte_range=byte_range, strip_delimiters=strip_delimiters - ) - plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 5ce738cae0e..5e266c5ff55 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,9 +1,10 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
-from io import BytesIO, StringIO +from io import BytesIO, StringIO, TextIOBase + +import pylibcudf as plc import cudf -from cudf._lib import text as libtext from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -33,13 +34,35 @@ def read_text( filepath_or_buffer, "read_text" ) - return cudf.Series._from_column( - libtext.read_text( - filepath_or_buffer, - delimiter=delimiter, - byte_range=byte_range, - strip_delimiters=strip_delimiters, - compression=compression, - compression_offsets=compression_offsets, - ) + if compression is None: + if isinstance(filepath_or_buffer, TextIOBase): + datasource = plc.io.text.make_source(filepath_or_buffer.read()) + else: + datasource = plc.io.text.make_source_from_file(filepath_or_buffer) + elif compression == "bgzip": + if isinstance(filepath_or_buffer, TextIOBase): + raise ValueError("bgzip compression requires a file path") + if compression_offsets is not None: + if len(compression_offsets) != 2: + raise ValueError( + "Compression offsets need to consist of two elements" + ) + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + compression_offsets[0], + compression_offsets[1], + ) + else: + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + ) + else: + raise ValueError("Only bgzip compression is supported at the moment") + + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + result = cudf._lib.column.Column.from_pylibcudf(plc_column) + + return cudf.Series._from_column(result) From 467cf7a7c0a248bdba34e48fc8932acff5797016 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:06:08 -0500 Subject: [PATCH 33/78] Replaces uses of `cudf._lib.Column.from_unique_ptr` with `pylibcudf.Column.from_libcudf` (#17517) Apart of #15162. In a follow-up PR we'll deprecate the cudf python column APIs and others that are used outside cudf. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17517 --- python/cudf/cudf/_lib/strings_udf.pyx | 8 ++++---- python/cudf/cudf/_lib/utils.pyx | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index dd2fafbe07f..83f0cb850a5 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
from libc.stdint cimport uint8_t, uint16_t, uintptr_t - from pylibcudf.libcudf.strings_udf cimport ( get_character_cases_table as cpp_get_character_cases_table, get_character_flags_table as cpp_get_character_flags_table, @@ -27,6 +26,7 @@ from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column +from pylibcudf cimport Column as plc_Column def get_cuda_build_version(): @@ -52,9 +52,9 @@ def column_from_udf_string_array(DeviceBuffer d_buffer): c_result = move(cpp_column_from_udf_string_array(data, size)) cpp_free_udf_string_array(data, size) - result = Column.from_unique_ptr(move(c_result)) - - return result + return Column.from_pylibcudf( + plc_Column.from_libcudf(move(c_result)) + ) def get_character_flags_table_ptr(): diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 6b3f10e1806..ff032656f80 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -16,7 +16,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column - +from pylibcudf cimport Column as plc_Column try: import ujson as json except ImportError: @@ -223,10 +223,11 @@ cdef columns_from_unique_ptr( cdef size_t i - columns = [Column.from_unique_ptr(move(dereference(it+i))) - for i in range(c_columns.size())] - - return columns + return [ + Column.from_pylibcudf( + plc_Column.from_libcudf(move(dereference(it+i))) + ) for i in range(c_columns.size()) + ] cpdef columns_from_pylibcudf_table(tbl): From 1a62b46938b76abd00711337d03ff4864845257c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 11:17:11 -0800 Subject: [PATCH 34/78] Remove cudf._lib.round in favor of inlining pylibcudf (#17430) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17430 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/round.pyx | 39 ------------------- .../cudf/cudf/core/column/numerical_base.py | 19 ++++++--- 4 files changed, 14 insertions(+), 46 deletions(-) delete mode 100644 python/cudf/cudf/_lib/round.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 4e1bf860872..cff25f5752c 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -21,7 +21,6 @@ set(cython_sources orc.pyx parquet.pyx reduce.pyx - round.pyx scalar.pyx sort.pyx stream_compaction.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index c79d5100622..05310d8d232 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -10,7 +10,6 @@ orc, parquet, reduce, - round, sort, stream_compaction, string_casting, diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx deleted file mode 100644 index f961c09e6f6..00000000000 --- a/python/cudf/cudf/_lib/round.pyx +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc -from pylibcudf.round import RoundingMethod - - -@acquire_spill_lock() -def round(Column input_col, int decimal_places=0, how="half_even"): - """ - Round column values to the given number of decimal places - - Parameters - ---------- - input_col : Column whose values will be rounded - decimal_places : The number or decimal places to round to - - Returns - ------- - A Column with values rounded to the given number of decimal places - """ - if how not in {"half_even", "half_up"}: - raise ValueError("'how' must be either 'half_even' or 'half_up'") - - how = ( - RoundingMethod.HALF_EVEN if how == "half_even" - else RoundingMethod.HALF_UP - ) - - return Column.from_pylibcudf( - plc.round.round( - input_col.to_pylibcudf(mode="read"), - decimal_places, - how - ) - ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index ea242e34edb..3f9abdabc2f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np @@ -246,12 +246,21 @@ def corr(self, other: NumericalBaseColumn) -> float: return cov / lhs_std / rhs_std def round( - self, decimals: int = 0, how: str = "half_even" + self, + decimals: int = 0, + how: Literal["half_even", "half_up"] = "half_even", ) -> NumericalBaseColumn: if not cudf.api.types.is_integer(decimals): - raise TypeError("Values in decimals must be integers") - """Round the values in the Column to the given number of decimals.""" - return libcudf.round.round(self, decimal_places=decimals, how=how) + raise TypeError("Argument 'decimals' must an integer") + if how not in {"half_even", "half_up"}: + raise ValueError(f"{how=} must be either 'half_even' or 'half_up'") + plc_how = plc.round.RoundingMethod[how.upper()] + with acquire_spill_lock(): + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.round.round( + self.to_pylibcudf(mode="read"), decimals, plc_how + ) + ) def _scan(self, op: str) -> ColumnBase: return libcudf.reduce.scan( From b6f7e6ea33d8f516033508224cd89bbd09a791ee Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 12:55:22 -0800 Subject: [PATCH 35/78] Remove cudf._lib.orc in favor of inlining pylibcudf (#17466) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17466 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/orc.pyx | 466 ------------------ python/cudf/cudf/io/orc.py | 613 +++++++++++++++++------- python/cudf/cudf/utils/ioutils.py | 161 ++++++- python/pylibcudf/pylibcudf/io/types.pxd | 1 - python/pylibcudf/pylibcudf/io/types.pyi | 2 + python/pylibcudf/pylibcudf/io/types.pyx | 6 +- 8 files changed, 603 insertions(+), 648 deletions(-) delete mode 100644 python/cudf/cudf/_lib/orc.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index cff25f5752c..e98cf283bbb 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -18,7 +18,6 @@ set(cython_sources csv.pyx groupby.pyx interop.pyx - orc.pyx 
parquet.pyx reduce.pyx scalar.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 05310d8d232..4758a933898 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -7,7 +7,6 @@ groupby, interop, nvtext, - orc, parquet, reduce, sort, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx deleted file mode 100644 index c829cac6409..00000000000 --- a/python/cudf/cudf/_lib/orc.pyx +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int64_t -from libcpp cimport bool, int -from libcpp.map cimport map -from libcpp.string cimport string -from libcpp.vector cimport vector -import itertools -from collections import OrderedDict - -try: - import ujson as json -except ImportError: - import json - -cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.core.buffer import acquire_spill_lock -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.orc cimport OrcChunkedWriter - -# TODO: Consider inlining this function since it seems to only be used in one place. -cpdef read_parsed_orc_statistics(filepath_or_buffer): - """ - Cython function to call into libcudf API, see `read_parsed_orc_statistics`. - - See Also - -------- - cudf.io.orc.read_orc_statistics - """ - - parsed = ( - plc.io.orc.read_parsed_orc_statistics( - plc.io.SourceInfo([filepath_or_buffer]) - ) - ) - - return parsed.column_names, parsed.file_stats, parsed.stripes_stats - - -cpdef read_orc(object filepaths_or_buffers, - object columns=None, - object stripes=None, - object skip_rows=None, - object num_rows=None, - bool use_index=True, - object timestamp_type=None): - """ - Cython function to call into libcudf API, see `read_orc`. - - See Also - -------- - cudf.read_orc - - Notes - ----- - Currently this function only considers the metadata of the first file in the list of - filepaths_or_buffers. - """ - - if columns is not None: - columns = [str(col) for col in columns] - - tbl_w_meta = plc.io.orc.read_orc( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - stripes, - get_skiprows_arg(skip_rows), - get_num_rows_arg(num_rows), - use_index, - plc.types.DataType( - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) - ) - - names = tbl_w_meta.column_names(include_children=False) - - actual_index_names, col_names, is_range_index, reset_index_name, \ - range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data, - names, - skip_rows, - num_rows) - - if columns is not None and (isinstance(columns, list) and len(columns) == 0): - # When `columns=[]`, index needs to be - # established, but not the columns. 
- nrows = tbl_w_meta.tbl.num_rows() - return {}, cudf.RangeIndex(nrows) - - data, index = data_from_pylibcudf_io( - tbl_w_meta, - col_names if columns is None else names, - actual_index_names - ) - - if is_range_index: - index = range_idx - elif reset_index_name: - index.names = [None] * len(index.names) - - child_name_values = tbl_w_meta.child_names.values() - - data = { - name: update_col_struct_field_names( - col, child_names - ) - for (name, col), child_names in zip(data.items(), child_name_values) - } - - return data, index - - -def _get_comp_type(object compression): - if compression is None or compression is False: - return plc.io.types.CompressionType.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return plc.io.types.CompressionType.SNAPPY - elif compression == "ZLIB": - return plc.io.types.CompressionType.ZLIB - elif compression == "ZSTD": - return plc.io.types.CompressionType.ZSTD - elif compression == "LZ4": - return plc.io.types.CompressionType.LZ4 - else: - raise ValueError(f"Unsupported `compression` type {compression}") - - -cdef tuple _get_index_from_metadata( - vector[map[string, string]] user_data, - object names, - object skip_rows, - object num_rows): - - meta = None - index_col = None - is_range_index = False - reset_index_name = False - range_idx = None - - if user_data.size() > 0: - json_str = user_data[0][b'pandas'].decode('utf-8') - if json_str != "": - meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: - index_col = meta['index_columns'] - if isinstance(index_col[0], dict) and \ - index_col[0]['kind'] == 'range': - is_range_index = True - else: - index_col_names = OrderedDict() - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = \ - c['name'] or c['field_name'] - if c['name'] is None: - reset_index_name = True - - actual_index_names = None - if index_col is not None and len(index_col) > 0: - if is_range_index: - range_index_meta = index_col[0] - range_idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - if skip_rows is not None: - range_idx = range_idx[skip_rows:] - if num_rows is not None: - range_idx = range_idx[:num_rows] - else: - actual_index_names = list(index_col_names.values()) - names = names[len(actual_index_names):] - - return ( - actual_index_names, - names, - is_range_index, - reset_index_name, - range_idx - ) - - -def _get_orc_stat_freq(str statistics): - """ - Convert ORC statistics terms to CUDF convention: - - ORC "STRIPE" == CUDF "ROWGROUP" - - ORC "ROWGROUP" == CUDF "PAGE" - """ - statistics = str(statistics).upper() - if statistics == "NONE": - return plc.io.types.StatisticsFreq.STATISTICS_NONE - elif statistics == "STRIPE": - return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP - elif statistics == "ROWGROUP": - return plc.io.types.StatisticsFreq.STATISTICS_PAGE - else: - raise ValueError(f"Unsupported `statistics_freq` type {statistics}") - - -@acquire_spill_lock() -def write_orc( - table, - object path_or_buf, - object compression="snappy", - str statistics="ROWGROUP", - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None, - object cols_as_map_type=None, - object index=None -): - """ - Cython function to call into libcudf API, see `cudf::io::write_orc`. 
- - See Also - -------- - cudf.read_orc - """ - user_data = {} - user_data["pandas"] = generate_pandas_metadata(table, index) - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = table._columns if table._index is None else [ - *table.index._columns, *table._columns - ] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - if cols_as_map_type is not None: - cols_as_map_type = set(cols_as_map_type) - - for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name) - _set_col_children_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - (cols_as_map_type is not None) - and (name in cols_as_map_type), - ) - - options = ( - plc.io.orc.OrcWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(_get_comp_type(compression)) - .enable_statistics(_get_orc_stat_freq(statistics)) - .build() - ) - if stripe_size_bytes is not None: - options.set_stripe_size_bytes(stripe_size_bytes) - if stripe_size_rows is not None: - options.set_stripe_size_rows(stripe_size_rows) - if row_index_stride is not None: - options.set_row_index_stride(row_index_stride) - - plc.io.orc.write_orc(options) - - -cdef int64_t get_skiprows_arg(object arg) except*: - arg = 0 if arg is None else arg - if not isinstance(arg, int) or arg < 0: - raise TypeError("skiprows must be an int >= 0") - return arg - -cdef int64_t get_num_rows_arg(object arg) except*: - arg = -1 if arg is None else arg - if not isinstance(arg, int) or arg < -1: - raise TypeError("num_rows must be an int >= -1") - return arg - - -cdef class ORCWriter: - """ - ORCWriter lets you you incrementally write out a ORC file from a series - of cudf tables - - See Also - -------- - cudf.io.orc.to_orc - """ - cdef bool initialized - cdef OrcChunkedWriter writer - cdef SinkInfo sink - cdef str statistics - cdef object compression - cdef object index - cdef TableInputMetadata tbl_meta - cdef object cols_as_map_type - cdef object stripe_size_bytes - cdef object stripe_size_rows - cdef object row_index_stride - - def __cinit__(self, - object path, - object index=None, - object compression="snappy", - str statistics="ROWGROUP", - object cols_as_map_type=None, - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None): - self.sink = plc.io.SinkInfo([path]) - self.statistics = statistics - self.compression = compression - self.index = index - self.cols_as_map_type = cols_as_map_type \ - if cols_as_map_type is None else set(cols_as_map_type) - self.stripe_size_bytes = stripe_size_bytes - self.stripe_size_rows = stripe_size_rows - self.row_index_stride = row_index_stride - self.initialized = False - - def write_table(self, table): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state(table) - - keep_index = self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex) - ) - if keep_index: - columns = [ 
- col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - else: - columns = [col.to_pylibcudf(mode="read") for col in table._columns] - - self.writer.write(plc.Table(columns)) - - def close(self): - if not self.initialized: - return - - self.writer.close() - - def __dealloc__(self): - self.close() - - def _initialize_chunked_state(self, table): - """ - Prepare all the values required to build the - chunked_orc_writer_options anb creates a writer""" - - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - idx_name - ) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name( - table._index.name - ) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_children_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - (self.cols_as_map_type is not None) - and (name in self.cols_as_map_type), - ) - - user_data = {} - pandas_metadata = generate_pandas_metadata(table, self.index) - user_data["pandas"] = pandas_metadata - - options = ( - plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(_get_comp_type(self.compression)) - .enable_statistics(_get_orc_stat_freq(self.statistics)) - .build() - ) - if self.stripe_size_bytes is not None: - options.set_stripe_size_bytes(self.stripe_size_bytes) - if self.stripe_size_rows is not None: - options.set_stripe_size_rows(self.stripe_size_rows) - if self.row_index_stride is not None: - options.set_row_index_stride(self.row_index_stride) - - self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) - - self.initialized = True - -cdef _set_col_children_metadata(Column col, - ColumnInMetadata col_meta, - list_column_as_map=False): - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_children_metadata( - child_col, col_meta.child(i), list_column_as_map - ) - elif isinstance(col.dtype, cudf.ListDtype): - if list_column_as_map: - col_meta.set_list_column_as_map() - _set_col_children_metadata( - col.children[cpp_lists_column_view.child_column_index], - col_meta.child(cpp_lists_column_view.child_column_index), - list_column_as_map - ) - else: - return diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 68b60809bb9..5616413b7e4 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,147 +1,28 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+from __future__ import annotations -import datetime +import itertools import warnings +from typing import TYPE_CHECKING, Literal import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib import orc as liborc +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json -def _make_empty_df(filepath_or_buffer, columns): - from pyarrow import orc - - orc_file = orc.ORCFile(filepath_or_buffer) - schema = orc_file.schema - col_names = schema.names if columns is None else columns - return cudf.DataFrame._from_data( - data={ - col_name: cudf.core.column.column_empty( - row_count=0, - dtype=schema.field(col_name).type.to_pandas_dtype(), - ) - for col_name in col_names - } - ) - - -def _parse_column_statistics(cs, column_statistics_blob): - # Initialize stats to return and parse stats blob - column_statistics = {} - cs.ParseFromString(column_statistics_blob) - - # Load from parsed stats blob into stats to return - if cs.HasField("numberOfValues"): - column_statistics["number_of_values"] = cs.numberOfValues - if cs.HasField("hasNull"): - column_statistics["has_null"] = cs.hasNull - - if cs.HasField("intStatistics"): - column_statistics["minimum"] = ( - cs.intStatistics.minimum - if cs.intStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.intStatistics.maximum - if cs.intStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None - ) - - elif cs.HasField("doubleStatistics"): - column_statistics["minimum"] = ( - cs.doubleStatistics.minimum - if cs.doubleStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.doubleStatistics.maximum - if cs.doubleStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.doubleStatistics.sum - if cs.doubleStatistics.HasField("sum") - else None - ) - - elif cs.HasField("stringStatistics"): - column_statistics["minimum"] = ( - cs.stringStatistics.minimum - if cs.stringStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.stringStatistics.maximum - if cs.stringStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.stringStatistics.sum - - elif cs.HasField("bucketStatistics"): - column_statistics["true_count"] = cs.bucketStatistics.count[0] - column_statistics["false_count"] = ( - column_statistics["number_of_values"] - - column_statistics["true_count"] - ) - - elif cs.HasField("decimalStatistics"): - column_statistics["minimum"] = ( - cs.decimalStatistics.minimum - if cs.decimalStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.decimalStatistics.maximum - if cs.decimalStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.decimalStatistics.sum - - elif cs.HasField("dateStatistics"): - column_statistics["minimum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("maximum") - 
else None - ) - - elif cs.HasField("timestampStatistics"): - # Before ORC-135, the local timezone offset was included and they were - # stored as minimum and maximum. After ORC-135, the timestamp is - # adjusted to UTC before being converted to milliseconds and stored - # in minimumUtc and maximumUtc. - # TODO: Support minimum and maximum by reading writer's local timezone - if cs.timestampStatistics.HasField( - "minimumUtc" - ) and cs.timestampStatistics.HasField("maximumUtc"): - column_statistics["minimum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.minimumUtc / 1000, datetime.timezone.utc - ) - column_statistics["maximum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc - ) - - elif cs.HasField("binaryStatistics"): - column_statistics["sum"] = cs.binaryStatistics.sum - - return column_statistics +if TYPE_CHECKING: + from cudf.core.column import ColumnBase @ioutils.doc_read_orc_metadata() @@ -175,11 +56,12 @@ def read_orc_statistics( path_or_buf = ioutils._select_single_source( path_or_buf, "read_orc_statistics" ) - ( - column_names, - parsed_file_statistics, - parsed_stripes_statistics, - ) = liborc.read_parsed_orc_statistics(path_or_buf) + parsed = plc.io.orc.read_parsed_orc_statistics( + plc.io.SourceInfo([path_or_buf]) + ) + column_names = parsed.column_names + parsed_file_statistics = parsed.file_stats + parsed_stripes_statistics = parsed.stripes_stats # Parse file statistics file_statistics = { @@ -273,16 +155,14 @@ def read_orc( columns=None, filters=None, stripes=None, - skiprows=None, - num_rows=None, - use_index=True, + skiprows: int | None = None, + num_rows: int | None = None, + use_index: bool = True, timestamp_type=None, storage_options=None, bytes_per_thread=None, ): """{docstring}""" - from cudf import DataFrame - if skiprows is not None: # Do not remove until cuIO team approves its removal. warnings.warn( @@ -329,31 +209,132 @@ def read_orc( # Return empty if everything was filtered if len(selected_stripes) == 0: - return _make_empty_df(filepaths_or_buffers[0], columns) + from pyarrow import orc + + orc_file = orc.ORCFile(filepaths_or_buffers[0]) + schema = orc_file.schema + col_names = schema.names if columns is None else columns + return cudf.DataFrame._from_data( + data={ + col_name: cudf.core.column.column_empty( + row_count=0, + dtype=schema.field(col_name).type.to_pandas_dtype(), + ) + for col_name in col_names + } + ) else: stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, - ) + if columns is not None: + columns = [str(col) for col in columns] + + if skiprows is None: + skiprows = 0 + elif not isinstance(skiprows, int) or skiprows < 0: + raise TypeError("skiprows must be an int >= 0") + + if num_rows is None: + num_rows = -1 + elif not isinstance(num_rows, int) or num_rows < -1: + raise TypeError("num_rows must be an int >= -1") + + tbl_w_meta = plc.io.orc.read_orc( + plc.io.SourceInfo(filepaths_or_buffers), + columns, + stripes, + skiprows, + num_rows, + use_index, + dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)), ) + + if isinstance(columns, list) and len(columns) == 0: + # When `columns=[]`, index needs to be + # established, but not the columns. 
+ nrows = tbl_w_meta.tbl.num_rows() + data = {} + index = cudf.RangeIndex(nrows) + else: + names = tbl_w_meta.column_names(include_children=False) + index_col = None + is_range_index = False + reset_index_name = False + range_idx = None + + if len(tbl_w_meta.per_file_user_data) > 0: + json_str = ( + tbl_w_meta.per_file_user_data[0] + .get(b"pandas", b"") + .decode("utf-8") + ) + if json_str != "": + meta = json.loads(json_str) + if ( + "index_columns" in meta + and len(meta["index_columns"]) > 0 + ): + index_col = meta["index_columns"] + if ( + isinstance(index_col[0], dict) + and index_col[0]["kind"] == "range" + ): + is_range_index = True + else: + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = ( + c["name"] or c["field_name"] + ) + if c["name"] is None: + reset_index_name = True + + actual_index_names = None + col_names = names + if index_col is not None and len(index_col) > 0: + if is_range_index: + range_index_meta = index_col[0] + range_idx = cudf.RangeIndex( + start=range_index_meta["start"], + stop=range_index_meta["stop"], + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + if skiprows != 0: + range_idx = range_idx[skiprows:] + if num_rows != -1: + range_idx = range_idx[:num_rows] + else: + actual_index_names = list(index_col_names.values()) + col_names = names[len(actual_index_names) :] + + data, index = data_from_pylibcudf_io( + tbl_w_meta, + col_names if columns is None else names, + actual_index_names, + ) + + if is_range_index: + index = range_idx + elif reset_index_name: + index.names = [None] * len(index.names) + + child_name_values = tbl_w_meta.child_names.values() + + data = { + name: ioutils._update_col_struct_field_names(col, child_names) + for (name, col), child_names in zip( + data.items(), child_name_values + ) + } + + return cudf.DataFrame._from_data(data, index=index) else: from pyarrow import orc - def read_orc_stripe(orc_file, stripe, columns): - pa_table = orc_file.read_stripe(stripe, columns) - if isinstance(pa_table, pa.RecordBatch): - pa_table = pa.Table.from_batches([pa_table]) - return pa_table - warnings.warn("Using CPU via PyArrow to read ORC dataset.") if len(filepath_or_buffer) > 1: raise NotImplementedError( @@ -364,11 +345,18 @@ def read_orc_stripe(orc_file, stripe, columns): orc_file = orc.ORCFile(filepath_or_buffer[0]) if stripes is not None and len(stripes) > 0: for stripe_source_file in stripes: - pa_tables = [ - read_orc_stripe(orc_file, i, columns) + pa_tables = ( + orc_file.read_stripe(i, columns) for i in stripe_source_file - ] - pa_table = pa.concat_tables(pa_tables) + ) + pa_table = pa.concat_tables( + [ + pa.Table.from_batches([table]) + if isinstance(table, pa.RecordBatch) + else table + for table in pa_tables + ] + ) else: pa_table = orc_file.read(columns=columns) df = cudf.DataFrame.from_arrow(pa_table) @@ -378,16 +366,18 @@ def read_orc_stripe(orc_file, stripe, columns): @ioutils.doc_to_orc() def to_orc( - df, + df: cudf.DataFrame, fname, - compression="snappy", - statistics="ROWGROUP", - stripe_size_bytes=None, - stripe_size_rows=None, - row_index_stride=None, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, cols_as_map_type=None, storage_options=None, - index=None, + index: bool | None = None, ): 
"""{docstring}""" @@ -413,7 +403,7 @@ def to_orc( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - liborc.write_orc( + _plc_write_orc( df, file_obj, compression, @@ -425,7 +415,7 @@ def to_orc( index, ) else: - liborc.write_orc( + _plc_write_orc( df, path_or_buf, compression, @@ -438,4 +428,279 @@ def to_orc( ) -ORCWriter = liborc.ORCWriter +@acquire_spill_lock() +def _plc_write_orc( + table: cudf.DataFrame, + path_or_buf, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, + cols_as_map_type=None, + index: bool | None = None, +) -> None: + """ + See `cudf::io::write_orc`. + + See Also + -------- + cudf.read_orc + """ + user_data = {"pandas": ioutils.generate_pandas_metadata(table, index)} + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = ( + table._columns + if table.index is None + else itertools.chain(table.index._columns, table._columns) + ) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table._index.names): + tbl_meta.column_metadata[level].set_name( + ioutils._index_level_name(idx_name, level, table._column_names) # type: ignore[arg-type] + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + has_map_type = False + if cols_as_map_type is not None: + cols_as_map_type = set(cols_as_map_type) + has_map_type = True + + for i, (name, col) in enumerate( + table._column_labels_and_values, start=num_index_cols_meta + ): + tbl_meta.column_metadata[i].set_name(name) + _set_col_children_metadata( + col, + tbl_meta.column_metadata[i], + has_map_type and name in cols_as_map_type, + ) + + options = ( + plc.io.orc.OrcWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(compression)) + .enable_statistics(_get_orc_stat_freq(statistics)) + .build() + ) + if stripe_size_bytes is not None: + options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + options.set_row_index_stride(row_index_stride) + + plc.io.orc.write_orc(options) + + +class ORCWriter: + """ + ORCWriter lets you you incrementally write out a ORC file from a series + of cudf tables + + See Also + -------- + cudf.io.orc.to_orc + """ + + def __init__( + self, + path, + index: bool | None = None, + compression: Literal[ + False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4" + ] = "SNAPPY", + statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP", + cols_as_map_type=None, + stripe_size_bytes: int | None = None, + stripe_size_rows: int | None = None, + row_index_stride: int | None = None, + ): + self.sink = plc.io.SinkInfo([path]) + self.statistics = statistics + self.compression = compression + self.index = index + self.cols_as_map_type = ( + cols_as_map_type + if cols_as_map_type is None + else set(cols_as_map_type) + ) + self.stripe_size_bytes = stripe_size_bytes + self.stripe_size_rows = 
stripe_size_rows + self.row_index_stride = row_index_stride + self.initialized = False + + def write_table(self, table): + """Writes a single table to the file""" + if not self.initialized: + self._initialize_chunked_state(table) + + keep_index = self.index is not False and ( + table.index.name is not None + or isinstance(table.index, cudf.MultiIndex) + ) + if keep_index: + cols_to_write = itertools.chain( + table.index._columns, table._columns + ) + else: + cols_to_write = table._columns + + self.writer.write( + plc.Table([col.to_pylibcudf(mode="read") for col in cols_to_write]) + ) + + def close(self): + if not self.initialized: + return + self.writer.close() + + def _initialize_chunked_state(self, table): + """ + Prepare all the values required to build the + chunked_orc_writer_options anb creates a writer + """ + + num_index_cols_meta = 0 + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + if self.index is not False: + if isinstance(table.index, cudf.MultiIndex): + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + self.tbl_meta.column_metadata[level].set_name(idx_name) + num_index_cols_meta = len(table.index.names) + else: + if table.index.name is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table.index.name) + num_index_cols_meta = 1 + + has_map_type = self.cols_as_map_type is not None + for i, (name, col) in enumerate( + table._column_labels_and_values, start=num_index_cols_meta + ): + self.tbl_meta.column_metadata[i].set_name(name) + _set_col_children_metadata( + col, + self.tbl_meta.column_metadata[i], + has_map_type and name in self.cols_as_map_type, + ) + + user_data = { + "pandas": ioutils.generate_pandas_metadata(table, self.index) + } + + options = ( + plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) + .build() + ) + if self.stripe_size_bytes is not None: + options.set_stripe_size_bytes(self.stripe_size_bytes) + if self.stripe_size_rows is not None: + options.set_stripe_size_rows(self.stripe_size_rows) + if self.row_index_stride is not None: + options.set_row_index_stride(self.row_index_stride) + + self.writer = plc.io.orc.OrcChunkedWriter.from_options(options) + + self.initialized = True + + +def _get_comp_type( + compression: Literal[False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"], +) -> plc.io.types.CompressionType: + if compression is None or compression is False: + return plc.io.types.CompressionType.NONE + + normed_compression = compression.upper() + if normed_compression == "SNAPPY": + return plc.io.types.CompressionType.SNAPPY + elif normed_compression == "ZLIB": + return plc.io.types.CompressionType.ZLIB + elif normed_compression == "ZSTD": + return plc.io.types.CompressionType.ZSTD + elif normed_compression == "LZ4": + return plc.io.types.CompressionType.LZ4 + else: + raise ValueError(f"Unsupported `compression` type {compression}") + + +def _get_orc_stat_freq( + statistics: Literal["NONE", 
"STRIPE", "ROWGROUP"], +) -> plc.io.types.StatisticsFreq: + """ + Convert ORC statistics terms to CUDF convention: + - ORC "STRIPE" == CUDF "ROWGROUP" + - ORC "ROWGROUP" == CUDF "PAGE" + """ + normed_statistics = statistics.upper() + if normed_statistics == "NONE": + return plc.io.types.StatisticsFreq.STATISTICS_NONE + elif normed_statistics == "STRIPE": + return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP + elif normed_statistics == "ROWGROUP": + return plc.io.types.StatisticsFreq.STATISTICS_PAGE + else: + raise ValueError(f"Unsupported `statistics_freq` type {statistics}") + + +def _set_col_children_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + list_column_as_map: bool = False, +) -> None: + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_children_metadata( + child_col, col_meta.child(i), list_column_as_map + ) + elif isinstance(col.dtype, cudf.ListDtype): + if list_column_as_map: + col_meta.set_list_column_as_map() + _set_col_children_metadata( + col.children[1], col_meta.child(1), list_column_as_map + ) + else: + return diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 5681601d2be..d9a3da6666d 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -3,37 +3,45 @@ import datetime import functools +import json import operator import os import urllib import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import fsspec import fsspec.implementations.local import numpy as np import pandas as pd +import pyarrow as pa from fsspec.core import expand_paths_if_needed, get_fs_token_paths import cudf from cudf.api.types import is_list_like from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial +from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype try: import fsspec.parquet as fsspec_parquet - except ImportError: fsspec_parquet = None + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable from cudf.core.column import ColumnBase +PARQUET_META_TYPE_MAP = { + str(cudf_dtype): str(pandas_dtype) + for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() +} + _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max @@ -1487,6 +1495,153 @@ ) +def _index_level_name( + index_name: Hashable, level: int, column_names: list[Hashable] +) -> Hashable: + """ + Return the name of an index level or a default name + if `index_name` is None or is already a column name. + + Parameters + ---------- + index_name : name of an Index object + level : level of the Index object + + Returns + ------- + name : str + """ + if index_name is not None and index_name not in column_names: + return index_name + else: + return f"__index_level_{level}__" + + +def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str: + col_names: list[Hashable] = [] + types = [] + index_levels = [] + index_descriptors = [] + columns_to_convert = list(table._columns) + # Columns + for name, col in table._column_labels_and_values: + if cudf.get_option("mode.pandas_compatible"): + # in pandas-compat mode, non-string column names are stringified. 
+            col_names.append(str(name))
+        else:
+            col_names.append(name)
+
+        if isinstance(col.dtype, cudf.CategoricalDtype):
+            raise ValueError(
+                "'category' column dtypes are currently not "
+                + "supported by the gpu accelerated parquet writer"
+            )
+        elif isinstance(
+            col.dtype,
+            (cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype),
+        ):
+            types.append(col.dtype.to_arrow())
+        else:
+            # A boolean element takes 8 bits in cudf and 1 bit in
+            # pyarrow. To make sure the cudf format is interoperable
+            # with arrow, we use `int8` type when converting from a
+            # cudf boolean array.
+            if col.dtype.type == np.bool_:
+                types.append(pa.int8())
+            else:
+                types.append(np_to_pa_dtype(col.dtype))
+
+    # Indexes
+    materialize_index = False
+    if index is not False:
+        for level, name in enumerate(table.index.names):
+            if isinstance(table.index, cudf.MultiIndex):
+                idx = table.index.get_level_values(level)
+            else:
+                idx = table.index
+
+            if isinstance(idx, cudf.RangeIndex):
+                if index is None:
+                    descr: dict[str, Any] | Hashable = {
+                        "kind": "range",
+                        "name": table.index.name,
+                        "start": table.index.start,
+                        "stop": table.index.stop,
+                        "step": table.index.step,
+                    }
+                else:
+                    materialize_index = True
+                    # When `index=True`, RangeIndex needs to be materialized.
+                    materialized_idx = idx._as_int_index()
+                    descr = _index_level_name(
+                        index_name=materialized_idx.name,
+                        level=level,
+                        column_names=col_names,
+                    )
+                    index_levels.append(materialized_idx)
+                    columns_to_convert.append(materialized_idx._values)
+                    col_names.append(descr)
+                    types.append(np_to_pa_dtype(materialized_idx.dtype))
+            else:
+                descr = _index_level_name(
+                    index_name=idx.name, level=level, column_names=col_names
+                )
+                columns_to_convert.append(idx._values)
+                col_names.append(descr)
+                if isinstance(idx.dtype, cudf.CategoricalDtype):
+                    raise ValueError(
+                        "'category' column dtypes are currently not "
+                        + "supported by the gpu accelerated parquet writer"
+                    )
+                elif isinstance(idx.dtype, cudf.ListDtype):
+                    types.append(col.dtype.to_arrow())
+                else:
+                    # A boolean element takes 8 bits in cudf and 1 bit in
+                    # pyarrow. To make sure the cudf format is interoperable
+                    # with arrow, we use `int8` type when converting from a
+                    # cudf boolean array.
+                    if idx.dtype.type == np.bool_:
+                        types.append(pa.int8())
+                    else:
+                        types.append(np_to_pa_dtype(idx.dtype))
+
+            index_levels.append(idx)
+            index_descriptors.append(descr)
+
+    df_meta = table.head(0)
+    if materialize_index:
+        df_meta.index = df_meta.index._as_int_index()
+    metadata = pa.pandas_compat.construct_metadata(
+        columns_to_convert=columns_to_convert,
+        # It is OKAY to do `.head(0).to_pandas()` because
+        # this method will extract `.columns` metadata only
+        df=df_meta.to_pandas(),
+        column_names=col_names,
+        index_levels=index_levels,
+        index_descriptors=index_descriptors,
+        preserve_index=index,
+        types=types,
+    )
+
+    md_dict = json.loads(metadata[b"pandas"])
+
+    # correct metadata for list and struct and nullable numeric types
+    for col_meta in md_dict["columns"]:
+        if (
+            col_meta["name"] in table._column_names
+            and table._data[col_meta["name"]].nullable
+            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
+            and col_meta["pandas_type"] != "decimal"
+        ):
+            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
+                col_meta["numpy_type"]
+            ]
+        if col_meta["numpy_type"] in ("list", "struct"):
+            col_meta["numpy_type"] = "object"
+
+    return json.dumps(md_dict)
+
+
 def is_url(url):
     """Check if a string is a valid URL to a network location.
diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index a1f3b17936c..61fe33d6805 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -65,7 +65,6 @@ cdef class ColumnInMetadata: cdef class TableInputMetadata: cdef table_input_metadata c_obj - cdef list column_metadata cdef class TableWithMetadata: cdef public Table tbl diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi index a3a559219ff..63fa9d1ff79 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyi +++ b/python/pylibcudf/pylibcudf/io/types.pyi @@ -64,6 +64,8 @@ class PartitionInfo: class TableInputMetadata: def __init__(self, table: Table): ... + @property + def column_metadata(self) -> list[ColumnInMetadata]: ... class ColumnInMetadata: def set_name(self, name: str) -> Self: ... diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index a2155829f2c..458595ca0e0 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -288,12 +288,14 @@ cdef class TableInputMetadata: """ def __init__(self, Table table): self.c_obj = table_input_metadata(table.view()) - self.column_metadata = [ + + @property + def column_metadata(self): + return [ ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self) for i in range(self.c_obj.column_metadata.size()) ] - cdef class TableWithMetadata: """A container holding a table and its associated metadata (e.g. column names) From cbeefd8f4e4e67f52331131039533ef1f0ea0a65 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:54:54 -0500 Subject: [PATCH 36/78] Add Parquet Reader options classes to pylibcudf (#17464) Follow up of #17263, this PR adds the parquet reader options classes to pylibcudf and plumbs the changes through cudf python. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17464 --- cpp/include/cudf/io/parquet.hpp | 1 + python/cudf/cudf/_lib/parquet.pyx | 58 +-- python/cudf_polars/cudf_polars/dsl/ir.py | 44 ++- python/pylibcudf/pylibcudf/io/parquet.pxd | 36 +- python/pylibcudf/pylibcudf/io/parquet.pyi | 21 +- python/pylibcudf/pylibcudf/io/parquet.pyx | 339 +++++++++++------- .../pylibcudf/tests/io/test_parquet.py | 28 +- 7 files changed, 333 insertions(+), 194 deletions(-) diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. 
*/ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index c77c9875342..1b4c18d13a7 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -205,7 +205,7 @@ cdef object _process_metadata(object df, else: start = range_index_meta["start"] + skip_rows stop = range_index_meta["stop"] - if nrows != -1: + if nrows > -1: stop = start + nrows idx = cudf.RangeIndex( start=start, @@ -256,16 +256,27 @@ def read_parquet_chunked( # (see read_parquet) allow_range_index = columns is not None and len(columns) != 0 + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + reader = ChunkedParquetReader( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - row_groups, - use_pandas_metadata=use_pandas_metadata, + options, chunk_read_limit=chunk_read_limit, pass_read_limit=pass_read_limit, - skip_rows=skip_rows, - nrows=nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) tbl_w_meta = reader.read_chunk() @@ -325,19 +336,26 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if columns is not None and len(columns) == 0 or filters: allow_range_index = False - # Read Parquet - - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - row_groups, - filters, - convert_strings_to_categories = False, - use_pandas_metadata = use_pandas_metadata, - skip_rows = skip_rows, - nrows = nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + if filters is not None: + options.set_filter(filters) + + tbl_w_meta = plc.io.parquet.read_parquet(options) df = cudf.DataFrame._from_data( *data_from_pylibcudf_io(tbl_w_meta) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1faa778ccf6..b5af3bb80bf 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -517,17 +517,22 @@ def do_evaluate( elif typ == "parquet": parquet_options = config_options.get("parquet_options", {}) if parquet_options.get("chunked", True): + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(paths) + ).build() + # We handle skip_rows != 0 by reading from the + # up to n_rows + skip_rows and slicing off the + # first skip_rows entries. 
+ # TODO: Remove this workaround once + # https://github.com/rapidsai/cudf/issues/16186 + # is fixed + nrows = n_rows + skip_rows + if nrows > -1: + options.set_num_rows(nrows) + if with_columns is not None: + options.set_columns(with_columns) reader = plc.io.parquet.ChunkedParquetReader( - plc.io.SourceInfo(paths), - columns=with_columns, - # We handle skip_rows != 0 by reading from the - # up to n_rows + skip_rows and slicing off the - # first skip_rows entries. - # TODO: Remove this workaround once - # https://github.com/rapidsai/cudf/issues/16186 - # is fixed - nrows=n_rows + skip_rows, - skip_rows=0, + options, chunk_read_limit=parquet_options.get( "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE ), @@ -573,13 +578,18 @@ def slice_skip(tbl: plc.Table): if predicate is not None and row_index is None: # Can't apply filters during read if we have a row index. filters = to_parquet_filter(predicate.value) - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(paths), - columns=with_columns, - filters=filters, - nrows=n_rows, - skip_rows=skip_rows, - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(paths) + ).build() + if n_rows != -1: + options.set_num_rows(n_rows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if with_columns is not None: + options.set_columns(with_columns) + if filters is not None: + options.set_filter(filters) + tbl_w_meta = plc.io.parquet.read_parquet(options) df = DataFrame.from_table( tbl_w_meta.tbl, # TODO: consider nested column names? diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd index 7bd6ba91ca9..84f47cf5305 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_reader as cpp_chunked_parquet_reader, parquet_writer_options, parquet_writer_options_builder, + parquet_reader_options, + parquet_reader_options_builder, chunked_parquet_writer_options, chunked_parquet_writer_options_builder, ) @@ -27,6 +29,25 @@ from pylibcudf.table cimport Table from pylibcudf.types cimport DataType +cdef class ParquetReaderOptions: + cdef parquet_reader_options c_obj + cdef SourceInfo source + cpdef void set_row_groups(self, list row_groups) + cpdef void set_num_rows(self, size_type nrows) + cpdef void set_skip_rows(self, int64_t skip_rows) + cpdef void set_columns(self, list col_names) + cpdef void set_filter(self, Expression filter) + +cdef class ParquetReaderOptionsBuilder: + cdef parquet_reader_options_builder c_obj + cdef SourceInfo source + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val) + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val) + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val) + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val) + cpdef build(self) + + cdef class ChunkedParquetReader: cdef unique_ptr[cpp_chunked_parquet_reader] reader @@ -34,20 +55,7 @@ cdef class ChunkedParquetReader: cpdef TableWithMetadata read_chunk(self) -cpdef read_parquet( - SourceInfo source_info, - list columns = *, - list row_groups = *, - Expression filters = *, - bool convert_strings_to_categories = *, - bool use_pandas_metadata = *, - int64_t skip_rows = *, - size_type nrows = *, - bool allow_mismatched_pq_schemas = *, - # disabled see comment in parquet.pyx for more - # ReaderColumnSchema reader_column_schema = *, - # DataType timestamp_type = * -) +cpdef 
read_parquet(ParquetReaderOptions options)


cdef class ParquetChunkedWriter:
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
index 22bea1abd8e..2d8d12c1a45 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyi
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.

 from collections.abc import Mapping
-from typing import Self
+
+from typing_extensions import Self

 from pylibcudf.expressions import Expression
 from pylibcudf.io.types import (
@@ -16,6 +17,24 @@ from pylibcudf.io.types import (
 )
 from pylibcudf.table import Table

+class ParquetReaderOptions:
+    def __init__(self): ...
+    def set_row_groups(self, row_groups: list[list[int]]): ...
+    def set_num_rows(self, nrows: int): ...
+    def set_skip_rows(self, skip_rows: int): ...
+    def set_columns(self, col_names: list[str]): ...
+    def set_filter(self, filter: Expression): ...
+    @staticmethod
+    def builder(source: SourceInfo) -> ParquetReaderOptionsBuilder: ...
+
+class ParquetReaderOptionsBuilder:
+    def __init__(self): ...
+    def convert_strings_to_categories(self, val: bool) -> Self: ...
+    def use_pandas_metadata(self, val: bool) -> Self: ...
+    def allow_mismatched_pq_schemas(self, val: bool) -> Self: ...
+    def use_arrow_schema(self, val: bool) -> Self: ...
+    def build(self) -> ParquetReaderOptions: ...
+
 class ChunkedParquetReader:
     def __init__(
         self,
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 9bdf849a30c..672fe2be847 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -42,47 +42,204 @@ __all__ = [
     "ParquetWriterOptionsBuilder",
     "read_parquet",
     "write_parquet",
+    "ParquetReaderOptions",
+    "ParquetReaderOptionsBuilder",
     "ChunkedParquetWriterOptions",
     "ChunkedParquetWriterOptionsBuilder"
     "merge_row_group_metadata",
 ]

-cdef parquet_reader_options _setup_parquet_reader_options(
-    SourceInfo source_info,
-    list columns = None,
-    list row_groups = None,
-    Expression filters = None,
-    bool convert_strings_to_categories = False,
-    bool use_pandas_metadata = True,
-    int64_t skip_rows = 0,
-    size_type nrows = -1,
-    bool allow_mismatched_pq_schemas=False,
-    # ReaderColumnSchema reader_column_schema = None,
-    # DataType timestamp_type = DataType(type_id.EMPTY)
-):
-    cdef vector[string] col_vec
-    cdef parquet_reader_options opts = (
-        parquet_reader_options.builder(source_info.c_obj)
-        .convert_strings_to_categories(convert_strings_to_categories)
-        .use_pandas_metadata(use_pandas_metadata)
-        .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
-        .use_arrow_schema(True)
-        .build()
-    )
-    if row_groups is not None:
-        opts.set_row_groups(row_groups)
-    if nrows != -1:
-        opts.set_num_rows(nrows)
-    if skip_rows != 0:
-        opts.set_skip_rows(skip_rows)
-    if columns is not None:
-        col_vec.reserve(len(columns))
-        for col in columns:
-            col_vec.push_back(str(col).encode())
-        opts.set_columns(col_vec)
-    if filters is not None:
-        opts.set_filter(dereference(filters.c_obj.get()))
-    return opts
+
+cdef class ParquetReaderOptions:
+    """The settings to use for ``read_parquet``
+    For details, see :cpp:class:`cudf::io::parquet_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a ParquetReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::parquet_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the Parquet file from.
+ + Returns + ------- + ParquetReaderOptionsBuilder + Builder to build ParquetReaderOptions + """ + cdef ParquetReaderOptionsBuilder parquet_builder = ( + ParquetReaderOptionsBuilder.__new__(ParquetReaderOptionsBuilder) + ) + parquet_builder.c_obj = parquet_reader_options.builder(source.c_obj) + parquet_builder.source = source + return parquet_builder + + cpdef void set_row_groups(self, list row_groups): + """ + Sets list of individual row groups to read. + + Parameters + ---------- + row_groups : list + List of row groups to read + + Returns + ------- + None + """ + cdef vector[vector[size_type]] outer + cdef vector[size_type] inner + for row_group in row_groups: + for x in row_group: + inner.push_back(x) + outer.push_back(inner) + inner.clear() + + self.c_obj.set_row_groups(outer) + + cpdef void set_num_rows(self, size_type nrows): + """ + Sets number of rows to read. + + Parameters + ---------- + nrows : size_type + Number of rows to read after skip + + Returns + ------- + None + """ + self.c_obj.set_num_rows(nrows) + + cpdef void set_skip_rows(self, int64_t skip_rows): + """ + Sets number of rows to skip. + + Parameters + ---------- + skip_rows : int64_t + Number of rows to skip from start + + Returns + ------- + None + """ + self.c_obj.set_skip_rows(skip_rows) + + cpdef void set_columns(self, list col_names): + """ + Sets names of the columns to be read. + + Parameters + ---------- + col_names : list + List of column names + + Returns + ------- + None + """ + cdef vector[string] vec + for name in col_names: + vec.push_back(str(name).encode()) + self.c_obj.set_columns(vec) + + cpdef void set_filter(self, Expression filter): + """ + Sets AST based filter for predicate pushdown. + + Parameters + ---------- + filter : Expression + AST expression to use as filter + + Returns + ------- + None + """ + self.c_obj.set_filter(dereference(filter.c_obj.get())) + + +cdef class ParquetReaderOptionsBuilder: + cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val): + """ + Sets enable/disable conversion of strings to categories. + + Parameters + ---------- + val : bool + Boolean value to enable/disable conversion of string columns to categories + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.convert_strings_to_categories(val) + return self + + cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val): + """ + Sets to enable/disable use of pandas metadata to read. + + Parameters + ---------- + val : bool + Boolean value whether to use pandas metadata + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.use_pandas_metadata(val) + return self + + cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val): + """ + Sets to enable/disable reading of matching projected and filter + columns from mismatched Parquet sources. + + Parameters + ---------- + val : bool + Boolean value whether to read matching projected and filter + columns from mismatched Parquet sources. + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.allow_mismatched_pq_schemas(val) + return self + + cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val): + """ + Sets to enable/disable use of arrow schema to read. 
+ + Parameters + ---------- + val : bool + Boolean value whether to use arrow schema + + Returns + ------- + ParquetReaderOptionsBuilder + """ + self.c_obj.use_arrow_schema(val) + return self + + cpdef build(self): + """Create a ParquetReaderOptions object""" + cdef ParquetReaderOptions parquet_options = ParquetReaderOptions.__new__( + ParquetReaderOptions + ) + parquet_options.c_obj = move(self.c_obj.build()) + parquet_options.source = self.source + return parquet_options cdef class ChunkedParquetReader: @@ -93,63 +250,27 @@ cdef class ChunkedParquetReader: Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The names of the columns to be read - row_groups : list[list[size_type]], default None - List of row groups to be read. - use_pandas_metadata : bool, default True - If True, return metadata about the index column in - the per-file user metadata of the ``TableWithMetadata`` - convert_strings_to_categories : bool, default False - Whether to convert string columns to the category type - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. + options : ParquetReaderOptions + Settings for controlling reading behavior chunk_read_limit : size_t, default 0 Limit on total number of bytes to be returned per read, or 0 if there is no limit. pass_read_limit : size_t, default 1024000000 Limit on the amount of memory used for reading and decompressing data or 0 if there is no limit. - allow_mismatched_pq_schemas : bool, default False - Whether to read (matching) columns specified in `columns` from - the input files with otherwise mismatched schemas. """ def __init__( self, - SourceInfo source_info, - list columns=None, - list row_groups=None, - bool use_pandas_metadata=True, - bool convert_strings_to_categories=False, - int64_t skip_rows = 0, - size_type nrows = -1, + ParquetReaderOptions options, size_t chunk_read_limit=0, size_t pass_read_limit=1024000000, - bool allow_mismatched_pq_schemas=False ): - - cdef parquet_reader_options opts = _setup_parquet_reader_options( - source_info, - columns, - row_groups, - filters=None, - convert_strings_to_categories=convert_strings_to_categories, - use_pandas_metadata=use_pandas_metadata, - skip_rows=skip_rows, - nrows=nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - with nogil: self.reader.reset( new cpp_chunked_parquet_reader( chunk_read_limit, pass_read_limit, - opts + options.c_obj, ) ) @@ -184,69 +305,23 @@ cdef class ChunkedParquetReader: return TableWithMetadata.from_libcudf(c_result) -cpdef read_parquet( - SourceInfo source_info, - list columns = None, - list row_groups = None, - Expression filters = None, - bool convert_strings_to_categories = False, - bool use_pandas_metadata = True, - int64_t skip_rows = 0, - size_type nrows = -1, - bool allow_mismatched_pq_schemas = False, - # Disabled, these aren't used by cudf-python - # we should only add them back in if there's user demand - # ReaderColumnSchema reader_column_schema = None, - # DataType timestamp_type = DataType(type_id.EMPTY) -): - """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`. + +cpdef read_parquet(ParquetReaderOptions options): + """ + Read from Parquet format. + + The source to read from and options are encapsulated + by the `options` object. For details, see :cpp:func:`read_parquet`. 
Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the Parquet file from. - columns : list, default None - The string names of the columns to be read. - row_groups : list[list[size_type]], default None - List of row groups to be read. - filters : Expression, default None - An AST :py:class:`pylibcudf.expressions.Expression` - to use for predicate pushdown. - convert_strings_to_categories : bool, default False - Whether to convert string columns to the category type - use_pandas_metadata : bool, default True - If True, return metadata about the index column in - the per-file user metadata of the ``TableWithMetadata`` - skip_rows : int64_t, default 0 - The number of rows to skip from the start of the file. - nrows : size_type, default -1 - The number of rows to read. By default, read the entire file. - allow_mismatched_pq_schemas : bool, default False - If True, enable reading (matching) columns specified in `columns` - from the input files with otherwise mismatched schemas. - - Returns - ------- - TableWithMetadata - The Table and its corresponding metadata (column names) that were read in. + options: ParquetReaderOptions + Settings for controlling reading behavior """ - cdef table_with_metadata c_result - cdef parquet_reader_options opts = _setup_parquet_reader_options( - source_info, - columns, - row_groups, - filters, - convert_strings_to_categories, - use_pandas_metadata, - skip_rows, - nrows, - allow_mismatched_pq_schemas, - ) - with nogil: - c_result = move(cpp_read_parquet(opts)) + c_result = move(cpp_read_parquet(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index 94524acbcc8..da535809745 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -31,19 +31,24 @@ def test_read_parquet_basic( binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS ) - res = plc.io.parquet.read_parquet( - plc.io.SourceInfo([source]), - nrows=nrows, - skip_rows=skiprows, - columns=columns, - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + if nrows > -1: + options.set_num_rows(nrows) + if skiprows != 0: + options.set_skip_rows(skiprows) + if columns is not None: + options.set_columns(columns) + + res = plc.io.parquet.read_parquet(options) if columns is not None: pa_table = pa_table.select(columns) # Adapt to nrows/skiprows pa_table = pa_table.slice( - offset=skiprows, length=nrows if nrows != -1 else None + offset=skiprows, length=nrows if nrows > -1 else None ) assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) @@ -95,9 +100,12 @@ def test_read_parquet_filters( binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS ) - plc_table_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo([source]), filters=plc_filters - ) + options = plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo([source]) + ).build() + options.set_filter(plc_filters) + + plc_table_w_meta = plc.io.parquet.read_parquet(options) exp = read_table(source, filters=pa_filters) assert_table_and_meta_eq( exp, plc_table_w_meta, check_field_nullability=False From 14b4321b5172104c5d9801e196e607e3bb0c4c39 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:27:03 -0600 Subject: [PATCH 37/78] Fix all null list column with missing child column in 
JSON reader (#17348) Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Basit Ayantunde (https://github.com/lamarrr) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17348 --- cpp/src/io/json/host_tree_algorithms.cu | 126 ++++++++++++++------- cpp/src/io/json/json_column.cu | 67 ++++++------ cpp/src/io/json/nested_json.hpp | 12 ++ cpp/src/io/json/parser_features.cpp | 58 +++++++--- cpp/tests/io/json/json_test.cpp | 140 ++++++++++++++++++++++++ 5 files changed, 317 insertions(+), 86 deletions(-) diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7fafa885c66..7b9fc25d1cc 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -222,18 +222,19 @@ struct json_column_data { using hashmap_of_device_columns = std::unordered_map>; -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, cudf::detail::host_vector, hashmap_of_device_columns> + build_tree(device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); void scatter_offsets(tree_meta_t const& tree, device_span col_ids, @@ -242,6 +243,7 @@ void scatter_offsets(tree_meta_t const& tree, device_span sorted_col_ids, // Reuse this for parent_col_ids tree_meta_t const& d_column_tree, host_span ignore_vals, + host_span is_mixed, hashmap_of_device_columns const& columns, rmm::cuda_stream_view stream); @@ -363,17 +365,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto const [ignore_vals, is_mixed_pruned, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); if (ignore_vals.empty()) return; scatter_offsets(tree, col_ids, @@ -382,22 +384,24 @@ void make_device_json_column(device_span input, sorted_col_ids, d_column_tree, ignore_vals, + is_mixed_pruned, columns, stream); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple, cudf::detail::host_vector, hashmap_of_device_columns> + build_tree(device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& 
d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); @@ -488,7 +492,9 @@ std::pair, hashmap_of_device_columns> build_tree // NoPruning: iterate through schema and enforce type. if (adj[parent_node_sentinel].empty()) - return {cudf::detail::make_host_vector(0, stream), {}}; // for empty file + return {cudf::detail::make_host_vector(0, stream), + cudf::detail::make_host_vector(0, stream), + {}}; // for empty file CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1"); auto expected_types = cudf::detail::make_host_vector(num_columns, stream); std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); @@ -551,11 +557,14 @@ std::pair, hashmap_of_device_columns> build_tree auto list_child = schema.child_types.at(this_list_child_name); for (auto const& child_id : child_ids) mark_is_pruned(child_id, list_child); + // TODO: Store null map of non-target types for list children to mark list entry as null. } }; if (is_array_of_arrays) { if (adj[adj[parent_node_sentinel][0]].empty()) - return {cudf::detail::make_host_vector(0, stream), {}}; + return {cudf::detail::make_host_vector(0, stream), + cudf::detail::make_host_vector(0, stream), + {}}; auto root_list_col_id = is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0]; // mark root and row array col_id as not pruned. @@ -647,8 +656,12 @@ std::pair, hashmap_of_device_columns> build_tree ? adj[parent_node_sentinel][0] : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + // List children which are pruned mixed types, nullify parent list row. + auto is_mixed_pruned = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(is_mixed_pruned.begin(), num_columns, false); auto handle_mixed_types = [&column_categories, &is_str_column_all_nulls, + &is_mixed_pruned, &is_pruned, &expected_types, &is_enabled_mixed_types_as_string, @@ -794,6 +807,14 @@ std::pair, hashmap_of_device_columns> build_tree "list child column insertion failed, duplicate column name in the parent"); ref.get().column_order.emplace_back(list_child_name); auto this_ref = std::ref(ref.get().child_columns.at(list_child_name)); + if (options.is_enabled_experimental()) { + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) { + // store this child_id for mixed_type nullify parent list_id. + is_mixed_pruned[child_id] = is_pruned[child_id]; + } + } + } // Mixed type handling handle_mixed_types(child_ids); if (child_ids.empty()) { @@ -829,7 +850,7 @@ std::pair, hashmap_of_device_columns> build_tree [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? 
cat : exp; }); cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); - return {is_pruned, columns}; + return {is_pruned, is_mixed_pruned, columns}; } void scatter_offsets(tree_meta_t const& tree, @@ -839,6 +860,7 @@ void scatter_offsets(tree_meta_t const& tree, device_span sorted_col_ids, // Reuse this for parent_col_ids tree_meta_t const& d_column_tree, host_span ignore_vals, + host_span is_mixed_pruned, hashmap_of_device_columns const& columns, rmm::cuda_stream_view stream) { @@ -857,6 +879,8 @@ void scatter_offsets(tree_meta_t const& tree, auto d_ignore_vals = cudf::detail::make_device_uvector_async( ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_is_mixed_pruned = cudf::detail::make_device_uvector_async( + is_mixed_pruned, stream, cudf::get_current_device_resource_ref()); auto d_columns_data = cudf::detail::make_device_uvector_async( columns_data, stream, cudf::get_current_device_resource_ref()); @@ -921,9 +945,31 @@ void scatter_offsets(tree_meta_t const& tree, column_categories[col_ids[parent_node_id]] == NC_LIST and (!d_ignore_vals[col_ids[parent_node_id]]); }); + // For children of list and in ignore_vals, find it's parent node id, and set corresponding + // parent's null mask to null. Setting mixed type list rows to null. + auto const num_list_children = thrust::distance( + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + d_is_mixed_pruned = d_is_mixed_pruned.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return; + if (column_categories[col_ids[parent_node_id]] == NC_LIST and + d_is_mixed_pruned[col_ids[node_id]]) { + clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]); + } + }); - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 30a154fdda2..1fe58a0449f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -464,46 +464,49 @@ std::pair, std::vector> device_json_co column_names.emplace_back( json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); - // Note: json_col modified here, reuse the memory + // If child is not present, set the null mask correctly, but offsets are zero, and children + // are empty. Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique(data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release(), rmm::device_buffer{}, 0); // Create children column - auto child_schema_element = - json_col.child_columns.empty() ? 
std::optional{} : get_list_child_schema(); - auto [child_column, names] = - json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) - ? std::pair, - // EMPTY type could not used because gather throws exception on EMPTY type. - std::vector>{std::make_unique( - data_type{type_id::INT8}, - 0, - rmm::device_buffer{}, - rmm::device_buffer{}, - 0), - std::vector{}} - : device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - prune_columns, - child_schema_element, - stream, - mr); + auto child_schema_element = get_list_child_schema(); + auto [child_column, names] = [&]() { + if (json_col.child_columns.empty()) { + // EMPTY type could not used because gather throws exception on EMPTY type. + auto empty_col = make_empty_column( + child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), stream, mr); + auto children_metadata = std::vector{ + make_column_name_info( + child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), + list_child_name) + .children}; + + return std::pair, std::vector>{ + std::move(empty_col), children_metadata}; + } + return device_json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + prune_columns, + child_schema_element, + stream, + mr); + }(); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); - auto ret_col = make_lists_column(num_rows, - std::move(offsets_column), - std::move(child_column), - 0, - rmm::device_buffer{0, stream, mr}, - stream, - mr); - // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and - // null validation applied in make_lists_column factory, which is not needed for json - // parent column cannot be null when its children is non-empty in JSON - if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); } + auto ret_col = make_lists_column( + num_rows, + std::move(offsets_column), + std::move(child_column), + null_count, + null_count == 0 ? 
rmm::device_buffer{0, stream, mr} : std::move(result_bitmask), + stream, + mr); + // Since some rows in child column may need to be nullified due to mixed types, we can not + // skip the purge_nonempty_nulls call in make_lists_column factory return {std::move(ret_col), std::move(column_names)}; } default: CUDF_FAIL("Unsupported column type"); break; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 4989fff4b30..2f6942fe139 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -429,6 +429,18 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @brief Create empty column of a given nested schema + * + * @param schema The schema of the column to create + * @param stream The CUDA stream to which kernels are dispatched + * @param mr resource with which to allocate + * @return The empty column + */ +std::unique_ptr make_empty_column(schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @brief Create all null column of a given nested schema * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index ced7acb9cde..2da320b2af3 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -159,7 +159,17 @@ struct empty_column_functor { std::unique_ptr child = cudf::type_dispatcher( schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name)); auto offsets = make_empty_column(data_type(type_to_id())); - return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr); + std::vector> child_columns; + child_columns.push_back(std::move(offsets)); + child_columns.push_back(std::move(child)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on + // the child column as it does not have non-empty nulls. 
Look issue #17356 + return std::make_unique(cudf::data_type{type_id::LIST}, + 0, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0, + std::move(child_columns)); } template )> @@ -174,6 +184,13 @@ struct empty_column_functor { } }; +std::unique_ptr make_empty_column(schema_element const& schema, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + return cudf::type_dispatcher(schema.type, empty_column_functor{stream, mr}, schema); +} + /// Created all null column of the specified schema struct allnull_column_functor { rmm::cuda_stream_view stream; @@ -198,10 +215,9 @@ struct allnull_column_functor { std::unique_ptr operator()(schema_element const& schema, size_type size) const { CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child"); - auto const& child_name = schema.child_types.begin()->first; - std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, - empty_column_functor{stream, mr}, - schema.child_types.at(child_name)); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = + make_empty_column(schema.child_types.at(child_name), stream, mr); return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr); auto indices = make_zeroed_offsets(size - 1); auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); @@ -221,14 +237,22 @@ struct allnull_column_functor { std::unique_ptr operator()(schema_element const& schema, size_type size) const { CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child"); - auto const& child_name = schema.child_types.begin()->first; - std::unique_ptr child = cudf::type_dispatcher(schema.child_types.at(child_name).type, - empty_column_functor{stream, mr}, - schema.child_types.at(child_name)); - auto offsets = make_zeroed_offsets(size); + auto const& child_name = schema.child_types.begin()->first; + std::unique_ptr child = + make_empty_column(schema.child_types.at(child_name), stream, mr); + auto offsets = make_zeroed_offsets(size); auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - return make_lists_column( - size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr); + std::vector> child_columns; + child_columns.push_back(std::move(offsets)); + child_columns.push_back(std::move(child)); + // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on + // the child column as it does not have non-empty nulls. Look issue #17356 + return std::make_unique(cudf::data_type{type_id::LIST}, + size, + rmm::device_buffer{}, + std::move(null_mask), + size, + std::move(child_columns)); } template )> @@ -240,8 +264,14 @@ struct allnull_column_functor { schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size)); } auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - return make_structs_column( - size, std::move(child_columns), size, std::move(null_mask), stream, mr); + // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` on + // the children columns. 
Look issue #17356 + return std::make_unique(cudf::data_type{type_id::STRUCT}, + size, + rmm::device_buffer{}, + std::move(null_mask), + size, + std::move(child_columns)); } }; diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 3c8db99c3c7..37a750330fa 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -56,6 +56,8 @@ using int16_wrapper = wrapper; using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using size_type_wrapper = wrapper; +using strings_wrapper = cudf::test::strings_column_wrapper; using cudf::data_type; using cudf::type_id; @@ -3253,6 +3255,144 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped); } } + + // test list (all-null) of struct (empty) of string (empty) + { + std::string json_stringl = R"( + {"a" : [1], "c2": [1, 2]} + {} + )"; + auto lines = true; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_stringl.data(), json_stringl.size()}) + .prune_columns(true) + .experimental(true) + .lines(lines); + + cudf::io::schema_element dtype_schema{ + data_type{cudf::type_id::STRUCT}, + { + {"a", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}, + {"c2", + {data_type{cudf::type_id::LIST}, + {{"element", + {data_type{cudf::type_id::STRUCT}, + { + {"d", {data_type{cudf::type_id::STRING}}}, + }, + {{"d"}}}}}}}, + }, + {{"a", "c2"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a":[int64_t] + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element"); + // Make sure we have all null list "c2": [{"d": ""}] + EXPECT_EQ(result.metadata.schema_info[1].name, "c2"); + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "element"); + ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "d"); + + auto const expected0 = [&] { + auto const valids = std::vector{1, 0}; + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(valids.begin(), valids.end()); + return cudf::make_lists_column(2, + size_type_wrapper{0, 1, 1}.release(), + int64_wrapper{1}.release(), + null_count, + std::move(null_mask)); + }(); + + auto const expected1 = [&] { + auto const get_structs = [] { + auto child = cudf::test::strings_column_wrapper{}; + return cudf::test::structs_column_wrapper{{child}}; + }; + auto const valids = std::vector{0, 0}; + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(valids.begin(), valids.end()); + return cudf::make_lists_column(2, + size_type_wrapper{0, 0, 0}.release(), + get_structs().release(), + null_count, + std::move(null_mask)); + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected0, result.tbl->get_column(0).view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected1, result.tbl->get_column(1).view()); + } +} + +TEST_F(JsonReaderTest, NullifyMixedList) +{ + using namespace 
cudf::test::iterators; + // test list + std::string json_stringl = R"( + {"c2": []} + {"c2": [{}]} + {"c2": [[]]} + {"c2": [{}, [], {}]} + {"c2": [[123], {"b": "1"}]} + {"c2": [{"x": "y"}, {"b": "1"}]} + {} + )"; + // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null + // valid 1 1 0 0 0 1 0 + // ofset 0, 0, 1, 1, 1, 1, 3, 3 + // child {null, null}, {null, null}, {1, null} + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_stringl.data(), json_stringl.size()}) + .prune_columns(true) + .experimental(true) + .lines(true); + + // struct>> eg. {"c2": [{"b": "1", "c": "2"}]} + cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT}, + { + {"c2", + {data_type{cudf::type_id::LIST}, + {{"element", + {data_type{cudf::type_id::STRUCT}, + { + {"b", {data_type{cudf::type_id::STRING}}}, + {"c", {data_type{cudf::type_id::STRING}}}, + }, + {{"b", "c"}}}}}}}, + }, + {{"c2"}}}; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + + // Expected: A list of struct of 2-string columns + // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null + auto get_structs = [] { + strings_wrapper child0{{"", "", "1"}, nulls_at({0, 0, 1})}; + strings_wrapper child1{{"", "", ""}, all_nulls()}; + // purge non-empty nulls in list seems to retain nullmask in struct child column + return cudf::test::structs_column_wrapper{{child0, child1}, no_nulls()}.release(); + }; + std::vector const list_nulls{1, 1, 0, 0, 0, 1, 0}; + auto [null_mask, null_count] = + cudf::test::detail::make_null_mask(list_nulls.cbegin(), list_nulls.cend()); + auto const expected = cudf::make_lists_column( + 7, + cudf::test::fixed_width_column_wrapper{0, 0, 1, 1, 1, 1, 3, 3}.release(), + get_structs(), + null_count, + std::move(null_mask)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, result.tbl->get_column(0).view()); } struct JsonCompressedIOTest : public cudf::test::BaseFixture, From 80fc629aab1cc459b9ff8f0e9fee379a82219815 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sat, 7 Dec 2024 01:41:33 -0600 Subject: [PATCH 38/78] Update cuda-python lower bounds to 12.6.2 / 11.8.5 (#17547) We require a newer cuda-python lower bound for new features and to use the new layout. This will fix a number of errors observed when the runtime version of cuda-python is older than the version used to build packages using Cython features from cuda-python. See https://github.com/rapidsai/build-planning/issues/117#issuecomment-2524250915 for details. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17547 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/pylibcudf/meta.yaml | 4 ++-- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 87c40421be0..bad508154aa 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.8.5,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0935de96d19..969124a29ad 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.6.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e52b8c5f2a0..2c16deeed82 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 3d965f30986..08eab363af0 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.8.5,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.6.2,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index 044c7d187b3..3c55ce2c614 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -679,10 +679,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.6.2,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.8.5,<12.0a0 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: @@ -705,10 +705,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.6.2,<13.0a0 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.8.5,<12.0a0 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 
80de9056a0a..21c18ef0174 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.8.5,<12.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a5e5704b8ed..53ee3e2b56e 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.8.5,<12.0a0", "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", From a0fc6a89a596ebae7df436be25aed70ec908f83e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:33:08 -0500 Subject: [PATCH 39/78] Use cooperative-groups instead of cub warp-reduce for strings contains (#17540) Replaces the `cub::WarpReduce` usage in `cudf::strings::contains` with cooperative-groups `any()`. The change is only for the `contains_warp_parallel` kernel which is used for wider strings. Using cooperative-groups generates more efficient code for the same results and gives an additional 11-14% performance improvement. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17540 --- cpp/src/strings/search/find.cu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index 0f33fcb6fe1..94bc81ec933 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -347,13 +348,15 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, string_view const d_target, bool* d_results) { - auto const idx = cudf::detail::grid_1d::global_thread_id(); - using warp_reduce = cub::WarpReduce; - __shared__ typename warp_reduce::TempStorage temp_storage; + auto const idx = cudf::detail::grid_1d::global_thread_id(); auto const str_idx = idx / cudf::detail::warp_size; if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = idx % cudf::detail::warp_size; + + namespace cg = cooperative_groups; + auto const warp = cg::tiled_partition(cg::this_thread_block()); + auto const lane_idx = warp.thread_rank(); + if (d_strings.is_null(str_idx)) { return; } // get the string for this warp auto const d_str = d_strings.element(str_idx); @@ -373,7 +376,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings, } } - auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max()); + auto const result = warp.any(found); if (lane_idx == 0) { d_results[str_idx] = result; } } From 0f5d4b9514b92f69465f4d76b1f9db1c5a37f33a Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:41:26 -0500 Subject: [PATCH 40/78] Remove unused IO utilities from cudf python (#17374) Removes unused IO utilities from cuDF Python. 
Depends on #17163 #16042 #17252 #17263 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17374 --- python/cudf/cudf/_lib/io/utils.pxd | 6 +-- python/cudf/cudf/_lib/io/utils.pyx | 87 ++---------------------------- 2 files changed, 5 insertions(+), 88 deletions(-) diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 96504ebdd66..9b8bab012e2 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -13,9 +13,6 @@ from pylibcudf.libcudf.io.types cimport ( from cudf._lib.column cimport Column -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & data) except* -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef add_df_col_struct_names( df, child_names_dict @@ -26,7 +23,8 @@ cdef update_col_struct_field_names( ) cdef update_struct_field_names( table, - vector[column_name_info]& schema_info) + vector[column_name_info]& schema_info +) cdef Column update_column_struct_field_names( Column col, column_name_info& info diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index f23980b387a..df4675be599 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,97 +1,16 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython.buffer cimport PyBUF_READ -from cpython.memoryview cimport PyMemoryView_FromMemory -from libcpp.memory cimport unique_ptr + from libcpp.string cimport string -from libcpp.utility cimport move + from libcpp.vector cimport vector -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, -) +from pylibcudf.libcudf.io.types cimport column_name_info from cudf._lib.column cimport Column -import codecs -import io -import os - from cudf.core.dtypes import StructDtype -# Converts the Python sink input to libcudf IO sink_info. -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & sink -) except*: - cdef vector[data_sink *] data_sinks - cdef vector[string] paths - if isinstance(src[0], io.StringIO): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.TextIOBase): - data_sinks.reserve(len(src)) - for s in src: - # Files opened in text mode expect writes to be str rather than - # bytes, which requires conversion from utf-8. If the underlying - # buffer is utf-8, we can bypass this conversion by writing - # directly to it. 
- if codecs.lookup(s.encoding).name not in {"utf-8", "ascii"}: - raise NotImplementedError(f"Unsupported encoding {s.encoding}") - sink.push_back( - unique_ptr[data_sink](new iobase_data_sink(s.buffer)) - ) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.IOBase): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], (basestring, os.PathLike)): - paths.reserve(len(src)) - for s in src: - paths.push_back( os.path.expanduser(s).encode()) - return sink_info(move(paths)) - else: - raise TypeError("Unrecognized input type: {}".format(type(src))) - - -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & sink) except*: - cdef vector[unique_ptr[data_sink]] datasinks - cdef sink_info info = make_sinks_info([src], datasinks) - if not datasinks.empty(): - sink.swap(datasinks[0]) - return info - - -# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you -# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) -cdef cppclass iobase_data_sink(data_sink): - object buf - - iobase_data_sink(object buf_): - this.buf = buf_ - - void host_write(const void * data, size_t size) with gil: - if isinstance(buf, io.StringIO): - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) - .tobytes().decode()) - else: - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) - - void flush() with gil: - buf.flush() - - size_t bytes_written() with gil: - return buf.tell() - - cdef add_df_col_struct_names(df, child_names_dict): for name, child_names in child_names_dict.items(): col = df._data[name] From ba3ed5773171a545d43d9e0f598c6c2eb37ec122 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 9 Dec 2024 10:08:13 -0800 Subject: [PATCH 41/78] Fix nvcc-imposed UB in `constexpr` functions (#17534) nvcc does not support `constexpr` functions that are not well-defined to call from the device. This is UB even when the function is not called from the device. Throwing an exception is one such operation. This PR cleans up error handling for functions that are called from device, and removes `constexpr` from the ones that are not actually used from the device, or in the constexpr context. Authors: - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) Approvers: - Karthikeyan (https://github.com/karthikeyann) - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17534 --- .../cudf/detail/utilities/device_operators.cuh | 18 +++++++++++++++++- cpp/include/cudf/utilities/span.hpp | 2 ++ cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/utilities/time_utils.cuh | 6 +++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..d16be5e22dd 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,7 +83,11 @@ struct DeviceSum { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -141,7 +145,11 @@ struct DeviceMin { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -189,7 +197,11 @@ struct DeviceMax { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -225,7 +237,11 @@ struct DeviceProduct { template ()>* = nullptr> static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } }; diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 21ee4fa9e9b..2273a89892b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -417,7 +417,9 @@ class base_2dspan { constexpr base_2dspan(RowType flat_view, size_t columns) : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { +#ifndef __CUDA_ARCH__ CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); +#endif } /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d432deb8e79..76e5369ffd0 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -506,7 +506,7 @@ size_t max_varint_size() return cudf::util::div_rounding_up_unsafe(sizeof(T) * 8, 7); } -constexpr size_t RLE_stream_size(TypeKind kind, size_t count) +size_t RLE_stream_size(TypeKind kind, size_t count) { using cudf::util::div_rounding_up_unsafe; constexpr auto byte_rle_max_len = 128; diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh index 687766c1bcc..ff1b9f58e6c 100644 --- a/cpp/src/io/utilities/time_utils.cuh +++ b/cpp/src/io/utilities/time_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ static const __device__ __constant__ int32_t powers_of_ten[10] = { struct get_period { template - constexpr int32_t operator()() + int32_t operator()() { if constexpr (is_chrono()) { return T::period::den; } CUDF_FAIL("Invalid, non chrono type"); @@ -42,7 +42,7 @@ struct get_period { /** * @brief Function that translates cuDF time unit to clock frequency */ -constexpr int32_t to_clockrate(type_id timestamp_type_id) +inline int32_t to_clockrate(type_id timestamp_type_id) { return timestamp_type_id == type_id::EMPTY ? 
0

From ed2892c8a4f00ad376e7b020d09371902fbf6b68 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 9 Dec 2024 13:47:22 -0500
Subject: [PATCH 42/78] Document undefined behavior in div_rounding_up_safe
 (#17542)

Adds more description to the `div_rounding_up_safe` utility, identifying its
undefined behavior.
Closes #17539

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Lawrence Mitchell (https://github.com/wence-)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17542
---
 .../cudf/detail/utilities/integer_utils.hpp   | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index 8b709f2a8f8..957b6b70fe2 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  *     Copyright 2019 Eyal Rozenberg
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend,
 } // namespace detail

 /**
- * Divides the left-hand-side by the right-hand-side, rounding up
+ * @brief Divides the left-hand-side by the right-hand-side, rounding up
  * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3.
 *
- * @param dividend the number to divide
- * @param divisor the number of by which to divide
- * @return The least integer multiple of {@link divisor} which is greater than or equal to
- * the non-integral division dividend/divisor.
+ * The result is undefined if `divisor == 0` or
+ * if `divisor == -1` and `dividend == min()`.
+ *
+ * Will not overflow, and may _or may not_ be slower than the intuitive
+ * approach of using `(dividend + divisor - 1) / divisor`.
 *
- * @note will not overflow, and may _or may not_ be slower than the intuitive
- * approach of using (dividend + divisor - 1) / divisor
+ * @tparam I Integer type for `dividend`, `divisor`, and the return type
+ * @param dividend The number to divide
+ * @param divisor The number by which to divide
+ * @return The least integer multiple of `divisor` which is greater than or equal to
+ * the non-integral division `dividend/divisor`
 */
 template
 constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept

From a79077cf67ff2154c2e0cd8b40891a8ec6d1712c Mon Sep 17 00:00:00 2001
From: Alessandro Bellina
Date: Mon, 9 Dec 2024 13:06:53 -0600
Subject: [PATCH 43/78] [JNI] Enables fabric handles for CUDA async memory pools (#17526)

This PR adds a `CUDA_ASYNC_FABRIC` allocation mode to `RmmAllocationMode` and pipes the corresponding options through to RMM's `cuda_async_memory_resource`, requesting `fabric` as the handle type and `read_write` as the memory protection mode (that is the only protection mode supported by the pools, and it is required for IPC). If `CUDA_ASYNC` is used, fabric handles are not requested and the memory protection is `none`.
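A minimal usage sketch (not part of this PR's diff) of how a JVM client might opt into the new mode. The API names below are taken from the diff; the pool sizes and the wrapper class are illustrative assumptions only:

```java
import ai.rapids.cudf.Rmm;
import ai.rapids.cudf.RmmAllocationMode;
import ai.rapids.cudf.RmmCudaAsyncMemoryResource;
import ai.rapids.cudf.RmmDeviceMemoryResource;

public class FabricPoolExample {
  public static void main(String[] args) {
    // Illustrative pool size only; pick sizes appropriate for your device.
    long poolSize = 1024L * 1024L * 1024L; // 1 GiB

    // Option 1: one-shot initialization with the new mode; a null LogConf
    // disables RMM logging (per the initialize() javadoc). Normally you
    // would use only one of the two options shown here.
    Rmm.initialize(RmmAllocationMode.CUDA_ASYNC_FABRIC, null, poolSize);

    // Option 2: construct the resource directly, requesting peer
    // read+write accessible fabric handles for the pool via the new
    // three-argument constructor added by this change.
    RmmDeviceMemoryResource mr =
        new RmmCudaAsyncMemoryResource(poolSize, poolSize, /*fabric=*/ true);
  }
}
```

Note that, per the diff, `initialize` additionally wraps the fabric pool in an `RmmLimitingResourceAdaptor`, while direct construction of `RmmCudaAsyncMemoryResource` does not.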
Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17526 --- java/src/main/java/ai/rapids/cudf/Rmm.java | 11 ++++++---- .../ai/rapids/cudf/RmmAllocationMode.java | 7 ++++++- .../cudf/RmmCudaAsyncMemoryResource.java | 15 ++++++++++++-- java/src/main/native/src/RmmJni.cpp | 20 ++++++++++++++----- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index ed029c918e4..d1cc0cc96fe 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) * {@link RmmAllocationMode#CUDA_DEFAULT}, * {@link RmmAllocationMode#POOL}, * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and + * {@link RmmAllocationMode#CUDA_ASYNC}, + * {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} * @param logConf How to do logging or null if you don't want to * @param poolSize The initial pool size in bytes @@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0; boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0; boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; + boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; if (isAsync && isManaged) { @@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, } else if (isAsync) { resource = new RmmLimitingResourceAdaptor<>( new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512); + } else if (isAsyncFabric) { + resource = new RmmLimitingResourceAdaptor<>( + new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512); } else if (isManaged) { resource = new RmmManagedMemoryResource(); } else { @@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { private static native long allocInternal(long size, long stream) throws RmmException; - static native void free(long ptr, long length, long stream) throws RmmException; /** @@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle, static native void releaseArenaMemoryResource(long handle); - static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException; + static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException; static native void releaseCudaAsyncMemoryResource(long handle); @@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path, static native void releaseLoggingResourceAdaptor(long handle); - static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException; static native void releaseTrackingResourceAdaptor(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java index 966c21bee22..3f7bc1fae76 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java +++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA 
CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,4 +36,9 @@ public class RmmAllocationMode { * Use CUDA async suballocation strategy */ public static final int CUDA_ASYNC = 0x00000008; + /** + * Use CUDA async suballocation strategy with fabric handles that are + * peer accessible with read-write access + */ + public static final int CUDA_ASYNC_FABRIC = 0x00000010; } diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java index fa1f13cb7ed..cf4936e2e24 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java +++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource { * @param releaseThreshold size in bytes for when memory is released back to cuda */ public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) { + this(size, releaseThreshold, false); + } + + /** + * Create a new async memory resource + * @param size the initial size of the pool + * @param releaseThreshold size in bytes for when memory is released back to cuda + * @param fabric if true request peer read+write accessible fabric handles when + * creating the pool + */ + public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) { this.size = size; this.releaseThreshold = releaseThreshold; - handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold); + handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric); } @Override diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 23c7b7fb243..0f424761bfe 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -772,14 +772,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv CATCH_STD(env, ) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env, - jclass clazz, - jlong init, - jlong release) +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( + JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric) { try { cudf::jni::auto_set_device(env); - auto ret = new rmm::mr::cuda_async_memory_resource(init, release); + + // When we are using fabric, we need to set the memory access to be + // read_write, in order for peer GPUs to have access to this memory. + // Otherwise, choose default parameters (optional set to nullopt). + auto [handle_type, prot_flag] = + fabric + ? 
std::pair{std::optional{ + rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, + std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} + : std::pair{std::nullopt, std::nullopt}; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); + return reinterpret_cast(ret); } CATCH_STD(env, 0) From f5955929b06e2a4609b9fca0e3f949afb9b1dadd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:22:04 -0800 Subject: [PATCH 44/78] Remove cudf._lib.string.convert/split in favor of inlining pylibcudf (#17496) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17496 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/strings/CMakeLists.txt | 15 -- python/cudf/cudf/_lib/strings/__init__.py | 15 -- .../cudf/_lib/strings/convert/CMakeLists.txt | 24 -- .../cudf/_lib/strings/convert/__init__.pxd | 0 .../cudf/_lib/strings/convert/__init__.py | 0 .../strings/convert/convert_fixed_point.pyx | 76 ------ .../_lib/strings/convert/convert_floats.pyx | 19 -- .../_lib/strings/convert/convert_integers.pyx | 20 -- .../_lib/strings/convert/convert_lists.pyx | 32 --- .../_lib/strings/convert/convert_urls.pyx | 48 ---- .../cudf/_lib/strings/split/CMakeLists.txt | 22 -- .../cudf/cudf/_lib/strings/split/__init__.pxd | 0 .../cudf/cudf/_lib/strings/split/__init__.py | 0 .../cudf/_lib/strings/split/partition.pyx | 35 --- python/cudf/cudf/_lib/strings/split/split.pyx | 155 ----------- python/cudf/cudf/core/column/decimal.py | 15 +- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 246 +++++++++++++++--- python/cudf/cudf/core/tools/datetimes.py | 5 +- python/cudf/cudf/core/tools/numeric.py | 66 ++--- 22 files changed, 262 insertions(+), 543 deletions(-) delete mode 100644 python/cudf/cudf/_lib/strings/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/convert/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_lists.pyx delete mode 100644 python/cudf/cudf/_lib/strings/convert/convert_urls.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/split/__init__.py delete mode 100644 python/cudf/cudf/_lib/strings/split/partition.pyx delete mode 100644 python/cudf/cudf/_lib/strings/split/split.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e98cf283bbb..f9ac3a16940 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -46,4 +46,3 @@ target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) add_subdirectory(nvtext) -add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 4758a933898..52e9b89da7b 100644 --- 
a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -12,7 +12,6 @@ sort, stream_compaction, string_casting, - strings, strings_udf, ) diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt deleted file mode 100644 index dca9c4cc3fc..00000000000 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= -add_subdirectory(convert) -add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index b795c54c112..341ba6d11c3 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -32,18 +32,3 @@ detokenize, tokenize_with_vocabulary, ) -from cudf._lib.strings.convert.convert_fixed_point import to_decimal -from cudf._lib.strings.convert.convert_floats import is_float -from cudf._lib.strings.convert.convert_integers import is_integer -from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.split.partition import partition, rpartition -from cudf._lib.strings.split.split import ( - rsplit, - rsplit_re, - rsplit_record, - rsplit_record_re, - split, - split_re, - split_record, - split_record_re, -) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt deleted file mode 100644 index e8a76b476a8..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx - convert_lists.pyx convert_urls.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx deleted file mode 100644 index 96dcd021c3b..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf as plc - - -@acquire_spill_lock() -def from_decimal(Column input_col): - """ - Converts a `Decimal64Column` to a `StringColumn`. - - Parameters - ---------- - input_col : input column of type decimal - - Returns - ------- - A column of strings representing the input decimal values. - """ - plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( - input_col.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def to_decimal(Column input_col, object out_type): - """ - Returns a `Decimal64Column` from the provided `StringColumn` - using the scale in the `out_type`. - - Parameters - ---------- - input_col : input column of type string - out_type : The type and scale of the decimal column expected - - Returns - ------- - A column of decimals parsed from the string values. - """ - plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(out_type), - ) - result = Column.from_pylibcudf(plc_column) - result.dtype.precision = out_type.precision - return result - - -@acquire_spill_lock() -def is_fixed_point(Column input_col, object dtype): - """ - Returns a Column of boolean values with True for `input_col` - that have fixed-point characters. The output row also has a - False value if the corresponding string would cause an integer - overflow. The scale of the `dtype` is used to determine overflow - in the output row. - - Parameters - ---------- - input_col : input column of type string - dtype : The type and scale of a decimal column - - Returns - ------- - A Column of booleans indicating valid decimal conversion. - """ - plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( - input_col.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx deleted file mode 100644 index 5da6e3f10cc..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - plc_column = plc.strings.convert.convert_floats.is_float( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx deleted file mode 100644 index 50113347ccb..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have integers. - """ - return Column.from_pylibcudf( - plc.strings.convert.convert_integers.is_integer( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx deleted file mode 100644 index 3a2cb4bd5c7..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def format_list_column(Column source_list, Column separators): - """ - Format a list column of strings into a strings column. - - Parameters - ---------- - input_col : input column of type list with strings child. - - separators: strings used for formatting (', ', '[', ']') - - Returns - ------- - Formatted strings column - """ - plc_column = plc.strings.convert.convert_lists.format_list_column( - source_list.to_pylibcudf(mode="read"), - as_device_scalar("None").c_value, - separators.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx deleted file mode 100644 index d5c2f771970..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def url_decode(Column source_strings): - """ - Decode each string in column. No format checking is performed. - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL decoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_decode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def url_encode(Column source_strings): - """ - Encode each string in column. No format checking is performed. - All characters are encoded except for ASCII letters, digits, - and these characters: '.','_','-','~'. Encoding converts to - hex using UTF-8 encoded bytes. 
- - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL encoded string column - """ - plc_column = plc.strings.convert.convert_urls.url_encode( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt deleted file mode 100644 index 4ede0a2fac5..00000000000 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources partition.pyx split.pyx) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx deleted file mode 100644 index 5319addc41c..00000000000 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def partition(Column source_strings, - object py_delimiter): - """ - Returns data by splitting the `source_strings` - column at the first occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.partition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rpartition(Column source_strings, - object py_delimiter): - """ - Returns a Column by splitting the `source_strings` - column at the last occurrence of the specified `py_delimiter`. - """ - plc_table = plc.strings.split.partition.rpartition( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx deleted file mode 100644 index 4ec6c7073d8..00000000000 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def split(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_table = plc.strings.split.split.split( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - plc_column = plc.strings.split.split.split_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_table = plc.strings.split.split.rsplit( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - plc_column = plc.strings.split.split.rsplit_record( - source_strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def split_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - plc_table = plc.strings.split.split.split_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def rsplit_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - plc_table = plc.strings.split.split.rsplit_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) - - -@acquire_spill_lock() -def split_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. 
- """ - plc_column = plc.strings.split.split.split_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def rsplit_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - plc_column = plc.strings.split.split.rsplit_record_re( - source_strings.to_pylibcudf(mode="read"), - plc.strings.regex_program.RegexProgram.create( - str(pattern), - plc.strings.regex_flags.RegexFlags.DEFAULT, - ), - maxsplit, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 2c22724d3d7..9e6a73f1a9c 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -10,13 +10,12 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc + import cudf -from cudf._lib.strings.convert.convert_fixed_point import ( - from_decimal as cpp_from_decimal, -) from cudf.api.types import is_scalar from cudf.core._internals import binaryop, unary -from cudf.core.buffer import as_buffer +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import ( @@ -89,7 +88,13 @@ def as_decimal_column( def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: - return cpp_from_decimal(self) + with acquire_spill_lock(): + plc_column = ( + plc.strings.convert.convert_fixed_point.from_fixed_point( + self.to_pylibcudf(mode="read"), + ) + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] else: return cast( cudf.core.column.StringColumn, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ea384888388..b95fb0a0d39 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock @@ -272,8 +271,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn: # Separator strings to match the Python format separators = as_column([", ", "[", "]"]) - # Call libcudf to format the list column - return format_list_column(lc, separators) + with acquire_spill_lock(): + plc_column = plc.strings.convert.convert_lists.format_list_column( + lc.to_pylibcudf(mode="read"), + cudf.Scalar("None").device_value.c_value, + separators.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] def _transform_leaves(self, func, *args, **kwargs) -> Self: # return a new list column with the same nested structure diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6b45828568c..4a2483a80e3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,6 +19,7 @@ import cudf.api.types import cudf.core.column.column as column import 
cudf.core.column.datetime as datetime +from cudf import _lib as libcudf from cudf._lib import string_casting as str_cast, strings as libstrings from cudf._lib.column import Column from cudf._lib.types import size_type_dtype @@ -44,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.numerical import NumericalColumn def str_to_boolean(column: StringColumn): @@ -1336,7 +1338,7 @@ def isinteger(self) -> SeriesOrIndex: 2 False dtype: bool """ - return self._return_or_inplace(libstrings.is_integer(self._column)) + return self._return_or_inplace(self._column.is_integer()) def ishex(self) -> SeriesOrIndex: """ @@ -1468,7 +1470,7 @@ def isfloat(self) -> SeriesOrIndex: 3 False dtype: bool """ - return self._return_or_inplace(libstrings.is_float(self._column)) + return self._return_or_inplace(self._column.is_float()) def isdecimal(self) -> SeriesOrIndex: """ @@ -2710,26 +2712,25 @@ def split( if len(str(pat)) <= 1: regex = False + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.split_re(self._column, pat, n) + data = self._column.split_re(pat, n) else: - data = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.split(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.split_record_re(self._column, pat, n) + result_table = self._column.split_record_re(pat, n) else: - result_table = libstrings.split_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.split_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2883,28 +2884,25 @@ def rsplit( if regex and isinstance(pat, re.Pattern): pat = pat.pattern + result_table: StringColumn | dict[int, StringColumn] if expand: if self._column.null_count == len(self._column): result_table = {0: self._column.copy()} else: if regex is True: - data = libstrings.rsplit_re(self._column, pat, n) + data = self._column.rsplit_re(pat, n) else: - data = libstrings.rsplit( - self._column, cudf.Scalar(pat, "str"), n - ) + data = self._column.rsplit(cudf.Scalar(pat, "str"), n) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: result_table = data else: if regex is True: - result_table = libstrings.rsplit_record_re( - self._column, pat, n - ) + result_table = self._column.rsplit_record_re(pat, n) else: - result_table = libstrings.rsplit_record( - self._column, cudf.Scalar(pat, "str"), n + result_table = self._column.rsplit_record( + cudf.Scalar(pat, "str"), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2989,7 +2987,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str")), + self._column.partition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -3054,7 +3052,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), + self._column.rpartition(cudf.Scalar(sep, "str")), expand=expand, ) @@ -4499,8 +4497,7 @@ def url_decode(self) -> SeriesOrIndex: 1 https://medium.com/rapids-ai dtype: object """ - - return 
self._return_or_inplace(libstrings.url_decode(self._column)) + return self._return_or_inplace(self._column.url_decode()) def url_encode(self) -> SeriesOrIndex: """ @@ -4531,7 +4528,7 @@ def url_encode(self) -> SeriesOrIndex: 1 https%3A%2F%2Fmedium.com%2Frapids-ai dtype: object """ - return self._return_or_inplace(libstrings.url_encode(self._column)) + return self._return_or_inplace(self._column.url_encode()) def code_points(self) -> SeriesOrIndex: """ @@ -6015,13 +6012,13 @@ def as_numerical_column( out_dtype = cudf.api.types.dtype(dtype) string_col = self if out_dtype.kind in {"i", "u"}: - if not libstrings.is_integer(string_col).all(): + if not string_col.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not libstrings.is_float(string_col).all(): + if not string_col.is_float().all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." @@ -6099,10 +6096,17 @@ def as_timedelta_column( ) -> cudf.core.column.TimeDeltaColumn: return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] + @acquire_spill_lock() def as_decimal_column( self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return libstrings.to_decimal(self, dtype) + ) -> cudf.core.column.DecimalBaseColumn: + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + self.to_pylibcudf(mode="read"), + libcudf.types.dtype_to_pylibcudf_type(dtype), + ) + result = Column.from_pylibcudf(plc_column) + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result # type: ignore[return-value] def as_string_column(self) -> StringColumn: return self @@ -6138,12 +6142,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: if self.dtype == to_dtype: return True - elif ( - to_dtype.kind in {"i", "u"} - and not libstrings.is_integer(self).all() - ): + elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): return False - elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): + elif to_dtype.kind == "f" and not self.is_float().all(): return False else: return True @@ -6333,11 +6334,180 @@ def title(self) -> Self: def is_title(self) -> Self: return self._modify_characters(plc.strings.capitalize.is_title) + @acquire_spill_lock() def replace_multiple(self, pattern: Self, replacements: Self) -> Self: - with acquire_spill_lock(): - plc_result = plc.strings.replace.replace_multiple( - self.to_pylibcudf(mode="read"), - pattern.to_pylibcudf(mode="read"), - replacements.to_pylibcudf(mode="read"), + plc_result = plc.strings.replace.replace_multiple( + self.to_pylibcudf(mode="read"), + pattern.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + ) + return cast(Self, Column.from_pylibcudf(plc_result)) + + @acquire_spill_lock() + def _split_record_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Column, + ], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return cast(Self, Column.from_pylibcudf(plc_column)) + + def split_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + pattern, maxsplit, plc.strings.split.split.split_record_re + ) + + def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self: + return self._split_record_re( + 
pattern, maxsplit, plc.strings.split.split.rsplit_record_re + ) + + @acquire_spill_lock() + def _split_re( + self, + pattern: str, + maxsplit: int, + method: Callable[ + [plc.Column, plc.strings.regex_program.RegexProgram, int], + plc.Table, + ], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + pattern, + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() ) - return cast(Self, Column.from_pylibcudf(plc_result)) + ) + + def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.split_re + ) + + def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: + return self._split_re( + pattern, maxsplit, plc.strings.split.split.rsplit_re + ) + + @acquire_spill_lock() + def _split_record( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> Self: + plc_column = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.split_record + ) + + def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + return self._split_record( + delimiter, maxsplit, plc.strings.split.split.rsplit_record + ) + + @acquire_spill_lock() + def _split( + self, + delimiter: cudf.Scalar, + maxsplit: int, + method: Callable[[plc.Column, plc.Scalar, int], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + maxsplit, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.split) + + def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) + + @acquire_spill_lock() + def _partition( + self, + delimiter: cudf.Scalar, + method: Callable[[plc.Column, plc.Scalar], plc.Column], + ) -> dict[int, Self]: + plc_table = method( + self.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + return dict( + enumerate( + Column.from_pylibcudf(col) # type: ignore[misc] + for col in plc_table.columns() + ) + ) + + def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.partition + ) + + def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + return self._partition( + delimiter, plc.strings.split.partition.rpartition + ) + + @acquire_spill_lock() + def url_decode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_decode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def url_encode(self) -> Self: + plc_column = plc.strings.convert.convert_urls.url_encode( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_integer(self) -> NumericalColumn: + 
plc_column = plc.strings.convert.convert_integers.is_integer( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] + + @acquire_spill_lock() + def is_float(self) -> NumericalColumn: + plc_column = plc.strings.convert.convert_floats.is_float( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 80ee078917a..8be336021b1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -15,9 +15,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) from cudf.api.types import is_integer, is_scalar from cudf.core import column from cudf.core.buffer import acquire_spill_lock @@ -232,7 +229,7 @@ def to_datetime( ) break elif arg_col.dtype.kind == "O": - if not cpp_is_integer(arg_col).all(): + if not arg_col.is_integer().all(): col = new_series._column.strptime( cudf.dtype("datetime64[ns]"), format=format ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 91f23490031..40348461f8c 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -2,14 +2,13 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd import cudf from cudf import _lib as libcudf -from cudf._lib import strings as libstrings from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core._internals import unary from cudf.core.column import as_column @@ -18,10 +17,16 @@ from cudf.utils.dtypes import can_convert_to_column if TYPE_CHECKING: - from cudf.core.column import ColumnBase + from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn -def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): +def to_numeric( + arg, + errors: Literal["raise", "coerce", "ignore"] = "raise", + downcast: Literal["integer", "signed", "unsigned", "float", None] = None, + dtype_backend=None, +): """ Convert argument into numerical types. 
@@ -130,7 +135,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): else: try: col = _convert_str_col( - col._get_decategorized_column(), errors, downcast + col._get_decategorized_column(), # type: ignore[attr-defined] + errors, + downcast, ) except ValueError as e: if errors == "ignore": @@ -139,7 +146,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): raise e elif is_string_dtype(dtype): try: - col = _convert_str_col(col, errors, downcast) + col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: if errors == "ignore": return arg @@ -186,7 +193,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): return col.values -def _convert_str_col(col, errors, _downcast=None): +def _convert_str_col( + col: StringColumn, + errors: Literal["raise", "coerce", "ignore"], + _downcast: Literal["integer", "signed", "unsigned", "float", None] = None, +) -> NumericalColumn: """ Converts a string column to numeric column @@ -212,13 +223,21 @@ def _convert_str_col(col, errors, _downcast=None): if not is_string_dtype(col): raise TypeError("col must be string dtype.") - is_integer = libstrings.is_integer(col) - if is_integer.all(): - return col.astype(dtype=cudf.dtype("i8")) + if col.is_integer().all(): + return col.astype(dtype=cudf.dtype("i8")) # type: ignore[return-value] - col = _proc_inf_empty_strings(col) + # TODO: This can be handled by libcudf in + # future see StringColumn.as_numerical_column + converted_col = ( + col.to_lower() + .find_and_replace(as_column([""]), as_column(["NaN"])) + .replace_multiple( + as_column(["+", "inf", "inity"]), # type: ignore[arg-type] + as_column(["", "Inf", ""]), # type: ignore[arg-type] + ) + ) - is_float = libstrings.is_float(col) + is_float = converted_col.is_float() if is_float.all(): if _downcast in {"unsigned", "signed", "integer"}: warnings.warn( @@ -227,27 +246,14 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." ) ) - return col.astype(dtype=cudf.dtype("float32")) + return converted_col.astype(dtype=cudf.dtype("float32")) # type: ignore[return-value] else: - return col.astype(dtype=cudf.dtype("float64")) + return converted_col.astype(dtype=cudf.dtype("float64")) # type: ignore[return-value] else: if errors == "coerce": - col = libcudf.string_casting.stod(col) + converted_col = libcudf.string_casting.stod(converted_col) non_numerics = is_float.unary_operator("not") - col[non_numerics] = None - return col + converted_col[non_numerics] = None + return converted_col # type: ignore[return-value] else: raise ValueError("Unable to convert some strings to numerics.") - - -def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: - """Handles empty and infinity strings""" - col = col.to_lower() # type: ignore[attr-defined] - col = col.find_and_replace(as_column([""]), as_column(["NaN"])) - # TODO: This can be handled by libcudf in - # future see StringColumn.as_numerical_column - col = col.replace_multiple( # type: ignore[attr-defined] - as_column(["+", "inf", "inity"]), - as_column(["", "Inf", ""]), - ) - return col From 5b412dc14d047959d1a2b70bf27ffea139769f7a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 9 Dec 2024 16:54:59 -0600 Subject: [PATCH 45/78] [JNI] remove rmm argument to set rw access for fabric handles (#17553) This is a follow up from https://github.com/rapidsai/cudf/pull/17526, where fabric handles can be enabled from RMM. 
That PR also sets the memory access protection flag (`cudaMemPoolSetAccess`), but I have learned that this second flag is not needed from the owner device. In fact, it causes confusion because the owning device fails to call this function with some of the flags (access none). `cudaMemPoolSetAccess` is meant to only be called from peer processes that have imported the pool's handle. In our case, UCX handles this from the peer's side and it does not need to be anywhere in RMM or cuDF. Sorry for the noise. I'd like to get this fix in, and then I am going to fix RMM by removing that API. Authors: - Alessandro Bellina (https://github.com/abellina) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17553 --- java/src/main/native/src/RmmJni.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 0f424761bfe..8c733018fa7 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -778,17 +778,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource( try { cudf::jni::auto_set_device(env); - // When we are using fabric, we need to set the memory access to be - // read_write, in order for peer GPUs to have access to this memory. - // Otherwise, choose default parameters (optional set to nullopt). - auto [handle_type, prot_flag] = - fabric - ? std::pair{std::optional{ - rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}, - std::optional{rmm::mr::cuda_async_memory_resource::access_flags::read_write}} - : std::pair{std::nullopt, std::nullopt}; - - auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type, prot_flag); + auto handle_type = + fabric ? 
std::optional{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric} + : std::nullopt; + + auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type); return reinterpret_cast(ret); } From 9df95d1c5fd41b1b87976fd3680a1d06f2d26310 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:55:16 -0800 Subject: [PATCH 46/78] Remove cudf._lib.transform in favor of inlining pylibcudf (#17505) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17505 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/transform.pyx | 113 -------------------- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/column.py | 34 ++++-- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 30 +++++- python/cudf/cudf/core/dataframe.py | 30 +++--- python/cudf/cudf/core/df_protocol.py | 3 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/core/series.py | 3 +- python/cudf/cudf/datasets.py | 3 +- python/cudf/cudf/tests/test_column.py | 9 +- 14 files changed, 85 insertions(+), 165 deletions(-) delete mode 100644 python/cudf/cudf/_lib/transform.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f9ac3a16940..084fc19a61e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -25,7 +25,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - transform.pyx types.pyx utils.pyx ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx deleted file mode 100644 index a163bb07888..00000000000 --- a/python/cudf/cudf/_lib/transform.pyx +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from numba.np import numpy_support - -import cudf -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.utils import cudautils - -from pylibcudf cimport transform as plc_transform -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def bools_to_mask(Column col): - """ - Given an int8 (boolean) column, compress the data from booleans to bits and - return a Buffer - """ - mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) - return as_buffer(mask) - - -@acquire_spill_lock() -def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): - """ - Given a mask buffer, returns a boolean column representng bit 0 -> False - and 1 -> True within range of [begin_bit, end_bit), - """ - if not isinstance(mask_buffer, cudf.core.buffer.Buffer): - raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer.Buffer") - plc_column = plc_transform.mask_to_bools( - mask_buffer.get_ptr(mode="read"), begin_bit, end_bit - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def nans_to_nulls(Column input): - mask, _ = plc_transform.nans_to_nulls( - input.to_pylibcudf(mode="read") - ) - return as_buffer(mask) - - -@acquire_spill_lock() -def transform(Column input, op): - nb_type = numpy_support.from_dtype(input.dtype) - nb_signature = (nb_type,) - compiled_op = cudautils.compile_udf(op, nb_signature) - np_dtype = cudf.dtype(compiled_op[1]) - - plc_column = plc_transform.transform( - input.to_pylibcudf(mode="read"), - compiled_op[0], - plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), - True - ) - return Column.from_pylibcudf(plc_column) - - -def table_encode(list source_columns): - plc_table, plc_column = plc_transform.encode( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) - ) - - return ( - [Column.from_pylibcudf(col) for col in plc_table.columns()], - Column.from_pylibcudf(plc_column) - ) - - -def one_hot_encode(Column input_column, Column categories): - plc_table = plc_transform.one_hot_encode( - input_column.to_pylibcudf(mode="read"), - categories.to_pylibcudf(mode="read"), - ) - result_columns = [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in plc_table.columns() - ] - result_labels = [ - x if x is not None else '' - for x in categories.to_arrow().to_pylist() - ] - return dict(zip(result_labels, result_columns)) - - -@acquire_spill_lock() -def compute_column(list columns, tuple column_names, str expr): - """Compute a new column by evaluating an expression on a set of columns. - - Parameters - ---------- - columns : list - The set of columns forming the table to evaluate the expression on. - column_names : tuple[str] - The names associated with each column. These names are necessary to map - column names in the expression to indices in the provided list of - columns, which are what will be used by libcudf to evaluate the - expression on the table. - expr : str - The expression to evaluate. 
- """ - result = plc_transform.compute_column( - plc.Table([col.to_pylibcudf(mode="read") for col in columns]), - plc.expressions.to_expression(expr, column_names), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c849a9d3d2b..71ec11e75af 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -13,7 +13,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.transform import bools_to_mask from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods @@ -775,12 +774,11 @@ def to_pandas( raise NotImplementedError(f"{arrow_type=} is not implemented.") if self.categories.dtype.kind == "f": - new_mask = bools_to_mask(self.notnull()) col = type(self)( data=self.data, # type: ignore[arg-type] size=self.size, dtype=self.dtype, - mask=new_mask, + mask=self.notnull().fillna(False).as_mask(), children=self.children, ) else: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1ddc79e8970..b317858077f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -32,7 +32,6 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -373,10 +372,14 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) + @acquire_spill_lock() def _get_mask_as_column(self) -> ColumnBase: - return libcudf.transform.mask_to_bools( - self.base_mask, self.offset, self.offset + len(self) + plc_column = plc.transform.mask_to_bools( + self.base_mask.get_ptr(mode="read"), # type: ignore[union-attr] + self.offset, + self.offset + len(self), ) + return type(self).from_pylibcudf(plc_column) @cached_property def memory_usage(self) -> int: @@ -981,11 +984,14 @@ def as_mask(self) -> Buffer: ------- Buffer """ - if self.has_nulls(): raise ValueError("Column must have no nulls.") - return bools_to_mask(self) + with acquire_spill_lock(): + mask, _ = plc.transform.bools_to_mask( + self.to_pylibcudf(mode="read") + ) + return as_buffer(mask) @property def is_unique(self) -> bool: @@ -1514,6 +1520,18 @@ def _return_sentinel_column(): ) return codes.fillna(na_sentinel.value) + def one_hot_encode( + self, categories: ColumnBase + ) -> abc.Generator[ColumnBase]: + plc_table = plc.transform.one_hot_encode( + self.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), + ) + return ( + type(self).from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" @@ -2093,8 +2111,7 @@ def as_column( ) # Consider NaT as NA in the mask # but maintain NaT as a value - bool_mask = as_column(~is_nat) - mask = as_buffer(bools_to_mask(bool_mask)) + mask = as_column(~is_nat).as_mask() buffer = as_buffer(arbitrary.view("|u1")) col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) if dtype: @@ -2264,8 +2281,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: ) return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) elif typecode == "b": - col = as_column(cai_mask) - return bools_to_mask(col) + return as_column(cai_mask).as_mask() else: raise 
NotImplementedError(f"Cannot infer mask from typestr {typestr}") diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b95fb0a0d39..ba98e28f6a2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -255,7 +255,7 @@ def from_sequences( data=None, size=len(arbitrary), dtype=cudf.ListDtype(data_col.dtype), - mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), + mask=as_column(mask_col).as_mask(), offset=0, null_count=0, children=(offset_col, data_col), diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9514aaeab50..790cd6ea9bb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -7,9 +7,10 @@ import numpy as np import pandas as pd +from numba.np import numpy_support from typing_extensions import Self -import pylibcudf +import pylibcudf as plc import cudf import cudf.core.column.column as column @@ -17,11 +18,13 @@ from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop, unary +from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn from cudf.core.dtypes import CategoricalDtype from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError +from cudf.utils import cudautils from cudf.utils.dtypes import ( find_common_type, min_column_type, @@ -179,13 +182,27 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) + @acquire_spill_lock() + def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase: + plc_column = plc.transform.transform( + self.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True, + ) + return type(self).from_pylibcudf(plc_column) + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): - return libcudf.transform.transform(self, unaryop) + nb_type = numpy_support.from_dtype(self.dtype) + nb_signature = (nb_type,) + compiled_op = cudautils.compile_udf(unaryop, nb_signature) + np_dtype = np.dtype(compiled_op[1]) + return self.transform(compiled_op, np_dtype) unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = pylibcudf.unary.UnaryOperator[unaryop] + unaryop = plc.unary.UnaryOperator[unaryop] return unary.unary_operation(self, unaryop) def __invert__(self): @@ -298,8 +315,11 @@ def nans_to_nulls(self: Self) -> Self: # Only floats can contain nan. 
if self.dtype.kind != "f" or self.nan_count == 0: return self - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) + with acquire_spill_lock(): + mask, _ = plc.transform.nans_to_nulls( + self.to_pylibcudf(mode="read") + ) + return self.set_mask(as_buffer(mask)) def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar: if isinstance(other, ColumnBase): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 656274bca38..325601e5311 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6772,9 +6772,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): ) result = column.as_column(result, dtype=result_dtype) if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) + result = result.set_mask(mask._column.as_mask()) return Series._from_column(result, index=self.index) else: result_df = DataFrame(result, index=self.index) @@ -7883,6 +7881,16 @@ def interleave_columns(self): ) return self._constructor_sliced._from_column(result_col) + @acquire_spill_lock() + def _compute_columns(self, expr: str) -> ColumnBase: + plc_column = plc.transform.compute_column( + plc.Table( + [col.to_pylibcudf(mode="read") for col in self._columns] + ), + plc.expressions.to_expression(expr, self._column_names), + ) + return libcudf.column.Column.from_pylibcudf(plc_column) + @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): """Evaluate a string describing operations on DataFrame columns. @@ -8010,11 +8018,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): raise ValueError( "Cannot operate inplace if there is no assignment" ) - return Series._from_column( - libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - ) + return Series._from_column(self._compute_columns(statements[0])) targets = [] exprs = [] @@ -8030,15 +8034,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): targets.append(t.strip()) exprs.append(e.strip()) - cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) - for e in exprs - ) ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, cols): - ret._data[name] = col + for name, expr in zip(targets, exprs): + ret._data[name] = self._compute_columns(expr) if not inplace: return ret diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index aa601a2b322..a798041699e 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -799,8 +799,7 @@ def _set_missing_values( valid_mask = _ensure_gpu_buffer( valid_mask[0], valid_mask[1], allow_copy ) - boolmask = as_column(valid_mask._buf, dtype="bool") - bitmask = cudf._lib.transform.bools_to_mask(boolmask) + bitmask = as_column(valid_mask._buf, dtype="bool").as_mask() return cudf_col.set_mask(bitmask) elif null == _MaskKind.BITMASK: valid_mask = _ensure_gpu_buffer( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 0a7e6fefe6e..84a3caf905f 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1457,7 +1457,14 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode(list(self._columns)) + plc_table, plc_column = plc.transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in self._columns]) + ) + columns = [ + 
libcudf.column.Column.from_pylibcudf(col) + for col in plc_table.columns() + ] + indices = libcudf.column.Column.from_pylibcudf(plc_column) keys = self._from_columns_like_self(columns) return keys, indices diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 21ac009e7ff..95f3d4d01d5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3507,7 +3507,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + col.set_base_mask(ans_mask.as_mask()) result = cudf.Series._from_column(col, index=self.index) return result diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 84c653c5b3f..59a3e9dbf3b 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,6 @@ import cudf from cudf._lib.column import Column -from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar @@ -1338,7 +1337,11 @@ def _one_hot_encode_column( f"np.iinfo({size_type_dtype}).max. Consider reducing " "size of category" ) - data = one_hot_encode(column, categories) + result_labels = ( + x if x is not None else "" + for x in categories.to_arrow().to_pylist() + ) + data = dict(zip(result_labels, column.one_hot_encode(categories))) if drop_first and len(data): data.pop(next(iter(data))) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 58cefc6554e..be74b0f867a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -17,7 +17,6 @@ from typing_extensions import Self, assert_never import cudf -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -526,7 +525,7 @@ def from_categorical(cls, categorical, codes=None): mask = None if not valid_codes.all(): - mask = libcudf.transform.bools_to_mask(valid_codes) + mask = valid_codes.as_mask() col = CategoricalColumn( data=col.data, size=codes.size, diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index e8d634598f4..a91a4951306 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -4,7 +4,6 @@ import pandas as pd import cudf -from cudf._lib.transform import bools_to_mask __all__ = ["randomdata", "timeseries"] @@ -70,7 +69,7 @@ def timeseries( size=len(index), p=[1 - nulls_frequency, nulls_frequency], ) - mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + mask_buf = cudf.core.column.as_column(mask).as_mask() masked_col = gdf[col]._column.set_mask(mask_buf) gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 65947efc2df..c3c9a1c5338 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -7,7 +7,6 @@ import pytest import cudf -from cudf._lib.transform import mask_to_bools from cudf.core.column.column import as_column from cudf.testing import assert_eq from cudf.testing._utils import assert_exceptions_equal @@ -489,9 +488,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data["a"]] - got_mask = mask_to_bools( - gd_data["a"]._column.base_mask, 0, len(gd_data) - ).values_host + 
got_mask = gd_data["a"]._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) @@ -527,9 +524,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): # check mask expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host + got_mask = gd_data._column._get_mask_as_column().values_host np.testing.assert_array_equal(expect_mask, got_mask) From ebad043967e8bb6a2a56ecfcb0b0612ea2894fa2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 9 Dec 2024 18:37:41 -0500 Subject: [PATCH 47/78] Remove unused `BufferArrayFromVector` (#17549) Follow up to #17506. This PR removes an unused buffer class. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17549 --- python/cudf/cudf/_lib/parquet.pyx | 46 +------------------------------ 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 1b4c18d13a7..00c434ae374 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -20,11 +20,8 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from libc.stdint cimport int64_t, uint8_t +from libc.stdint cimport int64_t from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader @@ -47,47 +44,6 @@ from pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata from pylibcudf.io.parquet cimport ParquetChunkedWriter -from cython.operator cimport dereference - - -cdef class BufferArrayFromVector: - cdef Py_ssize_t length - cdef unique_ptr[vector[uint8_t]] in_vec - - # these two things declare part of the buffer interface - cdef Py_ssize_t shape[1] - cdef Py_ssize_t strides[1] - - @staticmethod - cdef BufferArrayFromVector from_unique_ptr( - unique_ptr[vector[uint8_t]] in_vec - ): - cdef BufferArrayFromVector buf = BufferArrayFromVector() - buf.in_vec = move(in_vec) - buf.length = dereference(buf.in_vec).size() - return buf - - def __getbuffer__(self, Py_buffer *buffer, int flags): - cdef Py_ssize_t itemsize = sizeof(uint8_t) - - self.shape[0] = self.length - self.strides[0] = 1 - - buffer.buf = dereference(self.in_vec).data() - - buffer.format = NULL # byte - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.length * itemsize # product(shape) * itemsize - buffer.ndim = 1 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - - def __releasebuffer__(self, Py_buffer *buffer): - pass def _parse_metadata(meta): From 47643959aaa7331523d79178bf37ea5106a01c05 Mon Sep 17 00:00:00 2001 From: Hirota Akio <33370421+a-hirota@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:13:02 +0900 Subject: [PATCH 48/78] Enable rounding for Decimal32 and Decimal64 in cuDF (#17332) Authors: - Hirota Akio (https://github.com/a-hirota) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: 
https://github.com/rapidsai/cudf/pull/17332
---
 python/cudf/cudf/core/indexed_frame.py |  8 +++-
 python/cudf/cudf/tests/test_series.py  | 63 ++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 95f3d4d01d5..0e6a5e03ea6 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3970,7 +3970,13 @@ def round(self, decimals=0, how="half_even"):

         cols = (
             col.round(decimals[name], how=how)
-            if name in decimals and col.dtype.kind in "fiu"
+            if name in decimals
+            and (
+                col.dtype.kind in "fiu"
+                or isinstance(
+                    col.dtype, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)
+                )
+            )
             else col.copy(deep=True)
             for name, col in self._column_labels_and_values
         )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 99bd9adb034..f8697c5c6b8 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -772,6 +772,69 @@ def test_round_nan_as_null_false(series, decimal):
     assert_eq(result, expected, atol=1e-10)


+@pytest.mark.parametrize(
+    "data, dtype, decimals, expected_half_up, expected_half_even",
+    [
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            2,
+            [1.23, 2.35, 3.46],
+            [1.23, 2.34, 3.46],
+        ),
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            0,
+            [1.0, 2.0, 3.0],
+            [1.0, 2.0, 3.0],
+        ),
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            3,
+            [1.234, 2.345, 3.456],
+            [1.234, 2.345, 3.456],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            4,
+            [1.2346, 2.3457, 3.4568],
+            [1.2346, 2.3457, 3.4568],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            2,
+            [1.23, 2.35, 3.46],
+            [1.23, 2.35, 3.46],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            6,
+            [1.234567, 2.345678, 3.456789],
+            [1.234567, 2.345678, 3.456789],
+        ),
+    ],
+)
+def test_series_round_decimal(
+    data, dtype, decimals, expected_half_up, expected_half_even
+):
+    ser = cudf.Series(data).astype(dtype)
+
+    result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype)
+    expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype)
+    assert_eq(result_half_up, expected_ser_half_up)
+
+    result_half_even = ser.round(decimals=decimals, how="half_even").astype(
+        dtype
+    )
+    expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype)
+    assert_eq(result_half_even, expected_ser_half_even)
+
+
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):

From c53ace8f381af7c9e9dce161dcc756d07f8f147c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 9 Dec 2024 18:35:11 -0800
Subject: [PATCH 49/78] Fix CMake format in cudf/_lib/CMakeLists.txt (#17559)

Two of my cudf._lib refactoring PRs landed and affected the formatting of
`cudf/_lib/CMakeLists.txt`; this fixes the CMake format.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17559
---
 python/cudf/cudf/_lib/CMakeLists.txt | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 084fc19a61e..efe96ff6c3e 
100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,20 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx - copying.pyx - csv.pyx - groupby.pyx - interop.pyx - parquet.pyx - reduce.pyx - scalar.pyx - sort.pyx - stream_compaction.pyx - string_casting.pyx - strings_udf.pyx - types.pyx - utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx + sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) From e16b3a3c499bda40082c1990f94ef0aa3bb23b35 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 10 Dec 2024 01:16:54 -0600 Subject: [PATCH 50/78] Remove Thrust patch in favor of CMake definition for Thrust 32-bit offset types. (#17527) Follow-up for #17523 to use `target_compile_definitions` and drop the Thrust patch. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17527 --- cpp/CMakeLists.txt | 4 ++++ .../thirdparty/patches/cccl_override.json | 5 ----- .../thrust_disable_64bit_dispatching.diff | 22 ------------------- 3 files changed, 4 insertions(+), 27 deletions(-) delete mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12e6826f301..e54c71de4fa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -928,6 +928,10 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 2f29578f7ae..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,11 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 9f68d85e7db..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,22 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 3d004aa55..71ce86bea 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -63,7 +63,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2) - --#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE) -+#if 0 - //! 
@brief Always dispatches to 64 bit offset version of an algorithm - # define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ -@@ -89,7 +89,7 @@ - _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count) \ - _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments) - --#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE) -+#elif 1 - - //! @brief Ensures that the size of the input does not overflow the offset type - # define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count) \ From 13e983eafecad5a3d4053157febd714e40a410c3 Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Tue, 10 Dec 2024 09:15:37 -0600 Subject: [PATCH 51/78] gate telemetry dispatch calls on TELEMETRY_ENABLED env var (#17551) Because of the switch away from certificates/mTLS, we are having to rework a few things. In the meantime, telemetry jobs are failing. This PR adds a switch to turn all of the telemetry stuff off - to skip it instead. It is meant to be controlled by an org-wide environment variable, which can be applied to individual repos by ops. At the time of submitting this PR, the environment variable is 'false' and no telemetry is being reported. Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17551 --- .github/workflows/pr.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..49ca5ca0fb9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,6 +52,7 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit @@ -329,7 +330,7 @@ jobs: telemetry-summarize: runs-on: ubuntu-latest needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - name: Load stashed telemetry env vars From 3468e9259960b4f16cd849e8497be4f5bee0839b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Dec 2024 12:32:10 -0500 Subject: [PATCH 52/78] Replace cudf::detail::copy_if logic with thrust::copy_if and gather (#17520) Replaces the custom kernels for `cudf::detail::copy_if` with a call to `thrust::copy_if` to build indices to call `cudf::detail::gather`. This is easier to maintain and faster for some cases but slower in others. 
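For illustration, the replacement boils down to the standard two-step select-then-gather idiom. Below is a minimal, self-contained Thrust sketch of that idiom, with a plain `device_vector` and a hypothetical `is_even` predicate standing in for libcudf's `Filter` functor; it is a sketch of the pattern, not the library code itself:

```cpp
// Compile with: nvcc -o select_gather select_gather.cu
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sequence.h>

#include <cstdio>

// Hypothetical stand-in for the Filter functor: keep rows at even positions.
struct is_even {
  __host__ __device__ bool operator()(int i) const { return i % 2 == 0; }
};

int main()
{
  int const num_rows = 8;
  thrust::device_vector<float> input(num_rows);
  thrust::sequence(input.begin(), input.end());  // 0, 1, 2, ..., 7

  // Step 1: copy_if over a counting iterator materializes the passing row indices.
  thrust::device_vector<int> indices(num_rows);
  auto const begin = thrust::counting_iterator<int>(0);
  auto const indices_end =
    thrust::copy_if(begin, begin + num_rows, indices.begin(), is_even{});
  auto const output_size = static_cast<int>(indices_end - indices.begin());

  // Step 2: gather the selected rows through those indices.
  thrust::device_vector<float> output(output_size);
  thrust::gather(indices.begin(), indices_end, input.begin(), output.begin());

  for (int i = 0; i < output_size; ++i) {
    std::printf("%g\n", static_cast<float>(output[i]));  // prints 0 2 4 6
  }
  return 0;
}
```

In the patch itself, step 2 goes through `cudf::detail::gather` rather than raw `thrust::gather`, since the former also rebuilds null masks and handles non-fixed-width columns, which is what makes the per-type scatter kernels deleted below unnecessary.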
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17520 --- .../stream_compaction/apply_boolean_mask.cpp | 8 +- cpp/include/cudf/detail/copy_if.cuh | 363 +----------------- cpp/src/dictionary/remove_keys.cu | 1 + 3 files changed, 23 insertions(+), 349 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. -// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. 
-// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. 
- - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). 
- CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
<table> copy_if(table_view const& input, @@ -333,76 +60,22 @@ std::unique_ptr<table>
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
<table>(input, stream, mr); - } else if (output_size > 0) { - std::vector<std::unique_ptr<column>> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor<Filter, block_size>{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
<table>(input, stream, mr); } - return std::make_unique<table>
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 59c8453cf33..4715931a7a9 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include From 2f5bf7659e40cd27bb35f10785e233aad5481bbd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Dec 2024 09:37:46 -0800 Subject: [PATCH 53/78] Simplify serialization protocols (#17552) This rewrites all serialization protocols in cudf to remove the need for pickling intermediates. --- python/cudf/cudf/_lib/copying.pyx | 11 +-- python/cudf/cudf/core/_base_index.py | 8 -- python/cudf/cudf/core/abc.py | 16 ++-- python/cudf/cudf/core/buffer/buffer.py | 8 +- .../cudf/cudf/core/buffer/spillable_buffer.py | 4 +- python/cudf/cudf/core/column/column.py | 23 +++--- python/cudf/cudf/core/dataframe.py | 9 +- python/cudf/cudf/core/dtypes.py | 77 +++++++----------- python/cudf/cudf/core/frame.py | 73 +++++++++++++---- python/cudf/cudf/core/groupby/groupby.py | 13 ++- python/cudf/cudf/core/index.py | 13 +-- python/cudf/cudf/core/multiindex.py | 7 +- python/cudf/cudf/core/resample.py | 12 +-- python/cudf/cudf/core/series.py | 9 +- .../stringColumnWithRangeIndex_cudf_23.12.pkl | Bin 1394 -> 1108 bytes python/cudf/cudf/tests/test_serialize.py | 19 ++++- python/cudf/cudf/tests/test_struct.py | 2 +- .../dask_cudf/tests/test_distributed.py | 16 +++- 18 files changed, 179 insertions(+), 141 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4dfb12d8ab3..c478cd1a990 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-import pickle - from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -367,14 +365,13 @@ class PackedColumns(Serializable): header["index-names"] = self.index_names header["metadata"] = self._metadata.tobytes() for name, dtype in self.column_dtypes.items(): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() self.column_dtypes[name] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) header["column-dtypes"] = self.column_dtypes - header["type-serialized"] = pickle.dumps(type(self)) return header, frames @classmethod @@ -382,9 +379,9 @@ class PackedColumns(Serializable): column_dtypes = {} for name, dtype in header["column-dtypes"].items(): dtype_header, (start, stop) = dtype - column_dtypes[name] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop]) + column_dtypes[name] = Serializable.device_deserialize( + dtype_header, frames[start:stop] + ) return cls( plc.contiguous_split.pack( plc.contiguous_split.unpack_from_memoryviews( diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a6abd63d042..950ce5f1236 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2,7 +2,6 @@ from __future__ import annotations -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any, Literal @@ -330,13 +329,6 @@ def get_level_values(self, level): else: raise KeyError(f"Requested level with name {level} " "not found") - @classmethod - def deserialize(cls, header, frames): - # Dispatch deserialization to the appropriate index type in case - # deserialization is ever attempted with the base class directly. - idx_type = pickle.loads(header["type-serialized"]) - return idx_type.deserialize(header, frames) - @property def names(self): """ diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index ce6bb83bc77..c8ea03b04fe 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,8 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import pickle - import numpy import cudf @@ -22,6 +20,14 @@ class Serializable: latter converts back from that representation into an equivalent object. """ + # A mapping from class names to the classes themselves. This is used to + # reconstruct the correct class when deserializing an object. + _name_type_map: dict = {} + + def __init_subclass__(cls, /, **kwargs): + super().__init_subclass__(**kwargs) + cls._name_type_map[cls.__name__] = cls + def serialize(self): """Generate an equivalent serializable representation of an object. 
@@ -98,7 +104,7 @@ def device_serialize(self): ) for f in frames ) - header["type-serialized"] = pickle.dumps(type(self)) + header["type-serialized-name"] = type(self).__name__ header["is-cuda"] = [ hasattr(f, "__cuda_array_interface__") for f in frames ] @@ -128,10 +134,10 @@ def device_deserialize(cls, header, frames): :meta private: """ - typ = pickle.loads(header["type-serialized"]) + typ = cls._name_type_map[header["type-serialized-name"]] frames = [ cudf.core.buffer.as_buffer(f) if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) + for c, f in zip(header["is-cuda"], frames, strict=True) ] return typ.deserialize(header, frames) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index ffa306bf93f..625938ca168 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import math -import pickle import weakref from types import SimpleNamespace from typing import TYPE_CHECKING, Any, Literal @@ -432,8 +431,7 @@ def serialize(self) -> tuple[dict, list]: second element is a list containing single frame. """ header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 frames = [self] return header, frames @@ -460,7 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self: if isinstance(frame, cls): return frame # The frame is already deserialized - owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) + owner_type: BufferOwner = Serializable._name_type_map[ + header["owner-type-serialized-name"] + ] if hasattr(frame, "__cuda_array_interface__"): owner = owner_type.from_device_memory(frame, exposed=False) else: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b40c56c9a6b..66f8be4ddc5 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -3,7 +3,6 @@ from __future__ import annotations import collections.abc -import pickle import time import weakref from threading import RLock @@ -415,8 +414,7 @@ def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} frames: list[Buffer | memoryview] with self._owner.lock: - header["type-serialized"] = pickle.dumps(self.__class__) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) + header["owner-type-serialized-name"] = type(self._owner).__name__ header["frame_count"] = 1 if self.is_spilled: frames = [self.memoryview()] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index f6eaea4b783..4b1e9c1129e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import pickle from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -1224,28 +1223,27 @@ def serialize(self) -> tuple[dict, list]: header: dict[Any, Any] = {} frames = [] - header["type-serialized"] = pickle.dumps(type(self)) try: - dtype, dtype_frames = self.dtype.serialize() + dtype, dtype_frames = self.dtype.device_serialize() header["dtype"] = dtype frames.extend(dtype_frames) header["dtype-is-cudf-serialized"] = True except AttributeError: - header["dtype"] = pickle.dumps(self.dtype) + 
header["dtype"] = self.dtype.str header["dtype-is-cudf-serialized"] = False if self.data is not None: - data_header, data_frames = self.data.serialize() + data_header, data_frames = self.data.device_serialize() header["data"] = data_header frames.extend(data_frames) if self.mask is not None: - mask_header, mask_frames = self.mask.serialize() + mask_header, mask_frames = self.mask.device_serialize() header["mask"] = mask_header frames.extend(mask_frames) if self.children: child_headers, child_frames = zip( - *(c.serialize() for c in self.children) + *(c.device_serialize() for c in self.children) ) header["subheaders"] = list(child_headers) frames.extend(chain(*child_frames)) @@ -1257,8 +1255,7 @@ def serialize(self) -> tuple[dict, list]: def deserialize(cls, header: dict, frames: list) -> ColumnBase: def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] - klass = pickle.loads(header["type-serialized"]) - obj = klass.deserialize(header, frames[:count]) + obj = cls.device_deserialize(header, frames[:count]) return obj, frames[count:] assert header["frame_count"] == len(frames), ( @@ -1268,7 +1265,7 @@ def unpack(header, frames) -> tuple[Any, list]: if header["dtype-is-cudf-serialized"]: dtype, frames = unpack(header["dtype"], frames) else: - dtype = pickle.loads(header["dtype"]) + dtype = np.dtype(header["dtype"]) if "data" in header: data, frames = unpack(header["data"], frames) else: @@ -2219,7 +2216,9 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: frames = [] if len(columns) > 0: - header_columns = [c.serialize() for c in columns] + header_columns: list[tuple[dict, list]] = [ + c.device_serialize() for c in columns + ] headers, column_frames = zip(*header_columns) for f in column_frames: frames.extend(f) @@ -2236,7 +2235,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: for meta in headers: col_frame_count = meta["frame_count"] - col_typ = pickle.loads(meta["type-serialized"]) + col_typ = Serializable._name_type_map[meta["type-serialized-name"]] colobj = col_typ.deserialize(meta, frames[:col_frame_count]) columns.append(colobj) # Advance frames diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bd78d5dd9f1..fd68a40324e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,7 +7,6 @@ import itertools import numbers import os -import pickle import re import sys import textwrap @@ -44,7 +43,6 @@ ) from cudf.core import column, df_protocol, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( CategoricalColumn, @@ -582,7 +580,7 @@ class _DataFrameiAtIndexer(_DataFrameIlocIndexer): pass -class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): +class DataFrame(IndexedFrame, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -1184,7 +1182,7 @@ def _constructor_expanddim(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. 
@@ -1199,8 +1197,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 2110e610c37..8765a27a165 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -3,7 +3,6 @@ import decimal import operator -import pickle import textwrap import warnings from functools import cached_property @@ -91,13 +90,13 @@ def dtype(arbitrary): raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype") -def _decode_type( +def _check_type( cls: type, header: dict, frames: list, is_valid_class: Callable[[type, type], bool] = operator.is_, -) -> tuple[dict, list, type]: - """Decode metadata-encoded type and check validity +) -> None: + """Perform metadata-encoded type and check validity Parameters ---------- @@ -112,12 +111,6 @@ class performing deserialization serialization by `cls` (default is to check type equality), called as `is_valid_class(decoded_class, cls)`. - Returns - ------- - tuple - Tuple of validated headers, frames, and the decoded class - constructor. - Raises ------ AssertionError @@ -128,11 +121,10 @@ class performing deserialization f"Deserialization expected {header['frame_count']} frames, " f"but received {len(frames)}." ) - klass = pickle.loads(header["type-serialized"]) assert is_valid_class( - klass, cls + klass := Serializable._name_type_map[header["type-serialized-name"]], + cls, ), f"Header-encoded {klass=} does not match decoding {cls=}." - return header, frames, klass class _BaseDtype(ExtensionDtype, Serializable): @@ -305,13 +297,14 @@ def construct_from_string(self): def serialize(self): header = {} - header["type-serialized"] = pickle.dumps(type(self)) header["ordered"] = self.ordered frames = [] if self.categories is not None: - categories_header, categories_frames = self.categories.serialize() + categories_header, categories_frames = ( + self.categories.device_serialize() + ) header["categories"] = categories_header frames.extend(categories_frames) header["frame_count"] = len(frames) @@ -319,15 +312,14 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) ordered = header["ordered"] categories_header = header["categories"] categories_frames = frames - categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( + categories = Serializable.device_deserialize( categories_header, categories_frames ) - return klass(categories=categories, ordered=ordered) + return cls(categories=categories, ordered=ordered) def __repr__(self): return self.to_pandas().__repr__() @@ -495,12 +487,13 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Dtype] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames = [] if isinstance(self.element_type, _BaseDtype): - header["element-type"], frames = self.element_type.serialize() + header["element-type"], frames = ( + self.element_type.device_serialize() + ) else: header["element-type"] = getattr( self.element_type, "name", self.element_type @@ -510,14 +503,14 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: 
list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) if isinstance(header["element-type"], dict): - element_type = pickle.loads( - header["element-type"]["type-serialized"] - ).deserialize(header["element-type"], frames) + element_type = Serializable.device_deserialize( + header["element-type"], frames + ) else: element_type = header["element-type"] - return klass(element_type=element_type) + return cls(element_type=element_type) @cached_property def itemsize(self): @@ -641,7 +634,6 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) frames: list[Buffer] = [] @@ -649,33 +641,31 @@ def serialize(self) -> tuple[dict, list]: for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): - dtype_header, dtype_frames = dtype.serialize() + dtype_header, dtype_frames = dtype.device_serialize() fields[k] = ( dtype_header, (len(frames), len(frames) + len(dtype_frames)), ) frames.extend(dtype_frames) else: - fields[k] = pickle.dumps(dtype) + fields[k] = dtype.str header["fields"] = fields header["frame_count"] = len(frames) return header, frames @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) + _check_type(cls, header, frames) fields = {} for k, dtype in header["fields"].items(): if isinstance(dtype, tuple): dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( + fields[k] = Serializable.device_deserialize( dtype_header, frames[start:stop], ) else: - fields[k] = pickle.loads(dtype) + fields[k] = np.dtype(dtype) return cls(fields) @cached_property @@ -838,7 +828,6 @@ def _from_decimal(cls, decimal): def serialize(self) -> tuple[dict, list]: return ( { - "type-serialized": pickle.dumps(type(self)), "precision": self.precision, "scale": self.scale, "frame_count": 0, @@ -848,11 +837,8 @@ def serialize(self) -> tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type( - cls, header, frames, is_valid_class=issubclass - ) - klass = pickle.loads(header["type-serialized"]) - return klass(header["precision"], header["scale"]) + _check_type(cls, header, frames, is_valid_class=issubclass) + return cls(header["precision"], header["scale"]) def __eq__(self, other: Dtype) -> bool: if other is self: @@ -960,18 +946,17 @@ def __hash__(self): def serialize(self) -> tuple[dict, list]: header = { - "type-serialized": pickle.dumps(type(self)), - "fields": pickle.dumps((self.subtype, self.closed)), + "fields": (self.subtype.str, self.closed), "frame_count": 0, } return header, [] @classmethod def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - klass = pickle.loads(header["type-serialized"]) - subtype, closed = pickle.loads(header["fields"]) - return klass(subtype, closed=closed) + _check_type(cls, header, frames) + subtype, closed = header["fields"] + subtype = np.dtype(subtype) + return cls(subtype, closed=closed) def _is_categorical_dtype(obj): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 30868924bcd..f7af374ca8d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections import abc from typing import TYPE_CHECKING, Any, Literal @@ -22,6 +21,7 
@@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -45,7 +45,7 @@ # TODO: It looks like Frame is missing a declaration of `copy`, need to add -class Frame(BinaryOperand, Scannable): +class Frame(BinaryOperand, Scannable, Serializable): """A collection of Column objects with an optional index. Parameters @@ -95,37 +95,80 @@ def ndim(self) -> int: @_performance_tracking def serialize(self): # TODO: See if self._data can be serialized outright + frames = [] header = { - "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(self._column_names), - "column_rangeindex": pickle.dumps(self._data.rangeindex), - "column_multiindex": pickle.dumps(self._data.multiindex), - "column_label_dtype": pickle.dumps(self._data.label_dtype), - "column_level_names": pickle.dumps(self._data._level_names), + "column_label_dtype": None, + "dtype-is-cudf-serialized": False, } - header["columns"], frames = serialize_columns(self._columns) + if (label_dtype := self._data.label_dtype) is not None: + try: + header["column_label_dtype"], frames = ( + label_dtype.device_serialize() + ) + header["dtype-is-cudf-serialized"] = True + except AttributeError: + header["column_label_dtype"] = label_dtype.str + + header["columns"], column_frames = serialize_columns(self._columns) + column_names, column_names_numpy_type = ( + zip( + *[ + (cname.item(), type(cname).__name__) + if isinstance(cname, np.generic) + else (cname, "") + for cname in self._column_names + ] + ) + if self._column_names + else ((), ()) + ) + header |= { + "column_names": column_names, + "column_names_numpy_type": column_names_numpy_type, + "column_rangeindex": self._data.rangeindex, + "column_multiindex": self._data.multiindex, + "column_level_names": self._data._level_names, + } + frames.extend(column_frames) + return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): - cls_deserialize = pickle.loads(header["type-serialized"]) - column_names = pickle.loads(header["column_names"]) - columns = deserialize_columns(header["columns"], frames) kwargs = {} + dtype_header = header["column_label_dtype"] + if header["dtype-is-cudf-serialized"]: + count = dtype_header["frame_count"] + kwargs["label_dtype"] = cls.device_deserialize( + header, frames[:count] + ) + frames = frames[count:] + else: + kwargs["label_dtype"] = ( + np.dtype(dtype_header) if dtype_header is not None else None + ) + + columns = deserialize_columns(header["columns"], frames) for metadata in [ "rangeindex", "multiindex", - "label_dtype", "level_names", ]: key = f"column_{metadata}" if key in header: - kwargs[metadata] = pickle.loads(header[key]) + kwargs[metadata] = header[key] + + column_names = [ + getattr(np, cntype)(cname) if cntype != "" else cname + for cname, cntype in zip( + header["column_names"], header["column_names_numpy_type"] + ) + ] col_accessor = ColumnAccessor( data=dict(zip(column_names, columns)), **kwargs ) - return cls_deserialize._from_data(col_accessor) + return cls._from_data(col_accessor) @classmethod @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e59b948aba9..a7ced1b833a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3,7 +3,6 @@ import copy import itertools -import 
pickle import textwrap import warnings from collections import abc @@ -1265,7 +1264,7 @@ def serialize(self): obj_header, obj_frames = self.obj.serialize() header["obj"] = obj_header - header["obj_type"] = pickle.dumps(type(self.obj)) + header["obj_type_name"] = type(self.obj).__name__ header["num_obj_frames"] = len(obj_frames) frames.extend(obj_frames) @@ -1280,7 +1279,7 @@ def serialize(self): def deserialize(cls, header, frames): kwargs = header["kwargs"] - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) @@ -3304,8 +3303,8 @@ def _handle_misc(self, by): def serialize(self): header = {} frames = [] - header["names"] = pickle.dumps(self.names) - header["_named_columns"] = pickle.dumps(self._named_columns) + header["names"] = self.names + header["_named_columns"] = self._named_columns column_header, column_frames = cudf.core.column.serialize_columns( self._key_columns ) @@ -3315,8 +3314,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b90e9f9df0..244bd877c1a 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3,7 +3,6 @@ from __future__ import annotations import operator -import pickle import warnings from collections.abc import Hashable, MutableMapping from functools import cache, cached_property @@ -495,9 +494,8 @@ def serialize(self): header["index_column"]["step"] = self.step frames = [] - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) + header["name"] = self.name + header["dtype"] = self.dtype.str header["frame_count"] = 0 return header, frames @@ -505,11 +503,14 @@ def serialize(self): @_performance_tracking def deserialize(cls, header, frames): h = header["index_column"] - name = pickle.loads(header["name"]) + name = header["name"] start = h["start"] stop = h["stop"] step = h.get("step", 1) - return RangeIndex(start=start, stop=stop, step=step, name=name) + dtype = np.dtype(header["dtype"]) + return RangeIndex( + start=start, stop=stop, step=step, dtype=dtype, name=name + ) @property # type: ignore @_performance_tracking diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index bfff62f0a89..a878b072860 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,6 @@ import itertools import numbers import operator -import pickle import warnings from functools import cached_property from typing import TYPE_CHECKING, Any @@ -918,15 +917,15 @@ def take(self, indices) -> Self: def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. - header["column_names"] = pickle.dumps(self.names) + header["column_names"] = self.names return header, frames @classmethod @_performance_tracking def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. 
- column_names = pickle.loads(header["column_names"]) - header["column_names"] = pickle.dumps(range(0, len(column_names))) + column_names = header["column_names"] + header["column_names"] = range(0, len(column_names)) obj = super().deserialize(header, frames) return obj._set_names(column_names) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index d95d252559f..391ee31f125 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -15,7 +15,6 @@ # limitations under the License. from __future__ import annotations -import pickle import warnings from typing import TYPE_CHECKING @@ -26,6 +25,7 @@ import cudf from cudf._lib.column import Column +from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.groupby.groupby import ( DataFrameGroupBy, @@ -97,21 +97,21 @@ def serialize(self): header, frames = super().serialize() grouping_head, grouping_frames = self.grouping.serialize() header["grouping"] = grouping_head - header["resampler_type"] = pickle.dumps(type(self)) + header["resampler_type"] = type(self).__name__ header["grouping_frames_count"] = len(grouping_frames) frames.extend(grouping_frames) return header, frames @classmethod def deserialize(cls, header, frames): - obj_type = pickle.loads(header["obj_type"]) + obj_type = Serializable._name_type_map[header["obj_type_name"]] obj = obj_type.deserialize( header["obj"], frames[: header["num_obj_frames"]] ) grouping = _ResampleGrouping.deserialize( header["grouping"], frames[header["num_obj_frames"] :] ) - resampler_cls = pickle.loads(header["resampler_type"]) + resampler_cls = Serializable._name_type_map[header["resampler_type"]] out = resampler_cls.__new__(resampler_cls) out.grouping = grouping super().__init__(out, obj, by=grouping) @@ -163,8 +163,8 @@ def serialize(self): @classmethod def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) + names = header["names"] + _named_columns = header["_named_columns"] key_columns = cudf.core.column.deserialize_columns( header["columns"], frames[: -header["__bin_labels_count"]] ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 9b60424c924..778db5973bf 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4,7 +4,6 @@ import functools import inspect -import pickle import textwrap import warnings from collections import abc @@ -28,7 +27,6 @@ ) from cudf.core import indexing_utils from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -415,7 +413,7 @@ def _loc_to_iloc(self, arg): return indices -class Series(SingleColumnFrame, IndexedFrame, Serializable): +class Series(SingleColumnFrame, IndexedFrame): """ One-dimensional GPU array (including time series). @@ -900,7 +898,7 @@ def hasnans(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self.index.serialize() + header["index"], index_frames = self.index.device_serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. 
@@ -916,8 +914,7 @@ def deserialize(cls, header, frames): header, frames[header["index_frame_count"] :] ) - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) + index = cls.device_deserialize(header["index"], frames[:index_nframes]) obj.index = index return obj diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl index 1ec077d10f77f4b3b3a8cc1bbfee559707b60dfc..64e06f0631d1475e9b6ab62434227253ad95a28b 100644 GIT binary patch literal 1108 zcmbVLO>fgc5RDs$(AI*6hP3J*AeTf!;=qAJL=aLjB%+)+AZuf9vKEdVt#=7QMdA{g zNPFw>dtqibse&jRuw-kz``(-HZ~l*SzhWPAccjxyrFjXaKH-WfCE*&(ajcVZH!dXa zCQPxhWK#}i{{`AFt&Nx?QIsl5c*$kTvh)jw?{EQMp=}<-MW&~Dl(7-dqC_ob90ump z8lAN4ka*{YmcZK79iz1Lnq!!~%OU)e@tDgYLBJf^ zWTLpxxq{F$&D%+L90|+f0%q_5R?O5ho==o0@h=RRHvW{AA1MTJESlnB=!up%5vPO| zXNc(`=AhIg!CAs3(Fl9bRG+0!Kpd?_6cX4u;!%A{ehlhxnq{~ZHZyW6~dW# z6#59Qik1o9DVgSz9b9|0T5*c19R^`9Y;wH>6Kr}#rNSZb7~tW_?qQVc>1+|%E}9Bm zH#XOjHZvwATVUBD$>Sm~mDI7Kjj35Aj!T|6$Tg<0guXX|D_o0q=!L-& z#2d4jZlvt#$FN?x+p6&{$?vP5_}EWaQ7~Hf1Ca`zWyQRZSps+@UW*|qi?>_d9{#*v e_j`4>)BpXwUA^hlnLQ{BD$Hjox5;g9SelgkgE>*?OI-{1 zZD)V4o_n)w-Ry8`Aj!$iz27RrL8m8S8T?_E*h)^vM$^<5Wd#d>!agEn4JoBtLvvENwKjjI@<8*+YK3T)J z{*XNaS0Y0J{5?e2DiA8E8jg<6f1(6SA78<2dW^K2!LlxYgHEUO;U`}95$sCnA=rHj z)`~^TgfOqmOgl#2Yr@||=T8ggbK&y3FumRxbeu-?i%pMfO%&D?rCq5ANjQpJ(vFbX z2m;3#6b&Q0DO>QJvD5M3CR{HS(qgU)`l^LVcvK{zc9DcUZoRrs(gA&MNtn7uUL~dP z=2R|KN=aAq`Xrd(=5#uxx|+~*AeqT{GjM};(4czdkjIjeUP4VajzQr+y>9y=pB6)f z(}ZwNuut4Br(u?2o2gKmsZumhHI4EuC#c>8{BjTS9x4a!1lSI%o8D5J^Xb3ZTPFQ8 z-(@kQNs=9AJc$68*f!fWnCx|d*v5}{GrwI7k2AIY`n4Fnk)t;Z+!Ef#i+gsP6V%K^ F?-%(Z>0$r? diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 68f2aaf9cab..b50ed04427f 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from packaging import version import cudf from cudf.testing import _utils as utils, assert_eq @@ -149,13 +150,19 @@ def test_serialize(df, to_host): def test_serialize_dtype_error_checking(): dtype = cudf.IntervalDtype("float", "right") - header, frames = dtype.serialize() - with pytest.raises(AssertionError): - # Invalid number of frames - type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) + # Must call device_serialize (not serialize) to ensure that the type metadata is + # encoded in the header. 
+ header, frames = dtype.device_serialize() with pytest.raises(AssertionError): # mismatching class cudf.StructDtype.deserialize(header, frames) + # The is-cuda flag list length must match the number of frames + header["is-cuda"] = [False] + with pytest.raises(AssertionError): + # Invalid number of frames + type(dtype).deserialize( + header, [np.zeros(1)] * (header["frame_count"] + 1) + ) def test_serialize_dataframe(): @@ -382,6 +389,10 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got +@pytest.mark.skipif( + version.parse(np.__version__) < version.parse("2.0.0"), + reason="The serialization of numpy 2.0 types is incompatible with numpy 1.x", +) def test_deserialize_cudf_23_12(datadir): fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index 899d78c999b..b85943626a6 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -79,7 +79,7 @@ def test_series_construction_with_nulls(): ) def test_serialize_struct_dtype(fields): dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.deserialize(*dtype.serialize()) + recreated = dtype.__class__.device_deserialize(*dtype.device_serialize()) assert recreated == dtype diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index d03180852eb..c28b7e49207 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -4,7 +4,7 @@ import pytest import dask -from dask import dataframe as dd +from dask import array as da, dataframe as dd from dask.distributed import Client from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 @@ -121,3 +121,17 @@ def test_unique(): ddf.x.unique().compute(), check_index=False, ) + + +def test_serialization_of_numpy_types(): + # Dask uses numpy integers as column names, which can break cudf serialization + with dask_cuda.LocalCUDACluster(n_workers=1) as cluster: + with Client(cluster): + with dask.config.set( + {"dataframe.backend": "cudf", "array.backend": "cupy"} + ): + rng = da.random.default_rng() + X_arr = rng.random((100, 10), chunks=(50, 10)) + X = dd.from_dask_array(X_arr) + X = X[X.columns[0]] + X.compute() From 5306eca611c7926fa59c581351c3cf7f0abf464d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 10 Dec 2024 09:50:48 -0800 Subject: [PATCH 54/78] Use rapids-logger to generate the cudf logger (#17307) This PR replaces cudf's logger implementation with one generated using https://github.com/rapidsai/rapids-logger. This approach allows us to centralize the logger definition across different RAPIDS projects while allowing each project to vendor its own copy with a suitable set of macros and default logger objects. The common logger also takes care of handling the more complex packaging problems around ensuring that we fully isolate our spdlog dependency and do not leak any of its symbols, allowing our libraries to be safely installed in a much broader set of environments. Contributes to https://github.com/rapidsai/build-planning/issues/104. 
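As a rough usage sketch of what this change means for code that configures libcudf logging: the macro surface stays the same while the runtime level is set on the generated default logger. The header path and the `level_enum` spelling below are taken from the developer-guide text in this patch and should be treated as assumptions rather than confirmed API:

```cpp
// Hypothetical example; the generated header name is an assumption.
#include <cudf/logger.hpp>

void quiet_cudf_logging()
{
  // Runtime filter: exclude anything below error severity.
  cudf::default_logger().set_level(cudf::level_enum::err);
}

void report_failure()
{
  // Same macro names as before; calls below the compile-time
  // CUDF_LOG_ACTIVE_LEVEL are excluded at build time.
  CUDF_LOG_ERROR("an unrecoverable error occurred");
}
```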
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - James Lamb (https://github.com/jameslamb) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17307 --- .../all_cuda-118_arch-x86_64.yaml | 1 - .../all_cuda-125_arch-x86_64.yaml | 1 - conda/recipes/libcudf/conda_build_config.yaml | 3 - conda/recipes/libcudf/meta.yaml | 1 - cpp/CMakeLists.txt | 22 ++--- cpp/benchmarks/io/cuio_common.cpp | 2 +- cpp/cmake/thirdparty/get_spdlog.cmake | 27 ------ .../developer_guide/DEVELOPER_GUIDE.md | 6 +- cpp/include/cudf/detail/utilities/logger.hpp | 27 ------ cpp/include/cudf/utilities/logger.hpp | 54 ------------ cpp/src/io/comp/nvcomp_adapter.cpp | 2 +- cpp/src/io/csv/reader_impl.cu | 2 +- cpp/src/io/orc/reader_impl_chunking.cu | 2 +- cpp/src/io/orc/stripe_enc.cu | 2 +- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 +- cpp/src/io/parquet/writer_impl.cu | 2 +- cpp/src/io/utilities/base64_utilities.cpp | 2 +- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- cpp/src/io/utilities/file_io_utilities.cpp | 3 +- cpp/src/io/utilities/getenv_or.hpp | 2 +- cpp/src/utilities/host_memory.cpp | 2 +- cpp/src/utilities/logger.cpp | 83 ------------------- cpp/src/utilities/stream_pool.cpp | 2 +- cpp/tests/utilities_tests/logger_tests.cpp | 48 +++++------ dependencies.yaml | 1 - 27 files changed, 53 insertions(+), 252 deletions(-) delete mode 100644 cpp/cmake/thirdparty/get_spdlog.cmake delete mode 100644 cpp/include/cudf/detail/utilities/logger.hpp delete mode 100644 cpp/include/cudf/utilities/logger.hpp delete mode 100644 cpp/src/utilities/logger.cpp diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bad508154aa..33fc2f651c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -87,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 969124a29ad..c290a83a37f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -86,7 +86,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e54c71de4fa..3d77307ccde 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -273,6 +273,14 @@ endif() # add third party dependencies using CPM 
rapids_cpm_init() + +# Not using rapids-cmake since we never want to find, always download. +CPMAddPackage( + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + 14bb233d2420f7187a690f0bb528ec0420c70d48 +) +rapids_make_logger(cudf EXPORT_SET cudf-exports) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -299,8 +307,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -772,7 +778,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -910,11 +915,8 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -938,8 +940,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -948,6 +949,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. # Data Types diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) 
SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp deleted file mode 100644 index 982554a23f5..00000000000 --- a/cpp/include/cudf/utilities/logger.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace CUDF_EXPORT cudf { - -namespace detail { -spdlog::logger& logger(); -} - -/** - * @brief Returns the global logger. - * - * This is a global instance of a spdlog logger. It can be used to configure logging behavior in - * libcudf. - * - * Examples: - * @code{.cpp} - * // Turn off logging at runtime - * cudf::logger().set_level(spdlog::level::off); - * // Add a stdout sink to the logger - * cudf::logger().sinks().push_back(std::make_shared()); - * // Replace the default sink - * cudf::logger().sinks() ={std::make_shared()}; - * @endcode - * - * Note: Changes to the sinks are not thread safe and should only be done during global - * initialization. - * - * @return spdlog::logger& The logger. - */ -[[deprecated( - "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& -logger(); - -} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 9d3cf75a13f..d45c02f374f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -18,8 +18,8 @@ #include "nvcomp_adapter.cuh" -#include #include +#include #include #include diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 6c84b53db46..7f0b5e07b09 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -28,13 +28,13 @@ #include "io/utilities/parsing_utils.cuh" #include -#include #include #include #include #include #include #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index fcaee9c548e..726c79bd004 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,7 +22,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ed0b6969154..07172b6b7f7 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -23,10 +23,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 76e5369ffd0..0906017ee61 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,9 +29,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index bfd0cc992cf..0dd1aff41e9 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ 
b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,7 +23,7 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" -#include +#include #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index f865c9a7643..188e6a8c0d8 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -38,10 +38,10 @@ #include #include #include -#include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp index 2a2a07afc8d..00fc54f9883 100644 --- a/cpp/src/io/utilities/base64_utilities.cpp +++ b/cpp/src/io/utilities/base64_utilities.cpp @@ -60,7 +60,7 @@ #include "base64_utilities.hpp" -#include +#include #include diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index bed03869b34..dfa5d46cf48 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -16,9 +16,9 @@ #include "file_io_utilities.hpp" -#include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 62ef7c7a794..38dedcc2627 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -17,11 +17,11 @@ #include "file_io_utilities.hpp" #include "getenv_or.hpp" -#include #include #include #include #include +#include #include #include diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 9b17e7f6d55..28367c95430 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -19,10 +19,11 @@ #include "getenv_or.hpp" #include -#include #include +#include #include +#include #include #include diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index 3fd97a00b61..b9613428418 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index e30806a5011..4196523d211 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include #include +#include #include #include #include diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp deleted file mode 100644 index e52fffbd8c6..00000000000 --- a/cpp/src/utilities/logger.cpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include - -#include -#include - -#include - -namespace { - -/** - * @brief Creates a sink for libcudf logging. - * - * Returns a file sink if the file name has been specified, otherwise returns a stderr sink. 
- */ -[[nodiscard]] spdlog::sink_ptr make_libcudf_sink() -{ - if (auto filename = std::getenv("LIBCUDF_DEBUG_LOG_FILE"); filename != nullptr) { - return std::make_shared(filename, true); - } else { - return std::make_shared(); - } -} - -/** - * @brief Converts the level name into the `spdlog` level enum. - */ -[[nodiscard]] spdlog::level::level_enum libcudf_log_level() -{ - auto const env_level = std::getenv("LIBCUDF_LOGGING_LEVEL"); - if (env_level == nullptr) { return spdlog::level::warn; } - - auto const env_lvl_str = std::string(env_level); - if (env_lvl_str == "TRACE") return spdlog::level::trace; - if (env_lvl_str == "DEBUG") return spdlog::level::debug; - if (env_lvl_str == "INFO") return spdlog::level::info; - if (env_lvl_str == "WARN") return spdlog::level::warn; - if (env_lvl_str == "ERROR") return spdlog::level::err; - if (env_lvl_str == "CRITICAL") return spdlog::level::critical; - if (env_lvl_str == "OFF") return spdlog::level::off; - - CUDF_FAIL("Invalid value for LIBCUDF_LOGGING_LEVEL environment variable"); -} - -/** - * @brief Simple wrapper around a spdlog::logger that performs cuDF-specific initialization. - */ -struct logger_wrapper { - spdlog::logger logger_; - - logger_wrapper() : logger_{"CUDF", make_libcudf_sink()} - { - logger_.set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); - logger_.set_level(libcudf_log_level()); - logger_.flush_on(spdlog::level::warn); - } -}; - -} // namespace - -spdlog::logger& cudf::detail::logger() -{ - static logger_wrapper wrapped{}; - return wrapped.logger_; -} - -spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d1bebd1937..b0f2d8c0637 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#include #include +#include #include #include diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index cfab570833b..58396115a54 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -16,29 +16,25 @@ #include -#include - -#include +#include #include class LoggerTest : public cudf::test::BaseFixture { std::ostringstream oss; - spdlog::level::level_enum prev_level; - std::vector prev_sinks; + cudf::level_enum prev_level; public: - LoggerTest() - : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} + LoggerTest() : prev_level{cudf::default_logger().level()} { - cudf::detail::logger().sinks() = {std::make_shared(oss)}; - cudf::detail::logger().set_formatter( - std::unique_ptr(new spdlog::pattern_formatter("%v"))); + cudf::default_logger().sinks().push_back(std::make_shared(oss)); + cudf::default_logger().set_pattern("%v"); } ~LoggerTest() override { - cudf::detail::logger().set_level(prev_level); - cudf::detail::logger().sinks() = prev_sinks; + cudf::default_logger().set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v"); + cudf::default_logger().set_level(prev_level); + cudf::default_logger().sinks().pop_back(); } void clear_sink() { oss.str(""); } @@ -47,32 +43,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::detail::logger().critical("crit msg"); + cudf::default_logger().critical("crit msg"); ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::detail::logger().trace("trace"); - cudf::detail::logger().debug("debug"); - cudf::detail::logger().info("info"); - cudf::detail::logger().warn("warn"); - cudf::detail::logger().error("error"); - cudf::detail::logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); + cudf::default_logger().trace("trace"); + cudf::default_logger().debug("debug"); + cudf::default_logger().info("info"); + cudf::default_logger().warn("warn"); + cudf::default_logger().error("error"); + cudf::default_logger().critical("critical"); + ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::detail::logger().set_level(spdlog::level::warn); - cudf::detail::logger().info("info"); - cudf::detail::logger().warn("warn"); + cudf::default_logger().set_level(cudf::level_enum::warn); + cudf::default_logger().info("info"); + cudf::default_logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::detail::logger().set_level(spdlog::level::debug); - cudf::detail::logger().trace("trace"); - cudf::detail::logger().debug("debug"); + cudf::default_logger().set_level(cudf::level_enum::debug); + cudf::default_logger().trace("trace"); + cudf::default_logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/dependencies.yaml b/dependencies.yaml index 3c55ce2c614..44767f1e9d3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -413,7 +413,6 @@ dependencies: - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 - - spdlog>=1.14.1,<1.15 depends_on_nvcomp: common: - output_types: conda From 657f50bae866d97a231d565f34a1941efd49c721 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 10 Dec 2024 10:16:11 -0800 Subject: [PATCH 55/78] Fix typos, rename types, and add null_probability benchmark axis for distinct (#17546) This PR addresses several minor issues discovered while working on #17467: - Corrected a typo where 
`RowHasher` should have been `RowEqual` - Renamed `hash_set_type` to `distinct_set_t` - Added a `null_probability` benchmark axis for the distinct benchmark, similar to other stream compaction benchmarks Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17546 --- cpp/benchmarks/stream_compaction/distinct.cpp | 4 +++- cpp/src/stream_compaction/distinct.cu | 4 ++-- cpp/src/stream_compaction/distinct_helpers.cu | 12 ++++++------ cpp/src/stream_compaction/distinct_helpers.hpp | 12 +++++++----- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index d7deebca89a..75d04bb4e8e 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -34,6 +34,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu index 7d11b02d3e1..9ab8ed5938a 100644 --- a/cpp/src/stream_compaction/distinct.cu +++ b/cpp/src/stream_compaction/distinct.cu @@ -95,8 +95,8 @@ rmm::device_uvector distinct_indices(table_view const& input, auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input); auto const helper_func = [&](auto const& d_equal) { - using RowHasher = std::decay_t; - auto set = hash_set_type{ + using RowEqual = std::decay_t; + auto set = distinct_set_t{ num_rows, 0.5, // desired load factor cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu index c3a004b7f28..aadb438b019 100644 --- a/cpp/src/stream_compaction/distinct_helpers.cu +++ b/cpp/src/stream_compaction/distinct_helpers.cu @@ -21,8 +21,8 @@ namespace cudf::detail { -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, @@ -100,7 +100,7 @@ rmm::device_uvector reduce_by_row(hash_set_type& set, } template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -110,7 +110,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& 
set, @@ -120,7 +120,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, @@ -130,7 +130,7 @@ template rmm::device_uvector reduce_by_row( rmm::device_async_resource_ref mr); template rmm::device_uvector reduce_by_row( - hash_set_type>& set, diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp index f15807c2434..4ca1cab937a 100644 --- a/cpp/src/stream_compaction/distinct_helpers.hpp +++ b/cpp/src/stream_compaction/distinct_helpers.hpp @@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep) } } -template -using hash_set_type = +template +using distinct_set_t = cuco::static_set, cuda::thread_scope_device, - RowHasher, + RowEqual, cuco::linear_probing<1, cudf::experimental::row::hash::device_row_hasher< cudf::hashing::detail::default_hash, @@ -79,6 +79,8 @@ using hash_set_type = * the `reduction_init_value()` function. Then, the reduction result for each row group is written * into the output array at the index of an unspecified row in the group. * + * @tparam RowEqual The type of row equality comparator + * * @param set The auxiliary set to perform reduction * @param set_size The number of elements in set * @param num_rows The number of all input rows @@ -87,8 +89,8 @@ using hash_set_type = * @param mr Device memory resource used to allocate the returned vector * @return A device_uvector containing the output indices */ -template -rmm::device_uvector reduce_by_row(hash_set_type& set, +template +rmm::device_uvector reduce_by_row(distinct_set_t& set, size_type num_rows, duplicate_keep_option keep, rmm::cuda_stream_view stream, From be62ea60440a8357702eb292e19e69dd6be001e0 Mon Sep 17 00:00:00 2001 From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Tue, 10 Dec 2024 13:21:39 -0600 Subject: [PATCH 56/78] Update version references in workflow (#17568) Update version references in breaking-change trigger workflow --- .github/workflows/trigger-breaking-change-alert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} From 1e95864f6631a1dc90d78fc9418281c256fa9f59 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 10 Dec 2024 13:47:42 -0600 Subject: [PATCH 57/78] Fix Dask-cuDF `clip` APIs (#17509) Closes https://github.com/rapidsai/cudf/issues/17502 **Background Info**: The cudf and pandas `axis` defaults are different, and the upstream dask-expr `clip` APIs are consistent with the behavior of Pandas (not cudf). 
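In other words, cudf's `clip` defaults to `axis=1` (one bound per column, applied element-wise), while the inherited dask-expr methods follow the pandas default, so the overrides below pin `axis=1` and raise `NotImplementedError` for any other axis. A small usage sketch mirroring the new test:

```python
import cudf
import dask.dataframe as dd

df = cudf.DataFrame({"id": ["a", "b", "c", "d"], "score": [-1, 1, 4, 6]})
ddf = dd.from_pandas(df, npartitions=2)

# One (lower, upper) bound per column, applied along axis=1 as in cudf
result = ddf.clip(lower=["b", 1], upper=["d", 5], axis=1).compute()
```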
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17509 --- .../dask_cudf/dask_cudf/_expr/collection.py | 10 +++++++ python/dask_cudf/dask_cudf/tests/test_core.py | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 2dc4031b876..5192e6b8171 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -163,6 +163,11 @@ def read_text(*args, **kwargs): return legacy_read_text(*args, **kwargs) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Series(DXSeries, CudfFrameBase): def groupby(self, by, **kwargs): @@ -182,6 +187,11 @@ def struct(self): return StructMethods(self) + def clip(self, lower=None, upper=None, axis=1): + if axis not in (None, 1): + raise NotImplementedError("axis not yet supported in clip.") + return new_collection(self.expr.clip(lower, upper, 1)) + class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cda7e2d134d..7101fb7e00a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1019,3 +1019,29 @@ def test_rename_axis_after_join(): result = ddf1.join(ddf2, how="outer") expected = df1.join(df2, how="outer") dd.assert_eq(result, expected, check_index=False) + + +def test_clip_dataframe(): + df = cudf.DataFrame( + { + "id": ["a", "b", "c", "d"], + "score": [-1, 1, 4, 6], + } + ) + expect = df.clip(lower=["b", 1], upper=["d", 5], axis=1) + got = dd.from_pandas(df, npartitions=2).clip( + lower=["b", 1], upper=["d", 5], axis=1 + ) + dd.assert_eq(expect, got) + + +def test_clip_series(): + ser = cudf.Series([-0.5, 0.5, 4.5, 5.5]) + expect = ser.clip(lower=0, upper=5).round().astype(int) + got = ( + dd.from_pandas(ser, npartitions=2) + .clip(lower=0, upper=5) + .round() + .astype(int) + ) + dd.assert_eq(expect, got) From 0c5bd6627159fe44a49e56020f0c0842696bc397 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:10:45 -0500 Subject: [PATCH 58/78] Rework minhash APIs for deprecation cycle (#17421) Renames `minhash_permuted()` to `minhash()` and deprecates `minhash_permuted` Also removes the `word_minhash` APIs deprecated in 24.12. 
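A minimal sketch of the surviving C++ entry point, mirroring the updated tests below (the deprecated `minhash_permuted()` spelling now calls the same implementation):

```cpp
#include <nvtext/minhash.hpp>

#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>

void minhash_example()
{
  // example input strings (placeholder values for illustration)
  auto const input = cudf::test::strings_column_wrapper({"doc one", "doc two"});
  auto const view  = cudf::strings_column_view(input);

  // one (a, b) permutation parameter pair per output hash value
  auto const params = cudf::test::fixed_width_column_wrapper<uint32_t>({10, 11, 12});

  // returns a LIST column: one row per input string, one UINT32 minhash per
  // parameter pair, computed over 4-character substrings
  auto const result = nvtext::minhash(
    view, 0 /*seed*/, cudf::column_view(params), cudf::column_view(params), 4 /*width*/);
}
```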
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17421 --- cpp/benchmarks/text/minhash.cpp | 5 +- cpp/include/nvtext/minhash.hpp | 194 +--------- cpp/src/text/minhash.cu | 341 +----------------- cpp/tests/text/minhash_tests.cpp | 79 ++-- docs/cudf/source/conf.py | 2 + python/cudf/cudf/_lib/nvtext/minhash.pyx | 46 +-- python/cudf/cudf/_lib/strings/__init__.py | 4 - python/cudf/cudf/core/column/string.py | 166 +-------- .../cudf/cudf/tests/text/test_text_methods.py | 72 +--- .../pylibcudf/libcudf/nvtext/minhash.pxd | 34 -- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 12 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 7 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 168 +-------- .../pylibcudf/tests/test_nvtext_minhash.py | 30 +- 14 files changed, 100 insertions(+), 1060 deletions(-) diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index b2c1a23f57e..f0d5d9ecb5d 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext { * @file */ -/** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values for each string in input - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. 
- * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr minhash( - cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - /** * @brief Returns the minhash values for each string * @@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash_permuted( +std::unique_ptr minhash( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -142,67 +79,16 @@ std::unique_ptr minhash_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash value for each string - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string. - * - * Any null row entries result in corresponding null output rows. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. - * The hash function returns 2 uint64 values but only the first value - * is used with the minhash calculation. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if the width < 2 - * - * @param input Strings column to compute minhash - * @param seed Seed value used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return Minhash values as UINT64 for each string in input - */ -[[deprecated]] std::unique_ptr minhash64( - cudf::strings_column_view const& input, - cudf::numeric_scalar seed = 0, - cudf::size_type width = 4, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); - -/** - * @brief Returns the minhash values for each string per seed - * - * Hash values are computed from substrings of each string and the - * minimum hash value is returned for each string for each seed. - * Each row of the list column are seed results for the corresponding - * string. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm. + * @copydoc nvtext::minhash * - * Any null row entries result in corresponding null output rows. 
- * - * @deprecated Deprecated in 24.12 - to be replaced in a future release - * - * @throw std::invalid_argument if the width < 2 - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Strings column to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param width The character width used for apply substrings; - * Default is 4 characters. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash() */ -[[deprecated]] std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash_permuted( cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width = 4, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -244,7 +130,7 @@ std::unique_ptr minhash_permuted( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64_permuted( +std::unique_ptr minhash64( cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -254,64 +140,18 @@ std::unique_ptr minhash64_permuted( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. - * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x86_32 for the hash algorithm. - * - * Any null row entries result in corresponding null output rows. + * @copydoc nvtext::minhash64 * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed + * @deprecated Use nvtext::minhash64() */ -[[deprecated]] std::unique_ptr word_minhash( - cudf::lists_column_view const& input, - cudf::device_span seeds, +[[deprecated]] std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); -/** - * @brief Returns the minhash values for each row of strings per seed - * - * Hash values are computed from each string in each row and the - * minimum hash value is returned for each row for each seed. 
- * Each row of the output list column are seed results for the corresponding - * input row. The order of the elements in each row match the order of - * the seeds provided in the `seeds` parameter. - * - * This function uses MurmurHash3_x64_128 for the hash algorithm though - * only the first 64-bits of the hash are used in computing the output. - * - * Any null row entries result in corresponding null output rows. - * - * @deprecated Deprecated in 24.12 - * - * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit - * - * @param input Lists column of strings to compute minhash - * @param seeds Seed values used for the hash algorithm - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return List column of minhash values for each string per seed - */ -[[deprecated]] std::unique_ptr word_minhash64( - cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index b7a719a2041..9a44d9477ab 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -52,118 +52,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Compute the minhash of each string for each seed - * - * This is a warp-per-string algorithm where parallel threads within a warp - * work on substrings of a single string row. - * - * @tparam HashFunction hash function to use on each substring - * - * @param d_strings Strings column to process - * @param seeds Seeds for hashing each string - * @param width Substring window size in characters - * @param d_hashes Minhash output values for each string - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, - cudf::device_span seeds, - cudf::size_type width, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - - auto const str_idx = static_cast(idx / cudf::detail::warp_size); - if (str_idx >= d_strings.size()) { return; } - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - - if (d_strings.is_null(str_idx)) { return; } - - auto const d_str = d_strings.element(str_idx); - auto const d_output = d_hashes + (str_idx * seeds.size()); - - // initialize hashes output for this string - if (lane_idx == 0) { - auto const init = d_str.empty() ? 
0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - auto const begin = d_str.data() + lane_idx; - auto const end = d_str.data() + d_str.size_bytes(); - - // each lane hashes 'width' substrings of d_str - for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) { - if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; } - auto const check_str = // used for counting 'width' characters - cudf::string_view(itr, static_cast(thrust::distance(itr, end))); - auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); - if ((itr != d_str.data()) && (left > 0)) { continue; } // true if past the end of the string - - auto const hash_str = cudf::string_view(itr, bytes); - // hashing with each seed on the same section of the string is 10x faster than - // computing the substrings for each seed - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash substring and store the min value - if constexpr (std::is_same_v) { - auto const hvalue = hasher(hash_str); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. - auto const hvalue = thrust::get<0>(hasher(hash_str)); - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); - } - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr minhash_fn(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS(width >= 2, - "Parameter width should be an integer value of 2 or greater", - std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_kernel<<>>( - *d_strings, seeds, width, d_hashes); - - return hashes; -} - constexpr cudf::thread_index_type block_size = 256; // for potentially tuning minhash_seed_kernel independently from block_size constexpr cudf::thread_index_type tile_size = block_size; @@ -297,13 +185,13 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, * @param d_results Final results vector of calculate values */ template -CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, - cudf::device_span indices, - cudf::device_span parameter_a, - cudf::device_span parameter_b, 
- cudf::size_type width, - hash_value_type const* d_hashes, - hash_value_type* d_results) +CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const idx = (tid / blocks_per_string) / block_size; @@ -478,7 +366,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -489,7 +377,7 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto d_indices = cudf::device_span(indices.data() + threshold_index, count); cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_permuted_kernel + minhash_kernel <<>>( *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } @@ -497,101 +385,6 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return results; } -/** - * @brief Compute the minhash of each list row of strings for each seed - * - * This is a warp-per-row algorithm where parallel threads within a warp - * work on strings in a single list row. - * - * @tparam HashFunction hash function to use on each string - * - * @param d_input List of strings to process - * @param seeds Seeds for hashing each string - * @param d_hashes Minhash output values (one per row) - */ -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, - cudf::device_span seeds, - hash_value_type* d_hashes) -{ - auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const row_idx = idx / cudf::detail::warp_size; - - if (row_idx >= d_input.size()) { return; } - if (d_input.is_null(row_idx)) { return; } - - auto const d_row = cudf::list_device_view(d_input, row_idx); - auto const d_output = d_hashes + (row_idx * seeds.size()); - - // initialize hashes output for this row - auto const lane_idx = static_cast(idx % cudf::detail::warp_size); - if (lane_idx == 0) { - auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); - thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); - } - __syncwarp(); - - // each lane hashes a string from the input row - for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { - auto const hash_str = - d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); - for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { - auto const hasher = HashFunction(seeds[seed_idx]); - // hash string and store the min value - hash_value_type hv; - if constexpr (std::is_same_v) { - hv = hasher(hash_str); - } else { - // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values - // but only uses the first uint64 value as requested by the LLM team. 
- hv = thrust::get<0>(hasher(hash_str)); - } - cuda::atomic_ref ref{*(d_output + seed_idx)}; - ref.fetch_min(hv, cuda::std::memory_order_relaxed); - } - } -} - -template < - typename HashFunction, - typename hash_value_type = std:: - conditional_t, uint32_t, uint64_t>> -std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); - CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < - static_cast(std::numeric_limits::max()), - "The number of seeds times the number of input rows exceeds the column size limit", - std::overflow_error); - - auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } - - auto const d_input = cudf::column_device_view::create(input.parent(), stream); - - auto hashes = cudf::make_numeric_column(output_type, - input.size() * static_cast(seeds.size()), - cudf::mask_state::UNALLOCATED, - stream, - mr); - auto d_hashes = hashes->mutable_view().data(); - auto lcdv = cudf::detail::lists_column_device_view(*d_input); - - constexpr cudf::thread_index_type block_size = 256; - cudf::detail::grid_1d grid{ - static_cast(input.size()) * cudf::detail::warp_size, block_size}; - minhash_word_kernel - <<>>(lcdv, seeds, d_hashes); - - return hashes; -} - std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, @@ -620,30 +413,6 @@ std::unique_ptr build_list_result(cudf::column_view const& input, } } // namespace -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash(cudf::strings_column_view const& input, uint32_t seed, cudf::device_span parameter_a, @@ -658,30 +427,6 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar const& seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto const seeds = cudf::device_span{seed.data(), 1}; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); - return hashes; -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, - cudf::size_type width, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -696,45 +441,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; - auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); - return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); -} } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash(input, seed, width, stream, mr); -} - -std::unique_ptr minhash(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash(input, seeds, width, stream, mr); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, @@ -750,23 +468,15 @@ std::unique_ptr minhash_permuted(cudf::strings_column_view const& } std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::numeric_scalar seed, - cudf::size_type width, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::minhash64(input, seed, width, stream, mr); -} - -std::unique_ptr minhash64(cudf::strings_column_view const& input, - cudf::device_span seeds, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::minhash64(input, seeds, width, stream, mr); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, @@ -781,21 +491,4 @@ std::unique_ptr minhash64_permuted(cudf::strings_column_view const return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } -std::unique_ptr word_minhash(cudf::lists_column_view const& input, - cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash(input, seeds, stream, mr); -} - -std::unique_ptr word_minhash64(cudf::lists_column_view const& input, 
- cudf::device_span seeds, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - return detail::word_minhash64(input, seeds, stream, mr); -} } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 042ac44621e..8bfb17e0efd 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -44,10 +44,9 @@ TEST_F(MinHashTest, Permuted) auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(10); - auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -66,9 +65,9 @@ TEST_F(MinHashTest, Permuted) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -95,10 +94,9 @@ TEST_F(MinHashTest, PermutedWide) auto input = cudf::test::strings_column_wrapper({small, wide}); auto view = cudf::strings_column_view(input); - auto first = thrust::counting_iterator(20); - auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -109,9 +107,9 @@ TEST_F(MinHashTest, PermutedWide) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -132,9 +130,8 @@ TEST_F(MinHashTest, PermutedManyParameters) auto first = thrust::counting_iterator(20); // more than params_per_thread - auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); using LCW32 = cudf::test::lists_column_wrapper; // clang-format off @@ -152,9 +149,9 @@ TEST_F(MinHashTest, PermutedManyParameters) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); // more than 
params_per_thread - auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); - auto results64 = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off @@ -182,15 +179,13 @@ TEST_F(MinHashTest, PermutedManyParameters) TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); - auto results = - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); - results = nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); + results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -199,18 +194,16 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); auto empty = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), - std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); auto empty64 = cudf::test::fixed_width_column_wrapper(); EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), std::invalid_argument); std::vector h_input(50000, ""); @@ -219,18 +212,16 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), std::overflow_error); - auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - 
EXPECT_THROW(nvtext::minhash64_permuted( - view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), - std::overflow_error); + EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); EXPECT_THROW( - nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), std::invalid_argument); } diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index fbb9ca4b128..7aa8f9f4a1c 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -594,6 +594,8 @@ def on_missing_reference(app, env, node, contnode): # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), + ("py:class", "np.uint32"), + ("py:class", "np.uint64"), ] diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 25cfcf99ca6..9f2b3f92502 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,19 +10,9 @@ from pylibcudf import nvtext @acquire_spill_lock() -def minhash(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): +def minhash(Column input, uint32_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash_permuted( + nvtext.minhash.minhash( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -33,19 +23,9 @@ def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width) @acquire_spill_lock() -def minhash64(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): +def minhash64(Column input, uint64_t seed, Column a, Column b, int width): return Column.from_pylibcudf( - nvtext.minhash.minhash64_permuted( + nvtext.minhash.minhash64( input.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -53,21 +33,3 @@ def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int widt width, ) ) - - -@acquire_spill_lock() -def word_minhash(Column input, Column seeds): - result = nvtext.minhash.word_minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - result = nvtext.minhash.word_minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 341ba6d11c3..b9095a22a42 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,10 +9,6 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, - minhash64_permuted, - minhash_permuted, - 
word_minhash, - word_minhash64, ) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4a2483a80e3..06196717ce3 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5461,49 +5461,6 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - width : int - The width of the substring to hash. - Default is 4 characters. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582] - 1 [962346254] - dtype: list - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582, 403093213, 1258052021] - 1 [962346254, 677440381, 122618762] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, width) - ) - - def minhash_permuted( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: """ @@ -5535,7 +5492,7 @@ def minhash_permuted( >>> s = cudf.Series(['this is my', 'favorite book']) >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) - >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + >>> s.str.minhash(0, a=a, b=b, width=5) 0 [1305480171, 462824409, 74608232] 1 [32665388, 65330773, 97996158] dtype: list @@ -5551,53 +5508,10 @@ def minhash_permuted( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash_permuted( - self._column, seed, a_column, b_column, width - ) + libstrings.minhash(self._column, seed, a_column, b_column, width) ) def minhash64( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. - width : int - The width of the substring to hash. - Default is 4 characters. 
- - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> str_series.str.minhash64(seeds) - 0 [3232308021562742685, 4445611509348165860, 586435843695903598] - 1 [23008204270530356, 1281229757012344693, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash64(self._column, seeds_column, width) - ) - - def minhash64_permuted( self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int ) -> SeriesOrIndex: """ @@ -5628,7 +5542,7 @@ def minhash64_permuted( >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) >>> a = cudf.Series([2, 3], dtype=np.uint64) >>> b = cudf.Series([5, 6], dtype=np.uint64) - >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + >>> s.str.minhash64(0, a=a, b=b, width=5) 0 [172452388517576012, 316595762085180527] 1 [71427536958126239, 58787297728258215] 2 [423885828176437114, 1140588505926961370] @@ -5645,79 +5559,7 @@ def minhash64_permuted( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64_permuted( - self._column, seed, a_column, b_column, width - ) - ) - - def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> ls.str.word_minhash(seeds=seeds) - 0 [21141582, 1232889953, 1268336794] - 1 [962346254, 2321233602, 1354839212] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash(self._column, seeds_column) - ) - - def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. 
- - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> ls.str.word_minhash64(seeds) - 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] - 1 [5240044617220523711, 5847101123925041457, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash64(self._column, seeds_column) + libstrings.minhash64(self._column, seed, a_column, b_column, width) ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 3637ef075f2..9a62285403f 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,7 +882,7 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash_permuted(): +def test_minhash(): strings = cudf.Series(["this is my", "favorite book", None, ""]) params = cudf.Series([1, 2, 3], dtype=np.uint32) @@ -894,7 +894,7 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash(0, a=params, b=params, width=5) assert_eq(expected, actual) params = cudf.Series([1, 2, 3], dtype=np.uint64) @@ -912,78 +912,18 @@ def test_minhash_permuted(): cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) + actual = strings.str.minhash64(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash_permuted(1, a="a", b="b", width=7) + strings.str.minhash(1, a="a", b="b", width=7) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash_permuted(1, a=params, b=params, width=6) + strings.str.minhash(1, a=params, b=params, width=6) with pytest.raises(ValueError): params = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64_permuted(1, a=params, b=params, width=8) - - -def test_word_minhash(): - ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - - expected = cudf.Series( - [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), - cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash(seeds=seeds) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - cudf.Series([2603139454418834912], dtype=np.uint64), - cudf.Series([5240044617220523711], dtype=np.uint64), - ] - ) - actual = ls.str.word_minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - expected = cudf.Series( - [ - cudf.Series( - [ - 2603139454418834912, - 8644371945174847701, - 5541030711534384340, - ], - dtype=np.uint64, - ), - cudf.Series( - [5240044617220523711, 5847101123925041457, 153762819128779913], - dtype=np.uint64, - ), - 
] - ) - actual = ls.str.word_minhash64(seeds=seeds) - assert_eq(expected, actual) - - # test wrong seed types - with pytest.raises(ValueError): - ls.str.word_minhash(seeds="a") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - ls.str.word_minhash(seeds=seeds) - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - ls.str.word_minhash64(seeds=seeds) + strings.str.minhash64(1, a=params, b=params, width=8) def test_jaccard_index(): diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 8570531dfde..9d1e8cba425 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -11,18 +11,6 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] minhash( - const column_view &strings, - const numeric_scalar[uint32_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash_permuted( const column_view &strings, const uint32_t seed, const column_view &a, @@ -31,31 +19,9 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: ) except + cdef unique_ptr[column] minhash64( - const column_view &strings, - const column_view &seeds, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64( - const column_view &strings, - const numeric_scalar[uint64_t] seed, - const size_type width, - ) except +libcudf_exception_handler - - cdef unique_ptr[column] minhash64_permuted( const column_view &strings, const uint64_t seed, const column_view &a, const column_view &b, const size_type width, ) except + - - cdef unique_ptr[column] word_minhash( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler - - cdef unique_ptr[column] word_minhash64( - const column_view &input, - const column_view &seeds - ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 6b544282f44..0af53748cdc 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -9,9 +9,7 @@ ctypedef fused ColumnOrScalar: Column Scalar -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -19,16 +17,10 @@ cpdef Column minhash_permuted( size_type width ) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, Column b, size_type width ) - -cpdef Column word_minhash(Column input, Column seeds) - -cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index a2d9b6364f7..5d88cfbbea0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,13 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
from pylibcudf.column import Column -from pylibcudf.scalar import Scalar def minhash( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... def minhash64( - input: Column, seeds: Column | Scalar, width: int = 4 + input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... -def word_minhash(input: Column, seeds: Column) -> Column: ... -def word_minhash64(input: Column, seeds: Column) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 5448cc6de9b..84811cda867 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,69 +8,15 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, - minhash64_permuted as cpp_minhash64_permuted, - minhash_permuted as cpp_minhash_permuted, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, ) -from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from pylibcudf.scalar cimport Scalar - -from cython.operator import dereference -import warnings __all__ = [ "minhash", "minhash64", - "word_minhash", - "word_minhash64", ] -cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`minhash`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. - width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash_permuted( +cpdef Column minhash( Column input, uint32_t seed, Column a, @@ -81,7 +27,7 @@ cpdef Column minhash_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x86_32 for the hash algorithm. - For details, see :cpp:func:`minhash_permuted`. + For details, see :cpp:func:`minhash`. Parameters ---------- @@ -104,7 +50,7 @@ cpdef Column minhash_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash_permuted( + c_result = cpp_minhash( input.view(), seed, a.view(), @@ -114,50 +60,7 @@ cpdef Column minhash_permuted( return Column.from_libcudf(move(c_result)) -cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): - """ - Returns the minhash values for each string per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm. - - For details, see :cpp:func:`minhash64`. - - Parameters - ---------- - input : Column - Strings column to compute minhash - seeds : Column or Scalar - Seed value(s) used for the hash algorithm. 
- width : size_type - Character width used for apply substrings; - Default is 4 characters. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - warnings.warn( - "Starting in version 25.02, the signature of this function will " - "be changed to match pylibcudf.nvtext.minhash64_permuted.", - FutureWarning - ) - - cdef unique_ptr[column] c_result - - if not isinstance(seeds, (Column, Scalar)): - raise TypeError("Must pass a Column or Scalar") - - with nogil: - c_result = cpp_minhash64( - input.view(), - seeds.view() if ColumnOrScalar is Column else - dereference(seeds.c_obj.get()), - width - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column minhash64_permuted( +cpdef Column minhash64( Column input, uint64_t seed, Column a, @@ -168,7 +71,7 @@ cpdef Column minhash64_permuted( Returns the minhash values for each string. This function uses MurmurHash3_x64_128 for the hash algorithm. - For details, see :cpp:func:`minhash64_permuted`. + For details, see :cpp:func:`minhash64`. Parameters ---------- @@ -191,7 +94,7 @@ cpdef Column minhash64_permuted( cdef unique_ptr[column] c_result with nogil: - c_result = cpp_minhash64_permuted( + c_result = cpp_minhash64( input.view(), seed, a.view(), @@ -200,62 +103,3 @@ cpdef Column minhash64_permuted( ) return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x86_32 for the hash algorithm. - - For details, see :cpp:func:`word_minhash`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. - - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) - -cpdef Column word_minhash64(Column input, Column seeds): - """ - Returns the minhash values for each row of strings per seed. - This function uses MurmurHash3_x64_128 for the hash algorithm though - only the first 64-bits of the hash are used in computing the output. - - For details, see :cpp:func:`word_minhash64`. - - Parameters - ---------- - input : Column - Lists column of strings to compute minhash - seeds : Column or Scalar - Seed values used for the hash algorithm. 
- - Returns - ------- - Column - List column of minhash values for each string per seed - """ - cdef unique_ptr[column] c_result - - with nogil: - c_result = cpp_word_minhash64( - input.view(), - seeds.view() - ) - - return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ec533e64307..ad7a6f7a762 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -13,20 +13,13 @@ def minhash_input_data(request): return input_arr, seeds, request.param -@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) -def word_minhash_input_data(request): - input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) - seeds = pa.array([2, 3, 4, 5], request.param) - return input_arr, seeds, request.param - - @pytest.mark.parametrize("width", [5, 12]) -def test_minhash_permuted(minhash_input_data, width): +def test_minhash(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash_permuted + plc.nvtext.minhash.minhash if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64_permuted + else plc.nvtext.minhash.minhash64 ) result = minhash_func( plc.interop.from_arrow(input_arr), @@ -40,20 +33,3 @@ def test_minhash_permuted(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) - - -def test_word_minhash(word_minhash_input_data): - input_arr, seeds, seed_type = word_minhash_input_data - word_minhash_func = ( - plc.nvtext.minhash.word_minhash - if seed_type == pa.uint32() - else plc.nvtext.minhash.word_minhash64 - ) - result = word_minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) - ) - pa_result = plc.interop.to_arrow(result) - assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) - assert pa_result.type == pa.list_( - pa.field("element", seed_type, nullable=False) - ) From cd3a79bfa71be68c8e95ff8dd60a41eb641f8d5a Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 11 Dec 2024 11:12:32 -0600 Subject: [PATCH 59/78] Specify a version for rapids_logger dependency (#17573) ## Description #17307 broke builds that use the rapids-cmake pinned dependencies feature since no version was specified for the rapids_logger dependency. This adds a version string equal to the git tag so the dependency has a stated version. ## Checklist - [X] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [ ] New or existing tests cover these changes. - [X] The documentation is up to date with these changes. --------- Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Co-authored-by: Vyas Ramasubramani Co-authored-by: Bradley Dice --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3d77307ccde..2f17b57b0a4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -277,7 +277,7 @@ rapids_cpm_init() # Not using rapids-cmake since we never want to find, always download. 
CPMAddPackage( NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG - 14bb233d2420f7187a690f0bb528ec0420c70d48 + c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 ) rapids_make_logger(cudf EXPORT_SET cudf-exports) From 3801e7496914dec453f0d3cb49aef7c60ab636aa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 11 Dec 2024 12:18:49 -0800 Subject: [PATCH 60/78] Replace direct `cudaMemcpyAsync` calls with utility functions (within `/include`) (#17557) Replaced the calls to `cudaMemcpyAsync` with the new `cuda_memcpy`/`cuda_memcpy_async` utility, which optionally avoids using the copy engine. Also took the opportunity to use `cudf::detail::host_vector` and its factories to enable wider pinned memory use. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17557 --- cpp/include/cudf/detail/get_value.cuh | 11 +++++------ cpp/include/cudf/table/table_device_view.cuh | 10 +++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 16d532ea2b8..4f6238b5fe7 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -16,6 +16,8 @@ #pragma once #include +#include +#include #include #include #include @@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st // A buffer of CPU memory is allocated to hold the ColumnDeviceView // objects. Once filled, the CPU memory is then copied to device memory // and the pointer is set in the d_columns member. - std::vector h_buffer(padded_views_size_bytes); + auto h_buffer = cudf::detail::make_host_vector(padded_views_size_bytes, stream); // Each ColumnDeviceView instance may have child objects which may // require setting some internal device pointers before being copied // from CPU to device. 
@@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   auto d_columns = detail::child_columns_to_device_array(
     source_view.begin(), source_view.end(), h_ptr, d_ptr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
+  auto const h_span = host_span{h_buffer}.subspan(
+    static_cast(h_ptr) - h_buffer.data(), views_size_bytes);
+  auto const d_span = device_span{static_cast(d_ptr), views_size_bytes};
+  cudf::detail::cuda_memcpy(d_span, h_span, stream);
 
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
 
From 63c5a384f29050ee50e4a2ab0681fceeab5cd3ec Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 11 Dec 2024 17:32:29 -0500
Subject: [PATCH 61/78] Fix some possible thread-id overflow calculations (#17473)

Fixes some thread-id calculations and usages that may overflow the
`int32` or `size_type` types.
Reference #10368

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/17473
---
 cpp/include/cudf/detail/copy_if_else.cuh         | 11 ++++++-----
 cpp/src/partitioning/partitioning.cu             |  7 ++++---
 cpp/src/quantiles/tdigest/tdigest_aggregation.cu |  2 +-
 cpp/src/transform/jit/kernel.cu                  |  5 +++--
 cpp/src/transform/row_bit_count.cu               |  2 +-
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 5dc75b1a3fb..a7efb4e6e93 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL
   mutable_column_device_view out,
   size_type* __restrict__ const valid_count)
 {
-  auto tidx = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
-  int const warp_id = tidx / cudf::detail::warp_size;
-  size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;
+  auto tidx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto const warp_id = tidx / cudf::detail::warp_size;
+  auto const warps_per_grid = stride / cudf::detail::warp_size;
 
   // begin/end indices for the column data
   size_type const begin = 0;
@@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
 
   // lane id within the current warp
   constexpr size_type leader_lane{0};
-  int const lane_id = threadIdx.x % cudf::detail::warp_size;
+  auto const lane_id = threadIdx.x % cudf::detail::warp_size;
 
   size_type warp_valid_count{0};
 
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index ebab3beb08f..d6b85db3f0f 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -138,7 +138,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher,
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
-  size_type partition_number = threadIdx.x;
+  thread_index_type partition_number = threadIdx.x;
   while (partition_number < num_partitions) {
     shared_partition_sizes[partition_number] = 0;
     partition_number += blockDim.x;
@@ -207,7 +207,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit
   extern __shared__ size_type shared_partition_offsets[];
 
   // Initialize array of this
blocks offsets from global array
-  size_type partition_number = threadIdx.x;
+  thread_index_type partition_number = threadIdx.x;
   while (partition_number < num_partitions) {
     shared_partition_offsets[partition_number] =
       block_partition_offsets[partition_number * gridDim.x + blockIdx.x];
@@ -303,7 +303,8 @@ CUDF_KERNEL void copy_block_partitions(InputIter input_iter,
   // Fetch the offset in the output buffer of each partition in this thread
   // block
-  for (size_type ipartition = threadIdx.x; ipartition < num_partitions; ipartition += blockDim.x) {
+  for (thread_index_type ipartition = threadIdx.x; ipartition < num_partitions;
+       ipartition += blockDim.x) {
     partition_offset_global[ipartition] =
       scanned_block_partition_sizes[ipartition * gridDim.x + blockIdx.x];
   }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index d27420658d6..2128bacff80 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -385,7 +385,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
                                                 size_type const* group_cluster_offsets,
                                                 bool has_nulls)
 {
-  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
 
   auto const group_index = tid;
   if (group_index >= num_groups) { return; }
diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu
index 4fd0369c26b..9d96c11c3f2 100644
--- a/cpp/src/transform/jit/kernel.cu
+++ b/cpp/src/transform/jit/kernel.cu
@@ -38,8 +38,9 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data
 {
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
-  thread_index_type const start = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const block_size = static_cast(blockDim.x);
+  thread_index_type const start = threadIdx.x + blockIdx.x * block_size;
+  thread_index_type const stride = block_size * gridDim.x;
 
   for (auto i = start; i < static_cast(size); i += stride) {
     GENERIC_UNARY_OP(&out_data[i], in_data[i]);
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 66bbe532e46..39c11295fbd 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -413,7 +413,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span col
                                        size_type max_branch_depth)
 {
   extern __shared__ row_span thread_branch_stacks[];
-  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = static_cast(cudf::detail::grid_1d::global_thread_id());
 
   auto const num_segments = static_cast(output.size());
   if (tid >= num_segments) { return; }

From 32548b074bc0350186906c223980acac142ba5a2 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 11 Dec 2024 16:34:28 -0800
Subject: [PATCH 62/78] Expose Scalar's constructor and `Scalar#getScalarHandle()` to public (#17580)

This exposes the constructor and the `getScalarHandle()` method in
`Scalar.java` to the public, allowing them to be called from outside the
package. Without access to these methods, wrapping or unwrapping a native
scalar handle from downstream code was very inconvenient. A workaround has
been implemented
([spark-rapids-jni/CudfAccessor.java](https://github.com/NVIDIA/spark-rapids-jni/blob/5231d4d82603d488b95ea259874a26f9f4354005/src/main/java/ai/rapids/cudf/CudfAccessor.java#L21))
to overcome this, but it is better to address the issue at its root.
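With these members public, downstream JNI code can wrap and unwrap native
scalars directly. A minimal sketch of the intended usage; the native producer
function `nativeComputeScalar()` here is hypothetical and stands in for
whatever native code allocates the scalar:

```java
import ai.rapids.cudf.DType;
import ai.rapids.cudf.Scalar;

class ScalarHandleExample {
  // Hypothetical JNI entry point: returns the address of a freshly
  // allocated libcudf scalar as a long.
  private static native long nativeComputeScalar();

  static void demo() {
    long handle = nativeComputeScalar();
    // Wrap the raw handle; the Scalar takes ownership and releases the
    // native memory when closed.
    try (Scalar s = new Scalar(DType.INT32, handle)) {
      long addr = s.getScalarHandle(); // pass back into other native calls
      System.out.println("scalar handle: " + addr);
    }
  }
}
```

The try-with-resources pattern matches how `Scalar` is used elsewhere in the
Java bindings, since the constructor registers the handle with `MemoryCleaner`
and `close()` decrements the reference count.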
Partially contributes to https://github.com/NVIDIA/spark-rapids-jni/issues/1307. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/17580 --- java/src/main/java/ai/rapids/cudf/Scalar.java | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 286b5c208c9..f3155bc5860 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -521,13 +521,28 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host private static native long makeStructScalar(long[] viewHandles, boolean isValid); private static native long repeatString(long scalarHandle, int repeatTimes); - Scalar(DType type, long scalarHandle) { + /** + * Constructor to create a scalar from a native handle and a type. + * + * @param type The type of the scalar + * @param scalarHandle The native handle (pointer address) to the scalar data + */ + public Scalar(DType type, long scalarHandle) { this.type = type; this.offHeap = new OffHeapState(scalarHandle); MemoryCleaner.register(this, offHeap); incRefCount(); } + /** + * Get the native handle (native pointer address) for the scalar. + * + * @return The native handle + */ + public long getScalarHandle() { + return offHeap.scalarHandle; + } + /** * Increment the reference count for this scalar. You need to call close on this * to decrement the reference count again. @@ -542,10 +557,6 @@ public synchronized Scalar incRefCount() { return this; } - long getScalarHandle() { - return offHeap.scalarHandle; - } - /** * Free the memory associated with a scalar. */ From 78e5c0d6c5a5c876421d1ab2308b14f8c7ecb9f7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 11 Dec 2024 17:53:36 -0800 Subject: [PATCH 63/78] Use batched memcpy when writing ORC statistics (#17572) This PR replaces a set of per-column, per-rowgroup D2D memcopies with a single call to the `batched_memcpy_async` utility. Should improve performance when writing wide tables. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17572 --- cpp/src/io/orc/writer_impl.cu | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 0906017ee61..8e532b01788 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -1386,29 +1387,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, // we know the size of each array. The number of stripes per column in a chunk array can // be calculated by dividing the number of chunks by the number of columns. // That many chunks need to be copied at a time to the proper destination. 
- size_t num_entries_seen = 0; + size_t num_entries_seen = 0; + auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2; + auto h_srcs = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_dsts = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_lens = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns; - auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk); - auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group); for (size_t col = 0; col < num_columns; ++col) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col, - chunk_bytes, - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY( - cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col, - merge_bytes, - cudaMemcpyDefault, - stream.value())); + h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col); + h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_chunk)); + + h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col); + h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group)); } num_entries_seen += stripes_per_col; } + auto const& mr = cudf::get_current_device_resource_ref(); + auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr); + auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr); + auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr); + cudf::detail::batched_memcpy_async( + d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream); + auto file_stats_merge = cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { From 00ed1f27df491d11c82d4990c979b0c2783c5881 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 11 Dec 2024 23:41:11 -0600 Subject: [PATCH 64/78] Remove unused code of json schema in JSON reader (#17581) Remove dead code in json reader Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17581 --- cpp/src/io/json/nested_json.hpp | 11 --- cpp/src/io/json/parser_features.cpp | 116 ---------------------------- 2 files changed, 127 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 2f6942fe139..cc5f256ea80 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -464,17 +464,6 @@ std::unique_ptr make_all_nulls_column(schema_element const& schema, */ column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name); -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the column - * @param options json reader options which holds schema - * @return data type of the 
column if present - */ -std::optional get_path_data_type( - host_span const> path, - cudf::io::json_reader_options const& options); - /** * @brief Helper class to get path of a column by column id from reduced column tree * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 2da320b2af3..4b4827ca8d9 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -68,78 +68,6 @@ void json_reader_options::set_dtypes(schema_element types) } // namespace cudf::io namespace cudf::io::json::detail { -namespace { - -// example schema and its path. -// "a": int {"a", int} -// "a": [ int ] {"a", list}, {"element", int} -// "a": { "b": int} {"a", struct}, {"b", int} -// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} -// "a": [ null] {"a", list}, {"element", str} -// back() is root. -// front() is leaf. -/** - * @brief Get the path data type of a column by path if present in input schema - * - * @param path path of the json column - * @param root root of input schema element - * @return data type of the column if present, otherwise std::nullopt - */ -std::optional get_path_data_type( - host_span const> path, schema_element const& root) -{ - if (path.empty() || path.size() == 1) { - return root.type; - } else { - if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { - auto const child_name = path.first(path.size() - 1).back().first; - auto const child_schema_it = root.child_types.find(child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { - auto const child_schema_it = root.child_types.find(list_child_name); - return (child_schema_it != std::end(root.child_types)) - ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) - : std::optional{}; - } - return std::optional{}; - } -} - -std::optional child_schema_element(std::string const& col_name, - cudf::io::json_reader_options const& options) -{ - return std::visit( - cudf::detail::visitor_overload{ - [col_name](std::vector const& user_dtypes) -> std::optional { - auto column_index = atol(col_name.data()); - return (static_cast(column_index) < user_dtypes.size()) - ? std::optional{{user_dtypes[column_index]}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? std::optional{{user_dtypes.find(col_name)->second}} - : std::optional{}; - }, - [col_name]( - std::map const& user_dtypes) -> std::optional { - return (user_dtypes.find(col_name) != std::end(user_dtypes)) - ? user_dtypes.find(col_name)->second - : std::optional{}; - }, - [col_name](schema_element const& user_dtypes) -> std::optional { - return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types)) - ? 
user_dtypes.child_types.find(col_name)->second
-               : std::optional{};
-      }},
-    options.get_dtypes());
-}
-
-} // namespace
-
 /// Created an empty column of the specified schema
 struct empty_column_functor {
   rmm::cuda_stream_view stream;
@@ -311,48 +239,4 @@ column_name_info make_column_name_info(schema_element const& schema, std::string
   }
   return info;
 }
-
-std::optional get_path_data_type(
-  host_span const> path,
-  cudf::io::json_reader_options const& options)
-{
-  if (path.empty()) return {};
-  std::optional col_schema = child_schema_element(path.back().first, options);
-  // check if it has value, then do recursive call and return.
-  if (col_schema.has_value()) {
-    return get_path_data_type(path, col_schema.value());
-  } else {
-    return {};
-  }
-}
-
-// idea: write a memoizer using template and lambda?, then call recursively.
-std::vector path_from_tree::get_path(NodeIndexT this_col_id)
-{
-  std::vector path;
-  // stops at root.
-  while (this_col_id != parent_node_sentinel) {
-    auto type = column_categories[this_col_id];
-    std::string name = "";
-    // code same as name_and_parent_index lambda.
-    auto parent_col_id = column_parent_ids[this_col_id];
-    if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
-      if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
-        name = column_names[this_col_id];
-      } else {
-        name = list_child_name;
-      }
-    } else if (column_categories[parent_col_id] == NC_FN) {
-      auto field_name_col_id = parent_col_id;
-      parent_col_id = column_parent_ids[parent_col_id];
-      name = column_names[field_name_col_id];
-    }
-    // "name": type/schema
-    path.emplace_back(name, type);
-    this_col_id = parent_col_id;
-    if (this_col_id == row_array_parent_col_id) return path;
-  }
-  return {};
-}
-
 } // namespace cudf::io::json::detail

From 98d98560ff3f5cdf3c6e72243d914ef87fcd4753 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 12 Dec 2024 08:34:47 -0500
Subject: [PATCH 65/78] Add anonymous namespace to libcudf test source (#17529)

Uses anonymous namespace declarations on internal-only functions and
structures in the libcudf gtest source.
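As a minimal sketch of the pattern (mirroring the hunks below, e.g. the
`odds_valid` functor in `valid_if_tests.cu`), a file-local test helper moves
from namespace scope into an unnamed namespace:

```cpp
#include <cudf/types.hpp>

// Before: the functor had external linkage, so an identically named helper
// in another test translation unit could clash when device code is compiled
// and linked:
//
// struct odds_valid {
//   __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; }
// };

// After: the unnamed namespace gives the type internal linkage, keeping the
// name private to this translation unit.
namespace {
struct odds_valid {
  __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; }
};
}  // namespace
```
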
This helps prevent odd nvcc compile errors like the one described in #17432 Closes #17432 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17529 --- cpp/tests/bitmask/set_nullmask_tests.cu | 9 ++------- cpp/tests/bitmask/valid_if_tests.cu | 2 ++ cpp/tests/column/bit_cast_test.cpp | 4 ++++ cpp/tests/column/compound_test.cu | 2 ++ cpp/tests/device_atomics/device_atomics_test.cu | 2 ++ cpp/tests/fixed_point/fixed_point_tests.cpp | 2 ++ cpp/tests/fixed_point/fixed_point_tests.cu | 2 ++ cpp/tests/groupby/tdigest_tests.cu | 4 ++++ cpp/tests/interop/dlpack_test.cpp | 2 ++ cpp/tests/io/json/json_tree.cpp | 4 ++-- cpp/tests/io/json/json_tree_csr.cu | 3 +++ cpp/tests/io/parquet_chunked_reader_test.cu | 10 ++++++++++ cpp/tests/iterator/optional_iterator_test_numeric.cu | 12 ++---------- cpp/tests/iterator/pair_iterator_test_numeric.cu | 12 ++---------- cpp/tests/quantiles/percentile_approx_test.cpp | 2 ++ cpp/tests/reductions/tdigest_tests.cu | 4 +++- cpp/tests/streams/interop_test.cpp | 2 ++ cpp/tests/transform/row_bit_count_test.cu | 2 ++ cpp/tests/wrappers/timestamps_test.cu | 4 ++++ 19 files changed, 54 insertions(+), 30 deletions(-) diff --git a/cpp/tests/bitmask/set_nullmask_tests.cu b/cpp/tests/bitmask/set_nullmask_tests.cu index e95c9fb41c6..9f8d22ea94d 100644 --- a/cpp/tests/bitmask/set_nullmask_tests.cu +++ b/cpp/tests/bitmask/set_nullmask_tests.cu @@ -31,6 +31,7 @@ #include #include +namespace { struct valid_bit_functor { cudf::bitmask_type const* _null_mask; __device__ bool operator()(cudf::size_type element_index) const noexcept @@ -38,13 +39,7 @@ struct valid_bit_functor { return cudf::bit_is_set(_null_mask, element_index); } }; - -std::ostream& operator<<(std::ostream& stream, thrust::host_vector const& bits) -{ - for (auto _bit : bits) - stream << int(_bit); - return stream; -} +} // namespace struct SetBitmaskTest : public cudf::test::BaseFixture { void expect_bitmask_equal(cudf::bitmask_type const* bitmask, // Device Ptr diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu index 96f122f21a8..8ffcc552ecb 100644 --- a/cpp/tests/bitmask/valid_if_tests.cu +++ b/cpp/tests/bitmask/valid_if_tests.cu @@ -28,6 +28,7 @@ struct ValidIfTest : public cudf::test::BaseFixture {}; +namespace { struct odds_valid { __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; } }; @@ -37,6 +38,7 @@ struct all_valid { struct all_null { __host__ __device__ bool operator()(cudf::size_type i) { return false; } }; +} // namespace TEST_F(ValidIfTest, EmptyRange) { diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index 5570a7d498c..1f29ea9e5fc 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -25,6 +25,7 @@ #include +namespace { template struct rep_type_impl { using type = void; @@ -47,12 +48,14 @@ struct rep_type_impl()>> { template using rep_type_t = typename rep_type_impl::type; +} // namespace template struct ColumnViewAllTypesTests : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ColumnViewAllTypesTests, cudf::test::FixedWidthTypes); +namespace { template void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator end) { @@ -102,6 +105,7 @@ void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator } } } +} // namespace TYPED_TEST(ColumnViewAllTypesTests, BitCast) { diff --git 
a/cpp/tests/column/compound_test.cu b/cpp/tests/column/compound_test.cu index d7e93fb22a3..fff3282fdd5 100644 --- a/cpp/tests/column/compound_test.cu +++ b/cpp/tests/column/compound_test.cu @@ -34,6 +34,7 @@ struct CompoundColumnTest : public cudf::test::BaseFixture {}; +namespace { template struct checker_for_level1 { ColumnDeviceView d_column; @@ -62,6 +63,7 @@ struct checker_for_level2 { return bcheck; } }; +} // namespace TEST_F(CompoundColumnTest, ChildrenLevel1) { diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index b81f8196d89..2fb24f6b31e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -31,6 +31,7 @@ #include +namespace { template CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size) { @@ -109,6 +110,7 @@ std::enable_if_t(), T> accumulate(cudf::host_span xs.begin(), xs.end(), ys.begin(), [](T const& ts) { return ts.time_since_epoch().count(); }); return T{typename T::duration{std::accumulate(ys.begin(), ys.end(), 0)}}; } +} // namespace template struct AtomicsTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index b96c6909e55..f8f8d525043 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -577,10 +577,12 @@ TEST_F(FixedPointTest, Decimal32FloatVector) float_vector_test(0.15, 20, -2, std::multiplies<>()); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index f34760341d8..ddc48c97012 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -72,10 +72,12 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust) EXPECT_EQ(vec2, vec3); } +namespace { struct cast_to_int32_fn { using decimal32 = fixed_point; int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast(fp); } }; +} // namespace TEST_F(FixedPointTest, DecimalXXThrustOnDevice) { diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 4ae5d06b214..883a5093bd1 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -30,6 +30,7 @@ #include #include +namespace { /** * @brief Functor to generate a tdigest by key. 
* @@ -116,6 +117,7 @@ struct tdigest_groupby_simple_merge_op { return std::move(result.second[0].results[0]); } }; +} // namespace template struct TDigestAllTypes : public cudf::test::BaseFixture {}; @@ -508,6 +510,7 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } +namespace { std::unique_ptr do_agg( cudf::column_view key, cudf::column_view val, @@ -537,6 +540,7 @@ std::unique_ptr do_agg( return std::make_unique(std::move(result_columns)); } +} // namespace TEST_F(TDigestMergeTest, AllValuesAreNull) { diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index ef4b9dd9b8a..b7106e823dd 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -26,6 +26,7 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; @@ -60,6 +61,7 @@ void validate_dtype(DLDataType const& dtype) EXPECT_EQ(1, dtype.lanes); EXPECT_EQ(sizeof(T) * 8, dtype.bits); } +} // namespace class DLPackUntypedTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 887d4fa783f..5201a46ba7d 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -34,6 +34,8 @@ namespace cuio_json = cudf::io::json; +namespace { + // Host copy of tree_meta_t struct tree_meta_t2 { std::vector node_categories; @@ -43,8 +45,6 @@ struct tree_meta_t2 { std::vector node_range_end; }; -namespace { - tree_meta_t2 to_cpu_tree(cuio_json::tree_meta_t const& d_value, rmm::cuda_stream_view stream) { return {cudf::detail::make_std_vector_async(d_value.node_categories, stream), diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu index f988ae24b38..a67830a7864 100644 --- a/cpp/tests/io/json/json_tree_csr.cu +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -36,6 +36,8 @@ namespace cuio_json = cudf::io::json; +namespace { + struct h_tree_meta_t { std::vector node_categories; std::vector parent_node_ids; @@ -222,6 +224,7 @@ void run_test(std::string const& input, bool enable_lines = true) // assert equality between csr and meta formats ASSERT_TRUE(iseq); } +} // namespace struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index 153a8a0c5aa..369376b6c95 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -1074,6 +1074,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount) } while (reader.has_next()); } +namespace { constexpr size_t input_limit_expected_file_count = 4; std::vector input_limit_get_test_names(std::string const& base_filename) @@ -1133,6 +1134,7 @@ void input_limit_test_read(std::vector const& test_filenames, CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t); } } +} // namespace struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {}; @@ -1189,6 +1191,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns) struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {}; +namespace { struct offset_gen { int const group_size; __device__ int operator()(int i) { return i * group_size; } @@ -1198,6 +1201,8 @@ template struct value_gen { __device__ T operator()(int i) { return i % 1024; } }; +} // namespace + TEST_F(ParquetChunkedReaderInputLimitTest, List) { auto base_path = 
temp_env->get_temp_filepath("list"); @@ -1263,6 +1268,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List) input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c); } +namespace { void tiny_list_rowgroup_test(bool just_list_col) { auto iter = thrust::make_counting_iterator(0); @@ -1320,6 +1326,7 @@ void tiny_list_rowgroup_test(bool just_list_col) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *(result.first)); } +} // namespace TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsSingle) { @@ -1333,6 +1340,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsMixed) tiny_list_rowgroup_test(false); } +namespace { struct char_values { __device__ int8_t operator()(int i) { @@ -1341,6 +1349,8 @@ struct char_values { return index == 0 ? 'a' : (index == 1 ? 'b' : 'c'); } }; +} // namespace + TEST_F(ParquetChunkedReaderInputLimitTest, Mixed) { auto base_path = temp_env->get_temp_filepath("mixed_types"); diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 257c0979017..8377060b6ec 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -26,16 +26,6 @@ using TestingTypes = cudf::test::NumericTypes; -namespace cudf { -// To print meanvar for debug. -// Needs to be in the cudf namespace for ADL -template -std::ostream& operator<<(std::ostream& os, cudf::meanvar const& rhs) -{ - return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] "; -}; -} // namespace cudf - template struct NumericOptionalIteratorTest : public IteratorTest {}; @@ -46,6 +36,7 @@ TYPED_TEST(NumericOptionalIteratorTest, nonull_optional_iterator) } TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_iterator(*this); } +namespace { // Transformers and Operators for optional_iterator test template struct transformer_optional_meanvar { @@ -65,6 +56,7 @@ template struct optional_to_meanvar { CUDF_HOST_DEVICE inline T operator()(cuda::std::optional const& v) { return v.value_or(T{0}); } }; +} // namespace // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu index 3447aa0dde6..5f707232953 100644 --- a/cpp/tests/iterator/pair_iterator_test_numeric.cu +++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu @@ -24,16 +24,6 @@ using TestingTypes = cudf::test::NumericTypes; -namespace cudf { -// To print meanvar for debug. -// Needs to be in the cudf namespace for ADL -template -std::ostream& operator<<(std::ostream& os, cudf::meanvar const& rhs) -{ - return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] "; -}; -} // namespace cudf - template struct NumericPairIteratorTest : public IteratorTest {}; @@ -53,6 +43,7 @@ struct transformer_pair_meanvar { }; }; +namespace { struct sum_if_not_null { template CUDF_HOST_DEVICE inline thrust::pair operator()(thrust::pair const& lhs, @@ -66,6 +57,7 @@ struct sum_if_not_null { return {rhs}; } }; +} // namespace // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. 
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 37414eb3fba..c146fd2ea4e 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -33,6 +33,7 @@ #include +namespace { std::unique_ptr arrow_percentile_approx(cudf::column_view const& _values, int delta, std::vector const& percentages) @@ -315,6 +316,7 @@ cudf::data_type get_appropriate_type() if constexpr (cudf::is_fixed_point()) { return cudf::data_type{cudf::type_to_id(), -7}; } return cudf::data_type{cudf::type_to_id()}; } +} // namespace using PercentileApproxTypes = cudf::test::Concat; diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu index c8fec51e1c9..184725e17e0 100644 --- a/cpp/tests/reductions/tdigest_tests.cu +++ b/cpp/tests/reductions/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ template struct ReductionTDigestAllTypes : public cudf::test::BaseFixture {}; TYPED_TEST_SUITE(ReductionTDigestAllTypes, cudf::test::NumericTypes); +namespace { struct reduce_op { std::unique_ptr operator()(cudf::column_view const& values, int delta) const { @@ -60,6 +61,7 @@ struct reduce_merge_op { return cudf::make_structs_column(tbl.num_rows(), std::move(cols), 0, rmm::device_buffer()); } }; +} // namespace TYPED_TEST(ReductionTDigestAllTypes, Simple) { diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp index 7133baf6df1..79ea6b7d6d4 100644 --- a/cpp/tests/streams/interop_test.cpp +++ b/cpp/tests/streams/interop_test.cpp @@ -23,9 +23,11 @@ #include +namespace { struct dlpack_deleter { void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); } }; +} // namespace struct DLPackTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 01a042130d6..7e203086fca 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -590,6 +590,7 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists) cudf::test::fixed_width_column_wrapper{32, 32, 32, 32}); } +namespace { struct sum_functor { cudf::size_type const* s0; cudf::size_type const* s1; @@ -597,6 +598,7 @@ struct sum_functor { cudf::size_type operator() __device__(int i) { return s0[i] + s1[i] + s2[i]; } }; +} // namespace TEST_F(RowBitCount, Table) { diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index 4086c5a91bb..8e5129dfbd2 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -37,6 +37,7 @@ #include #include +namespace { template struct ChronoColumnTest : public cudf::test::BaseFixture { cudf::size_type size() { return cudf::size_type(100); } @@ -72,6 +73,7 @@ struct compare_chrono_elements_to_primitive_representation { return primitive == dur.count(); } }; +} // namespace TYPED_TEST_SUITE(ChronoColumnTest, cudf::test::ChronoTypes); @@ -103,6 +105,7 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) *cudf::column_device_view::create(chrono_col)})); } +namespace { template struct compare_chrono_elements { cudf::binary_operator comp; @@ -129,6 +132,7 @@ struct compare_chrono_elements { } } }; +} // namespace 
TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) { From 92652be87839e4a4e49216c49bd36860674bff6a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:17:28 -0800 Subject: [PATCH 66/78] Remove cudf._lib.parquet in favor of inlining pylibcudf (#17562) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17562 --- python/cudf/cudf/_lib/CMakeLists.txt | 5 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/io/CMakeLists.txt | 21 - python/cudf/cudf/_lib/io/__init__.pxd | 0 python/cudf/cudf/_lib/io/__init__.py | 0 python/cudf/cudf/_lib/io/utils.pxd | 31 - python/cudf/cudf/_lib/io/utils.pyx | 74 -- python/cudf/cudf/_lib/parquet.pyx | 817 ------------------- python/cudf/cudf/io/parquet.py | 992 +++++++++++++++++++++--- python/cudf/cudf/tests/test_parquet.py | 72 +- python/cudf/cudf/utils/ioutils.py | 1 - 11 files changed, 941 insertions(+), 1073 deletions(-) delete mode 100644 python/cudf/cudf/_lib/io/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/io/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/io/__init__.py delete mode 100644 python/cudf/cudf/_lib/io/utils.pxd delete mode 100644 python/cudf/cudf/_lib/io/utils.pyx delete mode 100644 python/cudf/cudf/_lib/parquet.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index efe96ff6c3e..f422635d22a 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx parquet.pyx reduce.pyx scalar.pyx - sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx + column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx + stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) @@ -31,5 +31,4 @@ include(${rapids-cmake-dir}/export/find_package_root.cmake) include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(interop PUBLIC nanoarrow) -add_subdirectory(io) add_subdirectory(nvtext) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 52e9b89da7b..cfdcec4cd3b 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -7,7 +7,6 @@ groupby, interop, nvtext, - parquet, reduce, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt deleted file mode 100644 index e7408cf2852..00000000000 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. 
See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources utils.pyx) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/io/__init__.pxd b/python/cudf/cudf/_lib/io/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/__init__.py b/python/cudf/cudf/_lib/io/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd deleted file mode 100644 index 9b8bab012e2..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, - source_info, -) - -from cudf._lib.column cimport Column - - -cdef add_df_col_struct_names( - df, - child_names_dict -) -cdef update_col_struct_field_names( - Column col, - child_names -) -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -) -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx deleted file mode 100644 index df4675be599..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- - -from libcpp.string cimport string - -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.types cimport column_name_info - -from cudf._lib.column cimport Column - -from cudf.core.dtypes import StructDtype - -cdef add_df_col_struct_names(df, child_names_dict): - for name, child_names in child_names_dict.items(): - col = df._data[name] - - df._data[name] = update_col_struct_field_names(col, child_names) - - -cdef update_col_struct_field_names(Column col, child_names): - if col.children: - children = list(col.children) - for i, (child, names) in enumerate(zip(children, child_names.values())): - children[i] = update_col_struct_field_names( - child, - names - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - col = col._rename_fields( - child_names.keys() - ) - - return col - - -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -): - # Deprecated, remove in favor of add_col_struct_names - # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._column_labels_and_values): - table._data[name] = update_column_struct_field_names( - col, schema_info[i] - ) - - -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -): - cdef vector[string] field_names - - if col.children: - children = list(col.children) - for i, child in enumerate(children): - children[i] = update_column_struct_field_names( - child, - info.children[i] - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - field_names.reserve(len(col.base_children)) - for i in range(info.children.size()): - field_names.push_back(info.children[i].name) - col = col._rename_fields( - field_names - ) - - return col diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx deleted file mode 100644 index 00c434ae374..00000000000 --- a/python/cudf/cudf/_lib/parquet.pyx +++ /dev/null @@ -1,817 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -import io - -import pyarrow as pa -import itertools -import cudf -from cudf.core.buffer import acquire_spill_lock - -try: - import ujson as json -except ImportError: - import json - -import numpy as np - -from cudf.api.types import is_list_like - -from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io - -from cudf._lib.utils import _index_level_name, generate_pandas_metadata - -from libc.stdint cimport int64_t -from libcpp cimport bool - -from pylibcudf.expressions cimport Expression -from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.types cimport ( - statistics_freq, - compression_type, - dictionary_policy, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - add_df_col_struct_names, -) - -import pylibcudf as plc - -from pylibcudf cimport Table - -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT -from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata -from pylibcudf.io.parquet cimport ParquetChunkedWriter - - -def _parse_metadata(meta): - file_is_range_index = False - file_index_cols = None - file_column_dtype = None - - if 'index_columns' in meta and len(meta['index_columns']) > 0: - file_index_cols = meta['index_columns'] - - if isinstance(file_index_cols[0], dict) and \ - file_index_cols[0]['kind'] == 'range': - file_is_range_index = True - if 'column_indexes' in meta and len(meta['column_indexes']) == 1: - file_column_dtype = meta['column_indexes'][0]["numpy_type"] - return file_is_range_index, file_index_cols, file_column_dtype - - -cdef object _process_metadata(object df, - list names, - dict child_names, - list per_file_user_data, - object row_groups, - object filepaths_or_buffers, - bool allow_range_index, - bool use_pandas_metadata, - size_type nrows=-1, - int64_t skip_rows=0, - ): - - add_df_col_struct_names(df, child_names) - index_col = None - is_range_index = True - column_index_type = None - index_col_names = None - meta = None - for single_file in per_file_user_data: - if b'pandas' not in single_file: - continue - json_str = single_file[b'pandas'].decode('utf-8') - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] - - if meta is not None: - # Book keep each column metadata as the order - # of `meta["columns"]` and `column_names` are not - # guaranteed to be deterministic and same always. 
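The `_parse_metadata` helper being deleted above (and re-added in pure Python later in this patch) inspects the "pandas" key-value metadata that pandas and cudf write into parquet footers. A toy sketch of the shape it expects, with illustrative field values only:

    # Roughly the "pandas" footer metadata for a frame with a RangeIndex
    # and one column named "a" (all values here are illustrative).
    meta = {
        "index_columns": [
            {"kind": "range", "name": None, "start": 0, "stop": 3, "step": 1}
        ],
        "column_indexes": [{"numpy_type": "object"}],
        "columns": [{"name": "a", "field_name": "a"}],
    }
    # _parse_metadata (defined in this patch) reports
    # (is_range_index, index_columns, column_dtype):
    assert _parse_metadata(meta) == (True, meta["index_columns"], "object")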
- meta_data_per_column = { - col_meta['name']: col_meta for col_meta in meta["columns"] - } - - # update the decimal precision of each column - for col in names: - if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): - df._data[col].dtype.precision = ( - meta_data_per_column[col]["metadata"]["precision"] - ) - - # Set the index column - if index_col is not None and len(index_col) > 0: - if is_range_index: - if not allow_range_index: - return df - - if len(per_file_user_data) > 1: - range_index_meta = { - "kind": "range", - "name": None, - "start": 0, - "stop": len(df), - "step": 1 - } - else: - range_index_meta = index_col[0] - - if row_groups is not None: - per_file_metadata = [ - pa.parquet.read_metadata( - # Pyarrow cannot read directly from bytes - io.BytesIO(s) if isinstance(s, bytes) else s - ) for s in filepaths_or_buffers - ] - - filtered_idx = [] - for i, file_meta in enumerate(per_file_metadata): - row_groups_i = [] - start = 0 - for row_group in range(file_meta.num_row_groups): - stop = start + file_meta.row_group(row_group).num_rows - row_groups_i.append((start, stop)) - start = stop - - for rg in row_groups[i]: - filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] - ) - ) - - if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) - else: - idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) - else: - start = range_index_meta["start"] + skip_rows - stop = range_index_meta["stop"] - if nrows > -1: - stop = start + nrows - idx = cudf.RangeIndex( - start=start, - stop=stop, - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - - df._index = idx - elif set(index_col).issubset(names): - index_data = df[index_col] - actual_index_names = iter(index_col_names.values()) - if index_data._num_columns == 1: - idx = cudf.Index._from_column( - index_data._columns[0], - name=next(actual_index_names) - ) - else: - idx = cudf.MultiIndex.from_frame( - index_data, - names=list(actual_index_names) - ) - df.drop(columns=index_col, inplace=True) - df._index = idx - else: - if use_pandas_metadata: - df.index.names = index_col - - if df._num_columns == 0 and column_index_type is not None: - df._data.label_dtype = cudf.dtype(column_index_type) - - return df - - -def read_parquet_chunked( - filepaths_or_buffers, - columns=None, - row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False -): - # Note: If this function ever takes accepts filters - # allow_range_index needs to be False when a filter is passed - # (see read_parquet) - allow_range_index = columns is not None and len(columns) != 0 - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - - reader = ChunkedParquetReader( - options, - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - ) - - tbl_w_meta = reader.read_chunk() - column_names = tbl_w_meta.column_names(include_children=False) - child_names = tbl_w_meta.child_names - per_file_user_data = tbl_w_meta.per_file_user_data - 
concatenated_columns = tbl_w_meta.tbl.columns() - - # save memory - del tbl_w_meta - - cdef Table tbl - while reader.has_next(): - tbl = reader.read_chunk().tbl - - for i in range(tbl.num_columns()): - concatenated_columns[i] = plc.concatenate.concatenate( - [concatenated_columns[i], tbl._columns[i]] - ) - # Drop residual columns to save memory - tbl._columns[i] = None - - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns], - column_names=column_names, - index_names=None - ) - ) - df = _process_metadata(df, column_names, child_names, - per_file_user_data, row_groups, - filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - - -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. - - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - allow_range_index = True - if columns is not None and len(columns) == 0 or filters: - allow_range_index = False - - options = ( - plc.io.parquet.ParquetReaderOptions.builder( - plc.io.SourceInfo(filepaths_or_buffers) - ) - .use_pandas_metadata(use_pandas_metadata) - .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) - .build() - ) - if row_groups is not None: - options.set_row_groups(row_groups) - if nrows > -1: - options.set_num_rows(nrows) - if skip_rows != 0: - options.set_skip_rows(skip_rows) - if columns is not None: - options.set_columns(columns) - if filters is not None: - options.set_filter(filters) - - tbl_w_meta = plc.io.parquet.read_parquet(options) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(tbl_w_meta) - ) - - df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), - tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, - row_groups, filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - -cpdef read_parquet_metadata(list filepaths_or_buffers): - """ - Cython function to call into libcudf API, see `read_parquet_metadata`. 
- - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( - plc.io.SourceInfo(filepaths_or_buffers) - ) - - # read all column names including index column, if any - col_names = [info.name() for info in parquet_metadata.schema().root().children()] - - index_col_names = set() - json_str = parquet_metadata.metadata()['pandas'] - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, _ = _parse_metadata(meta) - if ( - not file_is_range_index - and index_col is not None - ): - columns = meta['columns'] - for idx_col in index_col: - for c in columns: - if c['field_name'] == idx_col: - index_col_names.add(idx_col) - - # remove the index column from the list of column names - # only if index_col_names is not None - if len(index_col_names) >= 0: - col_names = [name for name in col_names if name not in index_col_names] - - return ( - parquet_metadata.num_rows(), - parquet_metadata.num_rowgroups(), - col_names, - len(col_names), - parquet_metadata.rowgroup_metadata() - ) - - -@acquire_spill_lock() -def write_parquet( - table, - object filepaths_or_buffers, - object index=None, - object compression="snappy", - object statistics="ROWGROUP", - object metadata_file_path=None, - object int96_timestamps=False, - object row_group_size_bytes=None, - object row_group_size_rows=None, - object max_page_size_bytes=None, - object max_page_size_rows=None, - object max_dictionary_size=None, - object partitions_info=None, - object force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, - write_arrow_schema=False, -): - """ - Cython function to call into libcudf API, see `write_parquet`. - - See Also - -------- - cudf.io.parquet.write_parquet - """ - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - _index_level_name(idx_name, level, table._column_names) - ) - num_index_cols_meta = len(table._index.names) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - tbl_meta = TableInputMetadata(plc_table) - num_index_cols_meta = 0 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - if not isinstance(name, str): - if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name)) - else: - raise ValueError( - "Writing a Parquet file requires string column names" - ) - else: - tbl_meta.column_metadata[i].set_name(name) - - _set_col_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - force_nullable_schema, - None, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - if partitions_info is not None: - user_data = [ - {"pandas": generate_pandas_metadata( - table.iloc[start_row:start_row + num_row].copy(deep=False), - index - )} - for start_row, num_row in partitions_info - ] - else: - user_data = [{"pandas": generate_pandas_metadata(table, index)}] - - if header_version not in ("1.0", "2.0"): - raise ValueError( - f"Invalid parquet header version: {header_version}. 
" - "Valid values are '1.0' and '2.0'" - ) - - dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - - comp_type = _get_comp_type(compression) - stat_freq = _get_stat_freq(statistics) - options = ( - plc.io.parquet.ParquetWriterOptions.builder( - plc.io.SinkInfo(filepaths_or_buffers), plc_table - ) - .metadata(tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .int96_timestamps(int96_timestamps) - .write_v2_headers(header_version == "2.0") - .dictionary_policy(dict_policy) - .utc_timestamps(False) - .write_arrow_schema(write_arrow_schema) - .build() - ) - if partitions_info is not None: - options.set_partitions( - [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info] - ) - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - options.set_column_chunks_file_paths(metadata_file_path) - else: - options.set_column_chunks_file_paths([metadata_file_path]) - if row_group_size_bytes is not None: - options.set_row_group_size_bytes(row_group_size_bytes) - if row_group_size_rows is not None: - options.set_row_group_size_rows(row_group_size_rows) - if max_page_size_bytes is not None: - options.set_max_page_size_bytes(max_page_size_bytes) - if max_page_size_rows is not None: - options.set_max_page_size_rows(max_page_size_rows) - if max_dictionary_size is not None: - options.set_max_dictionary_size(max_dictionary_size) - blob = plc.io.parquet.write_parquet(options) - if metadata_file_path is not None: - return np.asarray(blob.obj) - else: - return None - - -cdef class ParquetWriter: - """ - ParquetWriter lets you incrementally write out a Parquet file from a series - of cudf tables - - Parameters - ---------- - filepath_or_buffer : str, io.IOBase, os.PathLike, or list - File path or buffer to write to. The argument may also correspond - to a list of file paths or buffers. - index : bool or None, default None - If ``True``, include a dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - row_group_size_bytes: int, default ``uint64 max`` - Maximum size of each stripe of the output. - By default, a virtually infinite size equal to ``uint64 max`` will be used. - row_group_size_rows: int, default 1000000 - Maximum number of rows of each stripe of the output. - By default, 1000000 (10^6 rows) will be used. - max_page_size_bytes: int, default 524288 - Maximum uncompressed size of each page of the output. - By default, 524288 (512KB) will be used. - max_page_size_rows: int, default 20000 - Maximum number of rows of each page of the output. - By default, 20000 will be used. - max_dictionary_size: int, default 1048576 - Maximum size of the dictionary page for each output column chunk. Dictionary - encoding for column chunks that exceeds this limit will be disabled. - By default, 1048576 (1MB) will be used. - use_dictionary : bool, default True - If ``True``, enable dictionary encoding for Parquet page data - subject to ``max_dictionary_size`` constraints. - If ``False``, disable dictionary encoding for Parquet page data. 
- store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. - See Also - -------- - cudf.io.parquet.write_parquet - """ - cdef bool initialized - cdef ParquetChunkedWriter writer - cdef SinkInfo sink - cdef TableInputMetadata tbl_meta - cdef str statistics - cdef object compression - cdef object index - cdef size_t row_group_size_bytes - cdef size_type row_group_size_rows - cdef size_t max_page_size_bytes - cdef size_type max_page_size_rows - cdef size_t max_dictionary_size - cdef bool use_dictionary - cdef bool write_arrow_schema - - def __cinit__(self, object filepath_or_buffer, object index=None, - object compression="snappy", str statistics="ROWGROUP", - size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - size_type row_group_size_rows=1000000, - size_t max_page_size_bytes=524288, - size_type max_page_size_rows=20000, - size_t max_dictionary_size=1048576, - bool use_dictionary=True, - bool store_schema=False): - filepaths_or_buffers = ( - list(filepath_or_buffer) - if is_list_like(filepath_or_buffer) - else [filepath_or_buffer] - ) - self.sink = plc.io.SinkInfo(filepaths_or_buffers) - self.statistics = statistics - self.compression = compression - self.index = index - self.initialized = False - self.row_group_size_bytes = row_group_size_bytes - self.row_group_size_rows = row_group_size_rows - self.max_page_size_bytes = max_page_size_bytes - self.max_page_size_rows = max_page_size_rows - self.max_dictionary_size = max_dictionary_size - self.use_dictionary = use_dictionary - self.write_arrow_schema = store_schema - - def write_table(self, table, object partitions_info=None): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state( - table, - num_partitions=len(partitions_info) if partitions_info else 1 - ) - if self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex)): - columns = [*table.index._columns, *table._columns] - plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns]) - else: - plc_table = plc.Table( - [col.to_pylibcudf(mode="read") for col in table._columns] - ) - self.writer.write(plc_table, partitions_info) - - def close(self, object metadata_file_path=None): - if not self.initialized: - return None - column_chunks_file_paths=[] - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - column_chunks_file_paths = list(metadata_file_path) - else: - column_chunks_file_paths = [metadata_file_path] - blob = self.writer.close(column_chunks_file_paths) - if metadata_file_path is not None: - return np.asarray(blob.obj) - return None - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def _initialize_chunked_state(self, table, num_partitions=1): - """ Prepares all the values required to build the - chunked_parquet_writer_options and creates a writer""" - - # Set the table_metadata - num_index_cols_meta = 0 - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in table._columns - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain(table.index._columns, table._columns) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - for level, idx_name in 
enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name(idx_name) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - plc_table = plc.Table( - [ - col.to_pylibcudf(mode="read") - for col in itertools.chain( - table.index._columns, table._columns - ) - ] - ) - self.tbl_meta = TableInputMetadata(plc_table) - self.tbl_meta.column_metadata[0].set_name(table._index.name) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name) - _set_col_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - ) - - index = ( - False if isinstance(table._index, cudf.RangeIndex) else self.index - ) - user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions - cdef compression_type comp_type = _get_comp_type(self.compression) - cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) - cdef dictionary_policy dict_policy = ( - plc.io.types.DictionaryPolicy.ADAPTIVE - if self.use_dictionary - else plc.io.types.DictionaryPolicy.NEVER - ) - options = ( - plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(user_data) - .compression(comp_type) - .stats_level(stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - options.set_dictionary_policy(dict_policy) - self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) - self.initialized = True - - -cpdef merge_filemetadata(object filemetadata_list): - """ - Cython function to call into libcudf API, see `merge_row_group_metadata`. - - See Also - -------- - cudf.io.parquet.merge_row_group_metadata - """ - return np.asarray( - plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj - ) - - -cdef statistics_freq _get_stat_freq(str statistics): - result = getattr( - plc.io.types.StatisticsFreq, - f"STATISTICS_{statistics.upper()}", - None - ) - if result is None: - raise ValueError("Unsupported `statistics_freq` type") - return result - - -cdef compression_type _get_comp_type(object compression): - if compression is None: - return plc.io.types.CompressionType.NONE - result = getattr( - plc.io.types.CompressionType, - str(compression).upper(), - None - ) - if result is None: - raise ValueError("Unsupported `compression` type") - return result - - -cdef _set_col_metadata( - Column col, - ColumnInMetadata col_meta, - bool force_nullable_schema=False, - str path=None, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, -): - need_path = (skip_compression is not None or column_encoding is not None or - column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name() if need_path else None - full_path = path + "." + name if path is not None else name - - if force_nullable_schema: - # Only set nullability if `force_nullable_schema` - # is true. 
- col_meta.set_nullability(True) - - if skip_compression is not None and full_path in skip_compression: - col_meta.set_skip_compression(True) - - if column_encoding is not None and full_path in column_encoding: - encoding = column_encoding[full_path] - if encoding is None: - c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT - else: - enc = str(encoding).upper() - c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) - if c_encoding is None: - raise ValueError("Unsupported `column_encoding` type") - col_meta.set_encoding(c_encoding) - - if column_type_length is not None and full_path in column_type_length: - col_meta.set_output_as_binary(True) - col_meta.set_type_length(column_type_length[full_path]) - - if output_as_binary is not None and full_path in output_as_binary: - col_meta.set_output_as_binary(True) - - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name) - _set_col_metadata( - child_col, - col_meta.child(i), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.ListDtype): - if full_path is not None: - full_path = full_path + ".list" - col_meta.child(1).set_name("element") - _set_col_metadata( - col.children[1], - col_meta.child(1), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): - col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 2382e9f12ed..66095d4a155 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
from __future__ import annotations +import io import itertools import math import operator @@ -10,23 +11,42 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Literal from uuid import uuid4 import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds +import pylibcudf as plc + import cudf -from cudf._lib import parquet as libparquet +from cudf._lib.column import Column +from cudf._lib.utils import ( + _data_from_columns, + _index_level_name, + data_from_pylibcudf_io, + generate_pandas_metadata, +) from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column, column_empty from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +try: + import ujson as json # type: ignore[import-untyped] +except ImportError: + import json + if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Hashable + + from typing_extensions import Self + + from cudf.core.column import ColumnBase BYTE_SIZES = { @@ -55,31 +75,200 @@ } +@acquire_spill_lock() +def _plc_write_parquet( + table, + filepaths_or_buffers, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, + partitions_info=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = False, +) -> np.ndarray | None: + """ + Cython function to call into libcudf API, see `write_parquet`. 
+ + See Also + -------- + cudf.io.parquet.write_parquet + """ + if index is True or ( + index is None and not isinstance(table.index, cudf.RangeIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + tbl_meta.column_metadata[level].set_name( + _index_level_name(idx_name, level, table._column_names) + ) + num_index_cols_meta = len(table.index.names) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + tbl_meta = plc.io.types.TableInputMetadata(plc_table) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + if cudf.get_option("mode.pandas_compatible"): + tbl_meta.column_metadata[i].set_name(str(name)) + else: + raise ValueError( + "Writing a Parquet file requires string column names" + ) + else: + tbl_meta.column_metadata[i].set_name(name) + + _set_col_metadata( + table[name]._column, + tbl_meta.column_metadata[i], + force_nullable_schema, + None, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + if partitions_info is not None: + user_data = [ + { + "pandas": generate_pandas_metadata( + table.iloc[start_row : start_row + num_row].copy( + deep=False + ), + index, + ) + } + for start_row, num_row in partitions_info + ] + else: + user_data = [{"pandas": generate_pandas_metadata(table, index)}] + + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + + comp_type = _get_comp_type(compression) + stat_freq = _get_stat_freq(statistics) + options = ( + plc.io.parquet.ParquetWriterOptions.builder( + plc.io.SinkInfo(filepaths_or_buffers), plc_table + ) + .metadata(tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .int96_timestamps(int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) + .utc_timestamps(False) + .write_arrow_schema(write_arrow_schema) + .build() + ) + if partitions_info is not None: + options.set_partitions( + [ + plc.io.types.PartitionInfo(part[0], part[1]) + for part in partitions_info + ] + ) + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + options.set_column_chunks_file_paths(metadata_file_path) + else: + options.set_column_chunks_file_paths([metadata_file_path]) + if row_group_size_bytes is not None: + options.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + options.set_row_group_size_rows(row_group_size_rows) + if max_page_size_bytes is not None: + options.set_max_page_size_bytes(max_page_size_bytes) + if max_page_size_rows is not None: + options.set_max_page_size_rows(max_page_size_rows) + if max_dictionary_size is not None: + options.set_max_dictionary_size(max_dictionary_size) + blob = plc.io.parquet.write_parquet(options) + if metadata_file_path is not None: + return np.asarray(blob.obj) + else: + return None + + @_performance_tracking def _write_parquet( df, paths, - compression="snappy", - index=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - 
row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, + max_dictionary_size: int | None = None, partitions_info=None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - write_arrow_schema=True, -): + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, + write_arrow_schema: bool = True, +) -> np.ndarray | None: if is_list_like(paths) and len(paths) > 1: if partitions_info is None: ValueError("partition info is required for multiple paths") @@ -124,11 +313,11 @@ def _write_parquet( file_objs = [ ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs ] - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=file_objs, **common_args ) else: - write_parquet_res = libparquet.write_parquet( + write_parquet_res = _plc_write_parquet( df, filepaths_or_buffers=paths_or_bufs, **common_args ) @@ -141,26 +330,38 @@ def _write_parquet( def write_to_dataset( df, root_path, - compression="snappy", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", filename=None, partition_cols=None, fs=None, - preserve_index=False, - return_metadata=False, - statistics="ROWGROUP", - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, + preserve_index: bool = False, + return_metadata: bool = False, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + int96_timestamps: bool = False, + row_group_size_bytes: int | None = None, + row_group_size_rows: int | None = None, + max_page_size_bytes: int | None = None, + max_page_size_rows: int | None = None, storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, + force_nullable_schema: bool = False, + header_version: Literal["1.0", "2.0"] = "1.0", + use_dictionary: bool = True, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, store_schema=False, ): """Wraps `to_parquet` to write partitioned Parquet datasets. 
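As a usage sketch for the `write_to_dataset` signature above: partitioned writes lay data out hive-style under the root path. The frame, output path, and partition column below are illustrative, assuming a local filesystem:

    import cudf
    from cudf.io.parquet import write_to_dataset

    df = cudf.DataFrame({"year": [2023, 2023, 2024], "val": [1.0, 2.0, 3.0]})
    # Produces dataset_root/year=2023/... and dataset_root/year=2024/...
    write_to_dataset(df, "dataset_root", partition_cols=["year"])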
@@ -330,9 +531,29 @@ def write_to_dataset( return metadata +def _parse_metadata(meta) -> tuple[bool, Any, Any]: + file_is_range_index = False + file_index_cols = None + file_column_dtype = None + + if "index_columns" in meta and len(meta["index_columns"]) > 0: + file_index_cols = meta["index_columns"] + + if ( + isinstance(file_index_cols[0], dict) + and file_index_cols[0]["kind"] == "range" + ): + file_is_range_index = True + if "column_indexes" in meta and len(meta["column_indexes"]) == 1: + file_column_dtype = meta["column_indexes"][0]["numpy_type"] + return file_is_range_index, file_index_cols, file_column_dtype + + @ioutils.doc_read_parquet_metadata() @_performance_tracking -def read_parquet_metadata(filepath_or_buffer): +def read_parquet_metadata( + filepath_or_buffer, +) -> tuple[int, int, list[Hashable], int, list[dict[str, int]]]: """{docstring}""" # List of filepaths or buffers @@ -341,7 +562,39 @@ def read_parquet_metadata(filepath_or_buffer): bytes_per_thread=None, ) - return libparquet.read_parquet_metadata(filepaths_or_buffers) + parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata( + plc.io.SourceInfo(filepaths_or_buffers) + ) + + # read all column names including index column, if any + col_names = [ + info.name() for info in parquet_metadata.schema().root().children() + ] + + index_col_names = set() + json_str = parquet_metadata.metadata()["pandas"] + if json_str != "": + meta = json.loads(json_str) + file_is_range_index, index_col, _ = _parse_metadata(meta) + if not file_is_range_index and index_col is not None: + columns = meta["columns"] + for idx_col in index_col: + for c in columns: + if c["field_name"] == idx_col: + index_col_names.add(idx_col) + + # remove the index column from the list of column names + # only if index_col_names is not None + if len(index_col_names) >= 0: + col_names = [name for name in col_names if name not in index_col_names] + + return ( + parquet_metadata.num_rows(), + parquet_metadata.num_rowgroups(), + col_names, + len(col_names), + parquet_metadata.rowgroup_metadata(), + ) @_performance_tracking @@ -913,16 +1166,18 @@ def _read_parquet( columns=None, row_groups=None, use_pandas_metadata=None, - nrows=None, - skip_rows=None, - allow_mismatched_pq_schemas=False, + nrows: int | None = None, + skip_rows: int | None = None, + allow_mismatched_pq_schemas: bool = False, *args, **kwargs, -): +) -> cudf.DataFrame: # Simple helper function to dispatch between # cudf and pyarrow to read parquet data if engine == "cudf": - if kwargs: + if set(kwargs.keys()).difference( + set(("_chunk_read_limit", "_pass_read_limit")) + ): raise ValueError( "cudf engine doesn't support the " f"following keyword arguments: {list(kwargs.keys())}" @@ -932,30 +1187,123 @@ def _read_parquet( "cudf engine doesn't support the " f"following positional arguments: {list(args)}" ) + if nrows is None: + nrows = -1 + if skip_rows is None: + skip_rows = 0 if cudf.get_option("io.parquet.low_memory"): - return libparquet.read_parquet_chunked( + # Note: If this function ever takes accepts filters + # allow_range_index needs to be False when a filter is passed + # (see read_parquet) + allow_range_index = columns is not None and len(columns) != 0 + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + 
options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + + reader = plc.io.parquet.ChunkedParquetReader( + options, + chunk_read_limit=kwargs.get("_chunk_read_limit", 0), + pass_read_limit=kwargs.get("_pass_read_limit", 1024000000), + ) + + tbl_w_meta = reader.read_chunk() + column_names = tbl_w_meta.column_names(include_children=False) + child_names = tbl_w_meta.child_names + per_file_user_data = tbl_w_meta.per_file_user_data + concatenated_columns = tbl_w_meta.tbl.columns() + + # save memory + del tbl_w_meta + + while reader.has_next(): + tbl = reader.read_chunk().tbl + + for i in range(tbl.num_columns()): + concatenated_columns[i] = plc.concatenate.concatenate( + [concatenated_columns[i], tbl._columns[i]] + ) + # Drop residual columns to save memory + tbl._columns[i] = None + + df = cudf.DataFrame._from_data( + *_data_from_columns( + columns=[ + Column.from_pylibcudf(plc) + for plc in concatenated_columns + ], + column_names=column_names, + index_names=None, + ) + ) + df = _process_metadata( + df, + column_names, + child_names, + per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - nrows=nrows if nrows is not None else -1, - skip_rows=skip_rows if skip_rows is not None else 0, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, + allow_range_index, + use_pandas_metadata, + nrows=nrows, + skip_rows=skip_rows, ) + return df else: - if nrows is None: - nrows = -1 - if skip_rows is None: - skip_rows = 0 - return libparquet.read_parquet( + allow_range_index = True + filters = kwargs.get("filters", None) + if columns is not None and len(columns) == 0 or filters: + allow_range_index = False + + options = ( + plc.io.parquet.ParquetReaderOptions.builder( + plc.io.SourceInfo(filepaths_or_buffers) + ) + .use_pandas_metadata(use_pandas_metadata) + .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas) + .build() + ) + if row_groups is not None: + options.set_row_groups(row_groups) + if nrows > -1: + options.set_num_rows(nrows) + if skip_rows != 0: + options.set_skip_rows(skip_rows) + if columns is not None: + options.set_columns(columns) + if filters is not None: + options.set_filter(filters) + + tbl_w_meta = plc.io.parquet.read_parquet(options) + + df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta)) + + df = _process_metadata( + df, + tbl_w_meta.column_names(include_children=False), + tbl_w_meta.child_names, + tbl_w_meta.per_file_user_data, + row_groups, filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, + allow_range_index, + use_pandas_metadata, nrows=nrows, skip_rows=skip_rows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, ) + return df else: if ( isinstance(filepaths_or_buffers, list) @@ -980,28 +1328,40 @@ def to_parquet( df, path, engine="cudf", - compression="snappy", - index=None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + index: bool | None = None, partition_cols=None, partition_file_name=None, partition_offsets=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + metadata_file_path: str | None = None, + int96_timestamps: bool = False, + 
row_group_size_bytes: int | None = None,
+    row_group_size_rows: int | None = None,
+    max_page_size_bytes: int | None = None,
+    max_page_size_rows: int | None = None,
+    max_dictionary_size: int | None = None,
     storage_options=None,
-    return_metadata=False,
-    force_nullable_schema=False,
-    header_version="1.0",
-    use_dictionary=True,
-    skip_compression=None,
-    column_encoding=None,
-    column_type_length=None,
-    output_as_binary=None,
+    return_metadata: bool = False,
+    force_nullable_schema: bool = False,
+    header_version: Literal["1.0", "2.0"] = "1.0",
+    use_dictionary: bool = True,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
     store_schema=False,
     *args,
     **kwargs,
@@ -1114,10 +1474,11 @@ def to_parquet(
 @ioutils.doc_merge_parquet_filemetadata()
-def merge_parquet_filemetadata(filemetadata_list):
+def merge_parquet_filemetadata(filemetadata_list: list) -> np.ndarray:
     """{docstring}"""
-
-    return libparquet.merge_filemetadata(filemetadata_list)
+    return np.asarray(
+        plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj
+    )
 
 
 def _generate_filename():
@@ -1205,10 +1566,207 @@ def _get_groups_and_offsets(
     return part_names, grouped_df, part_offsets
 
 
-ParquetWriter = libparquet.ParquetWriter
+class ParquetWriter:
+    """
+    ParquetWriter lets you incrementally write out a Parquet file from a series
+    of cudf tables.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, io.IOBase, os.PathLike, or list
+        File path or buffer to write to. The argument may also correspond
+        to a list of file paths or buffers.
+    index : bool or None, default None
+        If ``True``, include a dataframe's index(es) in the file output.
+        If ``False``, they will not be written to the file. If ``None``,
+        index(es) other than RangeIndex will be saved as columns.
+    compression : {'snappy', 'ZSTD', 'ZLIB', 'LZ4', None}, default 'snappy'
+        Name of the compression to use. Use ``None`` for no compression.
+    statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
+        Level at which column statistics should be included in file.
+    row_group_size_bytes: int, default ``uint64 max``
+        Maximum size of each row group of the output.
+        By default, a virtually infinite size equal to ``uint64 max`` will be used.
+    row_group_size_rows: int, default 1000000
+        Maximum number of rows of each row group of the output.
+        By default, 1000000 (10^6 rows) will be used.
+    max_page_size_bytes: int, default 524288
+        Maximum uncompressed size of each page of the output.
+        By default, 524288 (512KB) will be used.
+    max_page_size_rows: int, default 20000
+        Maximum number of rows of each page of the output.
+        By default, 20000 will be used.
+    max_dictionary_size: int, default 1048576
+        Maximum size of the dictionary page for each output column chunk. Dictionary
+        encoding for column chunks that exceed this limit will be disabled.
+        By default, 1048576 (1MB) will be used.
+    use_dictionary : bool, default True
+        If ``True``, enable dictionary encoding for Parquet page data
+        subject to ``max_dictionary_size`` constraints.
+        If ``False``, disable dictionary encoding for Parquet page data.
+    store_schema : bool, default False
+        If ``True``, enable computing and writing the arrow schema to the Parquet
+        file footer's key-value metadata section for faithful round-tripping.
+ + See Also + -------- + cudf.io.parquet.write_parquet + """ + + def __init__( + self, + filepath_or_buffer, + index: bool | None = None, + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", + row_group_size_bytes: int = int(np.iinfo(np.uint64).max), + row_group_size_rows: int = 1000000, + max_page_size_bytes: int = 524288, + max_page_size_rows: int = 20000, + max_dictionary_size: int = 1048576, + use_dictionary: bool = True, + store_schema: bool = False, + ): + filepaths_or_buffers = ( + list(filepath_or_buffer) + if is_list_like(filepath_or_buffer) + else [filepath_or_buffer] + ) + self.sink = plc.io.SinkInfo(filepaths_or_buffers) + self.statistics = statistics + self.compression = compression + self.index = index + self.initialized = False + self.row_group_size_bytes = row_group_size_bytes + self.row_group_size_rows = row_group_size_rows + self.max_page_size_bytes = max_page_size_bytes + self.max_page_size_rows = max_page_size_rows + self.max_dictionary_size = max_dictionary_size + self.use_dictionary = use_dictionary + self.write_arrow_schema = store_schema + + def write_table(self, table, partitions_info=None) -> None: + """Writes a single table to the file""" + if not self.initialized: + self._initialize_chunked_state( + table, + num_partitions=len(partitions_info) if partitions_info else 1, + ) + if self.index is not False and ( + table.index.name is not None + or isinstance(table.index, cudf.MultiIndex) + ): + columns = itertools.chain(table.index._columns, table._columns) + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in columns] + ) + else: + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.writer.write(plc_table, partitions_info) + + def close(self, metadata_file_path=None) -> np.ndarray | None: + if not self.initialized: + return None + column_chunks_file_paths = [] + if metadata_file_path is not None: + if is_list_like(metadata_file_path): + column_chunks_file_paths = list(metadata_file_path) + else: + column_chunks_file_paths = [metadata_file_path] + blob = self.writer.close(column_chunks_file_paths) + if metadata_file_path is not None: + return np.asarray(blob.obj) + return None + + def __enter__(self) -> Self: + return self + + def __exit__(self, *args) -> None: + self.close() + + def _initialize_chunked_state( + self, table, num_partitions: int = 1 + ) -> None: + """Prepares all the values required to build the + chunked_parquet_writer_options and creates a writer + """ + # Set the table_metadata + num_index_cols_meta = 0 + plc_table = plc.Table( + [col.to_pylibcudf(mode="read") for col in table._columns] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + if self.index is not False: + if isinstance(table.index, cudf.MultiIndex): + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + for level, idx_name in enumerate(table.index.names): + self.tbl_meta.column_metadata[level].set_name(idx_name) + num_index_cols_meta = len(table.index.names) + else: + if table.index.name is not None: + plc_table = plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + table.index._columns, table._columns + ) + ] + ) + self.tbl_meta = plc.io.types.TableInputMetadata(plc_table) + self.tbl_meta.column_metadata[0].set_name(table.index.name) + 
num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.column_metadata[i].set_name(name) + _set_col_metadata( + table[name]._column, + self.tbl_meta.column_metadata[i], + ) -def _parse_bytes(s): + index = ( + False if isinstance(table.index, cudf.RangeIndex) else self.index + ) + user_data = [ + {"pandas": generate_pandas_metadata(table, index)} + ] * num_partitions + comp_type = _get_comp_type(self.compression) + stat_freq = _get_stat_freq(self.statistics) + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if self.use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) + options = ( + plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink) + .metadata(self.tbl_meta) + .key_value_metadata(user_data) + .compression(comp_type) + .stats_level(stat_freq) + .row_group_size_bytes(self.row_group_size_bytes) + .row_group_size_rows(self.row_group_size_rows) + .max_page_size_bytes(self.max_page_size_bytes) + .max_page_size_rows(self.max_page_size_rows) + .max_dictionary_size(self.max_dictionary_size) + .write_arrow_schema(self.write_arrow_schema) + .build() + ) + options.set_dictionary_policy(dict_policy) + self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options) + self.initialized = True + + +def _parse_bytes(s: str) -> int: """Parse byte string to numbers Utility function vendored from Dask. @@ -1345,8 +1903,8 @@ def __init__( path, partition_cols, index=None, - compression="snappy", - statistics="ROWGROUP", + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy", + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP", max_file_size=None, file_name_prefix=None, storage_options=None, @@ -1370,9 +1928,7 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: list[ - tuple[libparquet.ParquetWriter, list[str], str] - ] = [] + self._chunked_writers: list[tuple[ParquetWriter, list[str], str]] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup self.path_cw_map: dict[str, int] = {} @@ -1563,3 +2119,257 @@ def _hive_dirname(name, val): if pd.isna(val): val = "__HIVE_DEFAULT_PARTITION__" return f"{name}={val}" + + +def _set_col_metadata( + col: ColumnBase, + col_meta: plc.io.types.ColumnInMetadata, + force_nullable_schema: bool = False, + path: str | None = None, + skip_compression: set[Hashable] | None = None, + column_encoding: dict[ + Hashable, + Literal[ + "PLAIN", + "DICTIONARY", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "BYTE_STREAM_SPLIT", + "USE_DEFAULT", + ], + ] + | None = None, + column_type_length: dict | None = None, + output_as_binary: set[Hashable] | None = None, +) -> None: + need_path = ( + skip_compression is not None + or column_encoding is not None + or column_type_length is not None + or output_as_binary is not None + ) + name = col_meta.get_name() if need_path else None + full_path = ( + path + "." + name if (path is not None and name is not None) else name + ) + + if force_nullable_schema: + # Only set nullability if `force_nullable_schema` + # is true. 
+ col_meta.set_nullability(True) + + if skip_compression is not None and full_path in skip_compression: + col_meta.set_skip_compression(True) + + if column_encoding is not None and full_path in column_encoding: + encoding = column_encoding[full_path] + if encoding is None: + c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT + else: + enc = str(encoding).upper() + c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) + if c_encoding is None: + raise ValueError("Unsupported `column_encoding` type") + col_meta.set_encoding(c_encoding) + + if column_type_length is not None and full_path in column_type_length: + col_meta.set_output_as_binary(True) + col_meta.set_type_length(column_type_length[full_path]) + + if output_as_binary is not None and full_path in output_as_binary: + col_meta.set_output_as_binary(True) + + if isinstance(col.dtype, cudf.StructDtype): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name) + _set_col_metadata( + child_col, + col_meta.child(i), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.ListDtype): + if full_path is not None: + full_path = full_path + ".list" + col_meta.child(1).set_name("element") + _set_col_metadata( + col.children[1], + col_meta.child(1), + force_nullable_schema, + full_path, + skip_compression, + column_encoding, + column_type_length, + output_as_binary, + ) + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) + + +def _get_comp_type( + compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None], +) -> plc.io.types.CompressionType: + if compression is None: + return plc.io.types.CompressionType.NONE + result = getattr(plc.io.types.CompressionType, compression.upper(), None) + if result is None: + raise ValueError("Unsupported `compression` type") + return result + + +def _get_stat_freq( + statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"], +) -> plc.io.types.StatisticsFreq: + result = getattr( + plc.io.types.StatisticsFreq, f"STATISTICS_{statistics.upper()}", None + ) + if result is None: + raise ValueError("Unsupported `statistics_freq` type") + return result + + +def _process_metadata( + df: cudf.DataFrame, + names: list[Hashable], + child_names: dict, + per_file_user_data: list, + row_groups, + filepaths_or_buffers, + allow_range_index: bool, + use_pandas_metadata: bool, + nrows: int = -1, + skip_rows: int = 0, +) -> cudf.DataFrame: + ioutils._add_df_col_struct_names(df, child_names) + index_col = None + is_range_index = True + column_index_type = None + index_col_names = None + meta = None + for single_file in per_file_user_data: + if b"pandas" not in single_file: + continue + json_str = single_file[b"pandas"].decode("utf-8") + meta = json.loads(json_str) + file_is_range_index, index_col, column_index_type = _parse_metadata( + meta + ) + is_range_index &= file_is_range_index + + if ( + not file_is_range_index + and index_col is not None + and index_col_names is None + ): + index_col_names = {} + for idx_col in index_col: + for c in meta["columns"]: + if c["field_name"] == idx_col: + index_col_names[idx_col] = c["name"] + + if meta is not None: + # Book keep each column metadata as the order + # of `meta["columns"]` and `column_names` are not + # guaranteed to be deterministic and same always. 
+ meta_data_per_column = { + col_meta["name"]: col_meta for col_meta in meta["columns"] + } + + # update the decimal precision of each column + for col in names: + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): + df._data[col].dtype.precision = meta_data_per_column[col][ + "metadata" + ]["precision"] + + # Set the index column + if index_col is not None and len(index_col) > 0: + if is_range_index: + if not allow_range_index: + return df + + if len(per_file_user_data) > 1: + range_index_meta = { + "kind": "range", + "name": None, + "start": 0, + "stop": len(df), + "step": 1, + } + else: + range_index_meta = index_col[0] + + if row_groups is not None: + per_file_metadata = [ + pa.parquet.read_metadata( + # Pyarrow cannot read directly from bytes + io.BytesIO(s) if isinstance(s, bytes) else s + ) + for s in filepaths_or_buffers + ] + + filtered_idx = [] + for i, file_meta in enumerate(per_file_metadata): + row_groups_i = [] + start = 0 + for row_group in range(file_meta.num_row_groups): + stop = start + file_meta.row_group(row_group).num_rows + row_groups_i.append((start, stop)) + start = stop + + for rg in row_groups[i]: + filtered_idx.append( + cudf.RangeIndex( + start=row_groups_i[rg][0], + stop=row_groups_i[rg][1], + step=range_index_meta["step"], + ) + ) + + if len(filtered_idx) > 0: + idx = cudf.concat(filtered_idx) + else: + idx = cudf.Index._from_column( + cudf.core.column.column_empty(0) + ) + else: + start = range_index_meta["start"] + skip_rows # type: ignore[operator] + stop = range_index_meta["stop"] + if nrows > -1: + stop = start + nrows + idx = cudf.RangeIndex( + start=start, + stop=stop, + step=range_index_meta["step"], + name=range_index_meta["name"], + ) + + df.index = idx + elif set(index_col).issubset(names): + index_data = df[index_col] + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: + idx = cudf.Index._from_column( + index_data._columns[0], name=next(actual_index_names) + ) + else: + idx = cudf.MultiIndex.from_frame( + index_data, names=list(actual_index_names) + ) + df.drop(columns=index_col, inplace=True) + df.index = idx + else: + if use_pandas_metadata: + df.index.names = index_col + + if df._num_columns == 0 and column_index_type is not None: + df._data.label_dtype = cudf.dtype(column_index_type) + + return df diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 13efa71ebae..77d1f77d30b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,7 +22,6 @@ from pyarrow import parquet as pq import cudf -from cudf._lib.parquet import read_parquet_chunked from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.io.parquet import ( ParquetDatasetWriter, @@ -3775,13 +3774,14 @@ def test_parquet_chunked_reader( ) buffer = BytesIO() df.to_parquet(buffer, row_group_size=10000) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - use_pandas_metadata=use_pandas_metadata, - row_groups=row_groups, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + use_pandas_metadata=use_pandas_metadata, + row_groups=row_groups, + ) expected = cudf.read_parquet( buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups ) @@ -3825,12 +3825,13 @@ def test_parquet_chunked_reader_structs( # Number of rows to read 
nrows = num_rows if num_rows is not None else len(df) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3877,12 +3878,13 @@ def test_parquet_chunked_reader_string_decoders( nrows = num_rows if num_rows is not None else len(df) # Check with num_rows specified - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - nrows=nrows, - ) + with cudf.option_context("io.parquet.low_memory", True): + actual = cudf.read_parquet( + [buffer], + _chunk_read_limit=chunk_read_limit, + _pass_read_limit=pass_read_limit, + nrows=nrows, + ) expected = cudf.read_parquet( buffer, nrows=nrows, @@ -3982,13 +3984,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): ).reset_index(drop=True) # Read with chunked reader (filter columns not supported) - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["list", "d_list", "str"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["list", "d_list", "str"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) # Construct the expected table without filter columns expected_chunked = cudf.concat( @@ -4054,13 +4057,14 @@ def test_parquet_reader_with_mismatched_structs(): ) # Read with chunked reader - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) + with cudf.option_context("io.parquet.low_memory", True): + got_chunked = cudf.read_parquet( + [buf1, buf2], + columns=["struct.b.b_b.b_b_a"], + _chunk_read_limit=240, + _pass_read_limit=240, + allow_mismatched_pq_schemas=True, + ) got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index d9a3da6666d..a04fcb8df7a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -43,7 +43,6 @@ } _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF supports local and remote data stores. See configuration details for From f811c383b46d7a8acc8496593e3d0caff83d6c8f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:56:03 -0500 Subject: [PATCH 67/78] Allow large strings in nvbench strings benchmarks (#17571) Removes the 2GB limit check from the strings benchmarks and adjusts the parameters to be consistent across the benchmarks. The default parameters will still not exceed 2GB for automation purposes. 
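To illustrate the shape these benchmarks share after this change, here is a minimal sketch of a post-change nvbench strings benchmark (the `bench_example` name and the placeholder body are hypothetical; the helper calls and axis values mirror the hunks below). The former single `row_width` axis and its `size_type` overflow skip are replaced by explicit `min_width`/`max_width` axes:

static void bench_example(nvbench::state& state)
{
  // Width bounds now come directly from the benchmark axes;
  // the 2GB/size_type limit check is no longer performed here.
  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));

  // Generate random strings whose row lengths follow a normal
  // distribution over [min_width, max_width]
  data_profile const profile = data_profile_builder().distribution(
    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
  cudf::strings_column_view input(column->view());
  // ... exercise the strings API under test via state.exec(...) ...
}

NVBENCH_BENCH(bench_example)
  .set_name("example")
  .add_int64_axis("min_width", {0})
  .add_int64_axis("max_width", {32, 64, 128, 256})
  .add_int64_axis("num_rows", {32768, 262144, 2097152});
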
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/17571 --- cpp/benchmarks/string/case.cpp | 19 +++---- cpp/benchmarks/string/char_types.cpp | 15 +++--- cpp/benchmarks/string/contains.cpp | 13 ++--- cpp/benchmarks/string/copy_if_else.cpp | 15 +++--- cpp/benchmarks/string/copy_range.cpp | 15 +++--- cpp/benchmarks/string/count.cpp | 15 +++--- cpp/benchmarks/string/extract.cpp | 9 +--- cpp/benchmarks/string/join_strings.cpp | 15 +++--- cpp/benchmarks/string/lengths.cpp | 15 +++--- cpp/benchmarks/string/like.cpp | 9 +--- cpp/benchmarks/string/replace_re.cpp | 19 +++---- cpp/benchmarks/string/reverse.cpp | 15 +++--- cpp/benchmarks/string/slice.cpp | 9 +--- cpp/benchmarks/string/split.cpp | 15 +++--- cpp/benchmarks/string/split_re.cpp | 15 +++--- cpp/benchmarks/string/string_bench_args.hpp | 56 --------------------- 16 files changed, 80 insertions(+), 189 deletions(-) delete mode 100644 cpp/benchmarks/string/string_bench_args.hpp diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const 
table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 
256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ 
b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) 
NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = 
static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 
1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. 
- */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} From 8a3e5f1a7af6c638397fcabf17bea9192bd799d2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 17:40:20 -0800 Subject: [PATCH 68/78] Remove cudf._lib.nvtext in favor of inlining pylibcudf (#17535) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17535 --- python/cudf/cudf/_lib/CMakeLists.txt | 2 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 24 -- python/cudf/cudf/_lib/nvtext/__init__.pxd | 0 python/cudf/cudf/_lib/nvtext/__init__.py | 0 .../cudf/_lib/nvtext/byte_pair_encode.pyx | 24 -- .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 24 -- .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 35 -- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 17 - python/cudf/cudf/_lib/nvtext/minhash.pyx | 35 -- .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 24 -- python/cudf/cudf/_lib/nvtext/normalize.pyx | 28 -- python/cudf/cudf/_lib/nvtext/replace.pyx | 52 --- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 55 --- .../cudf/_lib/nvtext/subword_tokenize.pyx | 38 -- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 86 ---- python/cudf/cudf/_lib/strings/__init__.pxd | 0 python/cudf/cudf/_lib/strings/__init__.py | 30 -- python/cudf/cudf/core/byte_pair_encoding.py | 13 +- python/cudf/cudf/core/column/string.py | 388 ++++++++++++++---- python/cudf/cudf/core/subword_tokenizer.py | 7 +- python/cudf/cudf/core/tokenize_vocabulary.py | 9 +- 22 files changed, 328 insertions(+), 564 deletions(-) delete mode 100644 python/cudf/cudf/_lib/nvtext/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.py delete mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/edit_distance.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/normalize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/replace.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/stemmer.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/strings/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/strings/__init__.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index f422635d22a..c2677c6d88d 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -30,5 +30,3 @@ target_include_directories(interop PUBLIC "$ letter_type.CONSONANT - VOWEL = 
letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - return Column.from_pylibcudf( - nvtext.stemmer.porter_stemmer_measure( - strings.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - index, - ) - ) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - indices.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index 5e0bfb74705..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - object hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = nvtext.subword_tokenize.subword_tokenize( - strings.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = Column.from_pylibcudf(result[0]) - masks = Column.from_pylibcudf(result[1]) - metadata = Column.from_pylibcudf(result[2]) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index f473c48e2f7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - return Column.from_pylibcudf( - nvtext.tokenize.character_tokenize( - strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - return Column.from_pylibcudf( - nvtext.tokenize.detokenize( - strings.to_pylibcudf(mode="read"), - indices.to_pylibcudf(mode="read"), - py_separator.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - object vocabulary, - object py_delimiter, - size_type default_id): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_with_vocabulary( - strings.to_pylibcudf(mode="read"), - vocabulary, - py_delimiter.device_value.c_value, - default_id - ) - ) diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index b9095a22a42..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 8d38a5f2272..b49f5154697 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - byte_pair_encoding as cpp_byte_pair_encoding, -) class BytePairEncoder: @@ -25,12 +22,12 @@ class BytePairEncoder: BytePairEncoder """ - def __init__(self, merges_pair: "cudf.Series"): + def __init__(self, merges_pair: cudf.Series) -> None: self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( merges_pair._column.to_pylibcudf(mode="read") ) - def __call__(self, text, separator: str = " ") -> cudf.Series: + def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: """ Parameters @@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: dtype: object """ sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) + return cudf.Series._from_column( + text._column.byte_pair_encoding(self.merge_pairs, sep) + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 06196717ce3..c021554f3bd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,7 +20,7 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast, strings as libstrings +from cudf._lib import string_casting as str_cast from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype @@ -45,6 +45,7 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -624,7 +625,7 @@ def join( def _split_by_character(self): col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) + result_col = col.character_tokenize() offset_col = col.children[0] @@ -4693,9 +4694,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(self._column.normalize_spaces()) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4743,7 +4742,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - 
libstrings.normalize_characters(self._column, do_lower) + self._column.normalize_characters(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -4775,16 +4774,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: 2 goodbye dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + if isinstance(delim, Column): result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), + self._column.tokenize_column(delim), retain_index=False, ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), + self._column.tokenize_scalar(delim), retain_index=False, ) else: @@ -4799,7 +4798,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: return result def detokenize( - self, indices: "cudf.Series", separator: str = " " + self, indices: cudf.Series, separator: str = " " ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order @@ -4829,9 +4828,9 @@ def detokenize( 2 three dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), + self._column.detokenize(indices._column, sep), # type: ignore[arg-type] retain_index=False, ) @@ -4882,17 +4881,15 @@ def character_tokenize(self) -> SeriesOrIndex: 2 . dtype: object """ - result_col = libstrings.character_tokenize(self._column) + result_col = self._column.character_tokenize() if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( + return type(self._parent)._from_column( result_col, name=self._parent.name, index=index ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) else: - return result_col + return self._return_or_inplace(result_col) def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ @@ -4919,15 +4916,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: 2 0 dtype: int32 """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) + if isinstance(delim, Column): return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) + self._column.count_tokens_column(delim) ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) + self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) else: raise TypeError( @@ -4966,9 +4963,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: 2 xyz_hhh dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), + self._column.generate_ngrams(n, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5015,7 +5012,7 @@ def character_ngrams( dtype: list """ result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), + 
self._column.generate_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5060,7 +5057,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), + self._column.hash_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5098,10 +5095,10 @@ def ngrams_tokenize( 2 best_book dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") + delim = _massage_string_arg(delimiter, "delimiter") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), + self._column.ngrams_tokenize(n, delim, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5180,10 +5177,9 @@ def replace_tokens( ) return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, + self._column.replace_tokens( + targets_column, # type: ignore[arg-type] + replacements_column, # type: ignore[arg-type] cudf.Scalar(delimiter, dtype="str"), ), ) @@ -5251,8 +5247,7 @@ def filter_tokens( ) return self._return_or_inplace( - libstrings.filter_tokens( - self._column, + self._column.filter_tokens( min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -5278,9 +5273,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(self._column.porter_stemmer_measure()) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5313,17 +5306,10 @@ def is_consonant(self, position) -> SeriesOrIndex: 1 False dtype: bool """ - ltype = libstrings.LetterType.CONSONANT - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(False, position) # type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5357,17 +5343,10 @@ def is_vowel(self, position) -> SeriesOrIndex: 1 True dtype: bool """ - ltype = libstrings.LetterType.VOWEL - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5416,7 +5395,7 @@ def edit_distance(self, targets) -> SeriesOrIndex: ) return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) + self._column.edit_distance(targets_column) # type: ignore[arg-type] ) def edit_distance_matrix(self) -> SeriesOrIndex: @@ -5456,9 +5435,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." 
) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(self._column.edit_distance_matrix()) def minhash( self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int @@ -5508,7 +5485,7 @@ def minhash( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash(self._column, seed, a_column, b_column, width) + self._column.minhash(seed, a_column, b_column, width) # type: ignore[arg-type] ) def minhash64( @@ -5559,7 +5536,7 @@ def minhash64( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64(self._column, seed, a_column, b_column, width) + self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -5585,13 +5562,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: 1 0.307692 dtype: float32 """ - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), + self._column.jaccard_index(input._column, width) ) -def _massage_string_arg(value, name, allow_col=False): +def _massage_string_arg( + value, name, allow_col: bool = False +) -> StringColumn | cudf.Scalar: if isinstance(value, cudf.Scalar): return value @@ -5602,9 +5580,9 @@ def _massage_string_arg(value, name, allow_col=False): if allow_col: if isinstance(value, list): - return column.as_column(value, dtype="str") + return column.as_column(value, dtype="str") # type: ignore[return-value] - if isinstance(value, Column) and is_string_dtype(value.dtype): + if isinstance(value, StringColumn): return value allowed_types.append("Column") @@ -6148,6 +6126,278 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) + @acquire_spill_lock() + def minhash( + self, + seed: np.uint32, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def minhash64( + self, + seed: np.uint64, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash64( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def jaccard_index(self, other: Self, width: int) -> NumericalColumn: + result = plc.nvtext.jaccard.jaccard_index( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + result = plc.nvtext.generate_ngrams.generate_ngrams( + self.to_pylibcudf(mode="read"), + ngrams, + separator.device_value.c_value, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def hash_character_ngrams(self, ngrams: int) -> ListColumn: + 
result = plc.nvtext.generate_ngrams.hash_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance(self, targets: Self) -> NumericalColumn: + result = plc.nvtext.edit_distance.edit_distance( + self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance_matrix(self) -> ListColumn: + result = plc.nvtext.edit_distance.edit_distance_matrix( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def byte_pair_encoding( + self, + merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.byte_pair_encode.byte_pair_encoding( + self.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def ngrams_tokenize( + self, + ngrams: int, + delimiter: cudf.Scalar, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.ngrams_tokenize.ngrams_tokenize( + self.to_pylibcudf(mode="read"), + ngrams, + delimiter.device_value.c_value, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def normalize_spaces(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_spaces( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def normalize_characters(self, do_lower: bool = True) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + do_lower, + ) + ) + + @acquire_spill_lock() + def replace_tokens( + self, targets: Self, replacements: Self, delimiter: cudf.Scalar + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.replace_tokens( + self.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def filter_tokens( + self, + min_token_length: int, + replacement: cudf.Scalar, + delimiter: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.filter_tokens( + self.to_pylibcudf(mode="read"), + min_token_length, + replacement.device_value.c_value, + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def porter_stemmer_measure(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.porter_stemmer_measure( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.is_letter( + self.to_pylibcudf(mode="read"), + is_vowel, + index + if isinstance(index, int) + else index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def subword_tokenize( + self, + hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, + max_sequence_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: + """ + Subword tokenizes text series by using the 
pre-loaded hashed vocabulary + """ + result = plc.nvtext.subword_tokenize.subword_tokenize( + self.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) + # return the 3 tensor components + tokens = type(self).from_pylibcudf(result[0]) + masks = type(self).from_pylibcudf(result[1]) + metadata = type(self).from_pylibcudf(result[2]) + return tokens, masks, metadata + + @acquire_spill_lock() + def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def tokenize_column(self, delimiters: Self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def count_tokens_column(self, delimiters: Self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def character_tokenize(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.character_tokenize( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def tokenize_with_vocabulary( + self, + vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, + delimiter: cudf.Scalar, + default_id: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_with_vocabulary( + self.to_pylibcudf(mode="read"), + vocabulary, + delimiter.device_value.c_value, + default_id, + ) + ) + + @acquire_spill_lock() + def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.detokenize( + self.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + separator.device_value.c_value, + ) + ) + def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index dda1f199078..479838ef2a8 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -8,10 +8,6 @@ import pylibcudf as plc -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - def _cast_to_appropriate_type(ar, cast_type): if cast_type == "cp": @@ -210,8 +206,7 @@ def __call__( stride = max_length - stride # behavior varies from subword_tokenize but maps with huggingface - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, + input_ids, attention_mask, metadata = text._column.subword_tokenize( self.vocab_file, max_sequence_length=max_length, stride=stride, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 1e31376cce8..fb8b9b3131c 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ 
b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -5,9 +5,6 @@
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.nvtext.tokenize import (
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
 
 
 class TokenizeVocabulary:
@@ -20,7 +17,7 @@ class TokenizeVocabulary:
         Strings column of vocabulary terms
     """
 
-    def __init__(self, vocabulary: "cudf.Series"):
+    def __init__(self, vocabulary: cudf.Series) -> None:
         self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
@@ -46,8 +43,8 @@ def tokenize(
         if delimiter is None:
             delimiter = ""
         delim = cudf.Scalar(delimiter, dtype="str")
-        result = cpp_tokenize_with_vocabulary(
-            text._column, self.vocabulary, delim, default_id
+        result = text._column.tokenize_with_vocabulary(
+            self.vocabulary, delim, default_id
         )
 
         return cudf.Series._from_column(result)

From 774970283bfa6ca5ac4bc0619fc8595f01b7362b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 12 Dec 2024 18:06:44 -0800
Subject: [PATCH 69/78] Remove cudf._lib.csv in favor of inlining pylibcudf (#17485)

Contributes to https://github.com/rapidsai/cudf/issues/17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17485
---
 python/cudf/cudf/_lib/CMakeLists.txt |   5 +-
 python/cudf/cudf/_lib/__init__.py    |   1 -
 python/cudf/cudf/_lib/csv.pyx        | 414 ------------------
 python/cudf/cudf/io/csv.py           | 466 ++++++++++++++++++++++-----
 4 files changed, 385 insertions(+), 501 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/csv.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index c2677c6d88d..b402db0443d 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -12,9 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources
-  column.pyx copying.pyx csv.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx
-  stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
+set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx
+  stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx
 )
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index f86a15b932b..0299b264189 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -3,7 +3,6 @@
 
 from . import (
     copying,
-    csv,
     groupby,
     interop,
     reduce,
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
deleted file mode 100644
index 641fc18c203..00000000000
--- a/python/cudf/cudf/_lib/csv.pyx
+++ /dev/null
@@ -1,414 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
- -from libcpp cimport bool - -cimport pylibcudf.libcudf.types as libcudf_types - -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import errno -import os -from collections import abc -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc - -from cudf.api.types import is_hashable - -from pylibcudf.types cimport DataType - -CSV_HEX_TYPE_MAP = { - "hex": np.dtype("int64"), - "hex64": np.dtype("int64"), - "hex32": np.dtype("int32") -} - - -def validate_args( - object delimiter, - object sep, - bool delim_whitespace, - object decimal, - object thousands, - object nrows, - int skipfooter, - object byte_range, - int skiprows -): - if delim_whitespace: - if delimiter is not None: - raise ValueError("cannot set both delimiter and delim_whitespace") - if sep != ',': - raise ValueError("cannot set both sep and delim_whitespace") - - # Alias sep -> delimiter. - actual_delimiter = delimiter if delimiter else sep - - if decimal == actual_delimiter: - raise ValueError("decimal cannot be the same as delimiter") - - if thousands == actual_delimiter: - raise ValueError("thousands cannot be the same as delimiter") - - if nrows is not None and skipfooter != 0: - raise ValueError("cannot use both nrows and skipfooter parameters") - - if byte_range is not None: - if skipfooter != 0 or skiprows != 0 or nrows is not None: - raise ValueError("""cannot manually limit rows to be read when - using the byte range parameter""") - - -def read_csv( - object datasource, - object lineterminator="\n", - object quotechar='"', - int quoting=0, - bool doublequote=True, - object header="infer", - bool mangle_dupe_cols=True, - object usecols=None, - object sep=",", - object delimiter=None, - bool delim_whitespace=False, - bool skipinitialspace=False, - object names=None, - object dtype=None, - int skipfooter=0, - int skiprows=0, - bool dayfirst=False, - object compression="infer", - object thousands=None, - object decimal=".", - object true_values=None, - object false_values=None, - object nrows=None, - object byte_range=None, - bool skip_blank_lines=True, - object parse_dates=None, - object comment=None, - object na_values=None, - bool keep_default_na=True, - bool na_filter=True, - object prefix=None, - object index_col=None, -): - """ - Cython function to call into libcudf API, see `read_csv`. - - See Also - -------- - cudf.read_csv - """ - - if not isinstance(datasource, (BytesIO, StringIO, bytes)): - if not os.path.isfile(datasource): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), datasource - ) - - if isinstance(datasource, StringIO): - datasource = datasource.read().encode() - elif isinstance(datasource, str) and not os.path.isfile(datasource): - datasource = datasource.encode() - - validate_args(delimiter, sep, delim_whitespace, decimal, thousands, - nrows, skipfooter, byte_range, skiprows) - - # Alias sep -> delimiter. 
- if delimiter is None: - delimiter = sep - - delimiter = str(delimiter) - - if byte_range is None: - byte_range = (0, 0) - - if compression is None: - c_compression = plc.io.types.CompressionType.NONE - else: - compression_map = { - "infer": plc.io.types.CompressionType.AUTO, - "gzip": plc.io.types.CompressionType.GZIP, - "bz2": plc.io.types.CompressionType.BZIP2, - "zip": plc.io.types.CompressionType.ZIP, - } - c_compression = compression_map[compression] - - # We need this later when setting index cols - orig_header = header - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - header = -1 - else: - header = header - names = list(names) - else: - if header is None: - header = -1 - elif header == 'infer': - header = 0 - - hex_cols = [] - - new_dtypes = [] - if dtype is not None: - if isinstance(dtype, abc.Mapping): - new_dtypes = dict() - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = CSV_HEX_TYPE_MAP[v] - hex_cols.append(str(k)) - - new_dtypes[k] = _get_plc_data_type_from_dtype( - cudf.dtype(col_type) - ) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - hex_cols.append(0) - - new_dtypes.append( - _get_plc_data_type_from_dtype(dtype) - ) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - hex_cols.append(index) - - new_dtypes.append( - _get_plc_data_type_from_dtype(col_dtype) - ) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - options = ( - plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource])) - .compression(c_compression) - .mangle_dupe_cols(mangle_dupe_cols) - .byte_range_offset(byte_range[0]) - .byte_range_size(byte_range[1]) - .nrows(nrows if nrows is not None else -1) - .skiprows(skiprows) - .skipfooter(skipfooter) - .quoting(quoting) - .lineterminator(str(lineterminator)) - .quotechar(quotechar) - .decimal(decimal) - .delim_whitespace(delim_whitespace) - .skipinitialspace(skipinitialspace) - .skip_blank_lines(skip_blank_lines) - .doublequote(doublequote) - .keep_default_na(keep_default_na) - .na_filter(na_filter) - .dayfirst(dayfirst) - .build() - ) - - options.set_header(header) - - if names is not None: - options.set_names([str(name) for name in names]) - - if prefix is not None: - options.set_prefix(prefix) - - if usecols is not None: - if all(isinstance(col, int) for col in usecols): - options.set_use_cols_indexes(list(usecols)) - else: - options.set_use_cols_names([str(name) for name in usecols]) - - if delimiter is not None: - options.set_delimiter(delimiter) - - if thousands is not None: - options.set_thousands(thousands) - - if comment is not None: - options.set_comment(comment) - - if parse_dates is not None: - options.set_parse_dates(list(parse_dates)) - - if hex_cols is not None: - options.set_parse_hex(list(hex_cols)) - - options.set_dtypes(new_dtypes) - - if true_values is not None: - options.set_true_values([str(val) for val in true_values]) - - if false_values is not None: - options.set_false_values([str(val) for val in false_values]) - - if na_values is not None: - options.set_na_values([str(val) for val in na_values]) - - df = cudf.DataFrame._from_data( - 
*data_from_pylibcudf_io(plc.io.csv.read_csv(options)) - ) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - if isinstance(cudf.dtype(v), cudf.CategoricalDtype): - df._data[str(k)] = df._data[str(k)].astype(v) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): - df = df.astype(dtype) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._column_names[index] - df._data[col_name] = df._data[col_name].astype(col_dtype) - - if names is not None and len(names) and isinstance(names[0], int): - df.columns = [int(x) for x in df._data] - elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"): - df.columns = [int(x) for x in df._column_names] - - # Set index if the index_col parameter is passed - if index_col is not None and index_col is not False: - if isinstance(index_col, int): - index_col_name = df._data.get_labels_by_index(index_col)[0] - df = df.set_index(index_col_name) - if isinstance(index_col_name, str) and \ - names is None and orig_header == "infer": - if index_col_name.startswith("Unnamed:"): - # TODO: Try to upstream it to libcudf - # csv reader in future - df._index.name = None - elif names is None: - df._index.name = index_col - else: - df = df.set_index(index_col) - - return df - - -@acquire_spill_lock() -def write_csv( - table, - object path_or_buf=None, - object sep=",", - object na_rep="", - bool header=True, - object lineterminator="\n", - int rows_per_chunk=8, - bool index=True, -): - """ - Cython function to call into libcudf API, see `write_csv`. - - See Also - -------- - cudf.to_csv - """ - index_and_not_empty = index is True and table.index is not None - columns = [ - col.to_pylibcudf(mode="read") for col in table.index._columns - ] if index_and_not_empty else [] - columns.extend(col.to_pylibcudf(mode="read") for col in table._columns) - col_names = [] - if header: - all_names = list(table.index.names) if index_and_not_empty else [] - all_names.extend( - na_rep if name is None or pd.isnull(name) - else name for name in table._column_names - ) - col_names = [ - '""' if (name in (None, '') and len(all_names) == 1) - else (str(name) if name not in (None, '') else '') - for name in all_names - ] - try: - plc.io.csv.write_csv( - ( - plc.io.csv.CsvWriterOptions.builder( - plc.io.SinkInfo([path_or_buf]), plc.Table(columns) - ) - .names(col_names) - .na_rep(na_rep) - .include_header(header) - .rows_per_chunk(rows_per_chunk) - .line_terminator(str(lineterminator)) - .inter_column_delimiter(str(sep)) - .true_value("True") - .false_value("False") - .build() - ) - ) - except OverflowError: - raise OverflowError( - f"Writing CSV file with chunksize={rows_per_chunk} failed. " - "Consider providing a smaller chunksize argument." 
- ) - - -cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: - # TODO: Remove this work-around Dictionary types - # in libcudf are fully mapped to categorical columns: - # https://github.com/rapidsai/cudf/issues/3960 - if isinstance(dtype, cudf.CategoricalDtype): - dtype = dtype.categories.dtype - elif dtype == "category": - dtype = "str" - - if isinstance(dtype, str): - if str(dtype) == "date32": - return DataType( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - elif str(dtype) in ("date", "date64"): - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[us]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - elif str(dtype) == "timestamp[s]": - return DataType( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - elif str(dtype) == "timestamp[ms]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[ns]": - return DataType( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - - dtype = cudf.dtype(dtype) - return dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 3dc8915bfd1..da9a66f3874 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,57 +1,73 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations +import errno +import itertools +import os import warnings from collections import abc from io import BytesIO, StringIO +from typing import cast import numpy as np +import pandas as pd + +import pylibcudf as plc import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_scalar +from cudf._lib.types import dtype_to_pylibcudf_type +from cudf._lib.utils import data_from_pylibcudf_io +from cudf.api.types import is_hashable, is_scalar +from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type from cudf.utils.performance_tracking import _performance_tracking +_CSV_HEX_TYPE_MAP = { + "hex": np.dtype("int64"), + "hex64": np.dtype("int64"), + "hex32": np.dtype("int32"), +} + @_performance_tracking @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, - sep=",", - delimiter=None, + sep: str = ",", + delimiter: str | None = None, header="infer", names=None, index_col=None, usecols=None, prefix=None, - mangle_dupe_cols=True, + mangle_dupe_cols: bool = True, dtype=None, true_values=None, false_values=None, - skipinitialspace=False, - skiprows=0, - skipfooter=0, - nrows=None, + skipinitialspace: bool = False, + skiprows: int = 0, + skipfooter: int = 0, + nrows: int | None = None, na_values=None, - keep_default_na=True, - na_filter=True, - skip_blank_lines=True, + keep_default_na: bool = True, + na_filter: bool = True, + skip_blank_lines: bool = True, parse_dates=None, - dayfirst=False, + dayfirst: bool = False, compression="infer", - thousands=None, - decimal=".", - lineterminator="\n", - quotechar='"', - quoting=0, - doublequote=True, - comment=None, - delim_whitespace=False, - byte_range=None, + thousands: str | None = None, + decimal: str = ".", + lineterminator: str = "\n", + quotechar: str = '"', + quoting: int = 0, + doublequote: bool = True, + comment: str | None = None, + delim_whitespace: bool = False, + byte_range: list[int] | tuple[int, int] | None = None, storage_options=None, - bytes_per_thread=None, -): + bytes_per_thread: int | None = None, +) -> 
cudf.DataFrame: """{docstring}""" if delim_whitespace is not False: @@ -77,60 +93,225 @@ def read_csv( if na_values is not None and is_scalar(na_values): na_values = [na_values] - df = libcudf.csv.read_csv( - filepath_or_buffer, - lineterminator=lineterminator, - quotechar=quotechar, - quoting=quoting, - doublequote=doublequote, - header=header, - mangle_dupe_cols=mangle_dupe_cols, - usecols=usecols, - sep=sep, - delimiter=delimiter, - delim_whitespace=delim_whitespace, - skipinitialspace=skipinitialspace, - names=names, - dtype=dtype, - skipfooter=skipfooter, - skiprows=skiprows, - dayfirst=dayfirst, - compression=compression, - thousands=thousands, - decimal=decimal, - true_values=true_values, - false_values=false_values, - nrows=nrows, - byte_range=byte_range, - skip_blank_lines=skip_blank_lines, - parse_dates=parse_dates, - comment=comment, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - prefix=prefix, - index_col=index_col, + if not isinstance(filepath_or_buffer, (BytesIO, StringIO, bytes)): + if not os.path.isfile(filepath_or_buffer): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), filepath_or_buffer + ) + + if isinstance(filepath_or_buffer, StringIO): + filepath_or_buffer = filepath_or_buffer.read().encode() + elif isinstance(filepath_or_buffer, str) and not os.path.isfile( + filepath_or_buffer + ): + filepath_or_buffer = filepath_or_buffer.encode() + + _validate_args( + delimiter, + sep, + delim_whitespace, + decimal, + thousands, + nrows, + skipfooter, + byte_range, + skiprows, + ) + + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep + + delimiter = str(delimiter) + + if byte_range is None: + byte_range = (0, 0) + + if compression is None: + c_compression = plc.io.types.CompressionType.NONE + else: + compression_map = { + "infer": plc.io.types.CompressionType.AUTO, + "gzip": plc.io.types.CompressionType.GZIP, + "bz2": plc.io.types.CompressionType.BZIP2, + "zip": plc.io.types.CompressionType.ZIP, + } + c_compression = compression_map[compression] + + # We need this later when setting index cols + orig_header = header + + if names is not None: + # explicitly mentioned name, so don't check header + if header is None or header == "infer": + header = -1 + else: + header = header + names = list(names) + else: + if header is None: + header = -1 + elif header == "infer": + header = 0 + + hex_cols: list[abc.Hashable] = [] + new_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = [] + if dtype is not None: + if isinstance(dtype, abc.Mapping): + new_dtypes = {} + for k, col_type in dtype.items(): + if is_hashable(col_type) and col_type in _CSV_HEX_TYPE_MAP: + col_type = _CSV_HEX_TYPE_MAP[col_type] + hex_cols.append(str(k)) + + new_dtypes[k] = _get_plc_data_type_from_dtype( + cudf.dtype(col_type) + ) + elif cudf.api.types.is_scalar(dtype) or isinstance( + dtype, (np.dtype, pd.api.extensions.ExtensionDtype, type) + ): + if is_hashable(dtype) and dtype in _CSV_HEX_TYPE_MAP: + dtype = _CSV_HEX_TYPE_MAP[dtype] + hex_cols.append(0) + + cast(list, new_dtypes).append(_get_plc_data_type_from_dtype(dtype)) + elif isinstance(dtype, abc.Collection): + for index, col_dtype in enumerate(dtype): + if is_hashable(col_dtype) and col_dtype in _CSV_HEX_TYPE_MAP: + col_dtype = _CSV_HEX_TYPE_MAP[col_dtype] + hex_cols.append(index) + + new_dtypes.append(_get_plc_data_type_from_dtype(col_dtype)) + else: + raise ValueError( + "dtype should be a scalar/str/list-like/dict-like" + ) + options = ( + 
plc.io.csv.CsvReaderOptions.builder( + plc.io.SourceInfo([filepath_or_buffer]) + ) + .compression(c_compression) + .mangle_dupe_cols(mangle_dupe_cols) + .byte_range_offset(byte_range[0]) + .byte_range_size(byte_range[1]) + .nrows(nrows if nrows is not None else -1) + .skiprows(skiprows) + .skipfooter(skipfooter) + .quoting(quoting) + .lineterminator(str(lineterminator)) + .quotechar(quotechar) + .decimal(decimal) + .delim_whitespace(delim_whitespace) + .skipinitialspace(skipinitialspace) + .skip_blank_lines(skip_blank_lines) + .doublequote(doublequote) + .keep_default_na(keep_default_na) + .na_filter(na_filter) + .dayfirst(dayfirst) + .build() + ) + + options.set_header(header) + + if names is not None: + options.set_names([str(name) for name in names]) + + if prefix is not None: + options.set_prefix(prefix) + + if usecols is not None: + if all(isinstance(col, int) for col in usecols): + options.set_use_cols_indexes(list(usecols)) + else: + options.set_use_cols_names([str(name) for name in usecols]) + + if delimiter is not None: + options.set_delimiter(delimiter) + + if thousands is not None: + options.set_thousands(thousands) + + if comment is not None: + options.set_comment(comment) + + if parse_dates is not None: + options.set_parse_dates(list(parse_dates)) + + if hex_cols is not None: + options.set_parse_hex(list(hex_cols)) + + options.set_dtypes(new_dtypes) + + if true_values is not None: + options.set_true_values([str(val) for val in true_values]) + + if false_values is not None: + options.set_false_values([str(val) for val in false_values]) + + if na_values is not None: + options.set_na_values([str(val) for val in na_values]) + + df = cudf.DataFrame._from_data( + *data_from_pylibcudf_io(plc.io.csv.read_csv(options)) ) + if isinstance(dtype, abc.Mapping): + for k, v in dtype.items(): + if isinstance(cudf.dtype(v), cudf.CategoricalDtype): + df._data[str(k)] = df._data[str(k)].astype(v) + elif dtype == "category" or isinstance(dtype, cudf.CategoricalDtype): + df = df.astype(dtype) + elif isinstance(dtype, abc.Collection) and not is_scalar(dtype): + for index, col_dtype in enumerate(dtype): + if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): + col_name = df._column_names[index] + df._data[col_name] = df._data[col_name].astype(col_dtype) + + if names is not None and len(names) and isinstance(names[0], int): + df.columns = [int(x) for x in df._data] + elif ( + names is None + and header == -1 + and cudf.get_option("mode.pandas_compatible") + ): + df.columns = [int(x) for x in df._column_names] + + # Set index if the index_col parameter is passed + if index_col is not None and index_col is not False: + if isinstance(index_col, int): + index_col_name = df._data.get_labels_by_index(index_col)[0] + df = df.set_index(index_col_name) + if ( + isinstance(index_col_name, str) + and names is None + and orig_header == "infer" + ): + if index_col_name.startswith("Unnamed:"): + # TODO: Try to upstream it to libcudf + # csv reader in future + df.index.name = None + elif names is None: + df.index.name = index_col + else: + df = df.set_index(index_col) + if dtype is None or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. 
specified_dtypes = {} if dtype is None else dtype - unspecified_dtypes = { - name: dtype - for name, dtype in df._dtypes - if name not in specified_dtypes - } default_dtypes = {} - - for name, dt in unspecified_dtypes.items(): - if dt == np.dtype("i1"): + for name, dt in df._dtypes: + if name in specified_dtypes: + continue + elif dt == np.dtype("i1"): # csv reader reads all null column as int8. # The dtype should remain int8. default_dtypes[name] = dt else: default_dtypes[name] = _maybe_convert_to_default_type(dt) - df = df.astype(default_dtypes) + + if default_dtypes: + df = df.astype(default_dtypes) return df @@ -138,17 +319,17 @@ def read_csv( @_performance_tracking @ioutils.doc_to_csv() def to_csv( - df, + df: cudf.DataFrame, path_or_buf=None, - sep=",", - na_rep="", + sep: str = ",", + na_rep: str = "", columns=None, - header=True, - index=True, + header: bool = True, + index: bool = True, encoding=None, compression=None, - lineterminator="\n", - chunksize=None, + lineterminator: str = "\n", + chunksize: int | None = None, storage_options=None, ): """{docstring}""" @@ -187,15 +368,10 @@ def to_csv( ) for _, dtype in df._dtypes: - if isinstance(dtype, cudf.ListDtype): - raise NotImplementedError( - "Writing to csv format is not yet supported with " - "list columns." - ) - elif isinstance(dtype, cudf.StructDtype): + if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise NotImplementedError( "Writing to csv format is not yet supported with " - "Struct columns." + f"{dtype} columns." ) # TODO: Need to typecast categorical columns to the underlying @@ -208,7 +384,7 @@ def to_csv( df = df.copy(deep=False) for col_name, col in df._column_labels_and_values: if isinstance(col.dtype, cudf.CategoricalDtype): - df._data[col_name] = col.astype(col.categories.dtype) + df._data[col_name] = col.astype(col.dtype.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): df.index = df.index.astype(df.index.categories.dtype) @@ -218,7 +394,7 @@ def to_csv( if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=file_obj, sep=sep, @@ -229,7 +405,7 @@ def to_csv( index=index, ) else: - libcudf.csv.write_csv( + _plc_write_csv( df, path_or_buf=path_or_buf, sep=sep, @@ -243,3 +419,127 @@ def to_csv( if return_as_string: path_or_buf.seek(0) return path_or_buf.read() + + +@acquire_spill_lock() +def _plc_write_csv( + table: cudf.DataFrame, + path_or_buf=None, + sep: str = ",", + na_rep: str = "", + header: bool = True, + lineterminator: str = "\n", + rows_per_chunk: int = 8, + index: bool = True, +) -> None: + iter_columns = ( + itertools.chain(table.index._columns, table._columns) + if index + else table._columns + ) + columns = [col.to_pylibcudf(mode="read") for col in iter_columns] + col_names = [] + if header: + table_names = ( + na_rep if name is None or pd.isnull(name) else name + for name in table._column_names + ) + iter_names = ( + itertools.chain(table.index.names, table_names) + if index + else table_names + ) + all_names = list(iter_names) + col_names = [ + '""' + if (name in (None, "") and len(all_names) == 1) + else (str(name) if name not in (None, "") else "") + for name in all_names + ] + try: + plc.io.csv.write_csv( + ( + plc.io.csv.CsvWriterOptions.builder( + plc.io.SinkInfo([path_or_buf]), plc.Table(columns) + ) + .names(col_names) + .na_rep(na_rep) + .include_header(header) + .rows_per_chunk(rows_per_chunk) + .line_terminator(str(lineterminator)) + 
.inter_column_delimiter(str(sep))
+            .true_value("True")
+            .false_value("False")
+            .build()
+            )
+        )
+    except OverflowError as err:
+        raise OverflowError(
+            f"Writing CSV file with chunksize={rows_per_chunk} failed. "
+            "Consider providing a smaller chunksize argument."
+        ) from err
+
+
+def _validate_args(
+    delimiter: str | None,
+    sep: str,
+    delim_whitespace: bool,
+    decimal: str,
+    thousands: str | None,
+    nrows: int | None,
+    skipfooter: int,
+    byte_range: list[int] | tuple[int, int] | None,
+    skiprows: int,
+) -> None:
+    if delim_whitespace:
+        if delimiter is not None:
+            raise ValueError("cannot set both delimiter and delim_whitespace")
+        if sep != ",":
+            raise ValueError("cannot set both sep and delim_whitespace")
+
+    # Alias sep -> delimiter.
+    actual_delimiter = delimiter if delimiter else sep
+
+    if decimal == actual_delimiter:
+        raise ValueError("decimal cannot be the same as delimiter")
+
+    if thousands == actual_delimiter:
+        raise ValueError("thousands cannot be the same as delimiter")
+
+    if nrows is not None and skipfooter != 0:
+        raise ValueError("cannot use both nrows and skipfooter parameters")
+
+    if byte_range is not None:
+        if skipfooter != 0 or skiprows != 0 or nrows is not None:
+            raise ValueError(
+                "cannot manually limit rows to be read when using the byte range parameter"
+            )
+
+
+def _get_plc_data_type_from_dtype(dtype) -> plc.DataType:
+    # TODO: Remove this work-around Dictionary types
+    # in libcudf are fully mapped to categorical columns:
+    # https://github.com/rapidsai/cudf/issues/3960
+    if isinstance(dtype, cudf.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif dtype == "category":
+        dtype = "str"
+
+    if isinstance(dtype, str):
+        if dtype == "date32":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_DAYS)
+        elif dtype in ("date", "date64"):
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype == "timestamp":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype == "timestamp[us]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MICROSECONDS)
+        elif dtype == "timestamp[s]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_SECONDS)
+        elif dtype == "timestamp[ms]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype == "timestamp[ns]":
+            return plc.DataType(plc.types.TypeId.TIMESTAMP_NANOSECONDS)
+
+    dtype = cudf.dtype(dtype)
+    return dtype_to_pylibcudf_type(dtype)

From 5baaf6d7f868dc42f8e0213e164dca340a7bfcff Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 12 Dec 2024 21:55:49 -0500
Subject: [PATCH 70/78] Propagate failures in pandas integration tests and Skip failing tests (#17521)

This PR ensures that the integration tests fail if any one of the test
modules fails. It also skips or xfails any tests that are not currently
passing. Finally, it fixes one incorrect use of `rng.random`.
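As context for the `rng.random` fix: `numpy.random.Generator.random` takes its
output shape as a single tuple argument, and a second positional argument is
parsed as the dtype. A minimal sketch (editorial illustration only, not part of
the patch; the variable name mirrors the stumpy test changed below):

    import numpy as np

    rng = np.random.default_rng(seed=42)
    # rng.random(3, 1000) raises TypeError: the 1000 is parsed as a dtype,
    # not as a second dimension.
    your_time_series = rng.random((3, 1000))  # one array of shape (3, 1000)
    assert your_time_series.shape == (3, 1000)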
Some of the changes were originally made in #17489

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/17521
---
 .../third-party-integration/test.sh           |  16 ++-
 .../dependencies.yaml                         |  15 --
 .../tests/test_catboost.py                    | 129 ------------------
 .../tests/test_holoviews.py                   |   3 +
 .../tests/test_matplotlib.py                  |   6 +
 .../tests/test_numpy.py                       |   3 +
 .../tests/test_pytorch.py                     |   3 +
 .../tests/test_seaborn.py                     |   3 +
 .../tests/test_stumpy_distributed.py          |   2 +-
 .../tests/test_tensorflow.py                  |   1 +
 .../tests/test_xgboost.py                     |   3 +
 11 files changed, 34 insertions(+), 150 deletions(-)
 delete mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py

diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh
index f8ddbaba0f3..30e3ffc9a43 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/test.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh
@@ -26,6 +26,8 @@ main() {
     LIBS=${LIBS#[}
     LIBS=${LIBS%]}
 
+    ANY_FAILURES=0
+
     for lib in ${LIBS//,/ }; do
         lib=$(echo "$lib" | tr -d '""')
         echo "Running tests for library $lib"
@@ -56,10 +58,6 @@ main() {
         rapids-logger "Check GPU usage"
         nvidia-smi
 
-        EXITCODE=0
-        trap "EXITCODE=1" ERR
-        set +e
-
         rapids-logger "pytest ${lib}"
 
         NUM_PROCESSES=8
@@ -72,12 +70,20 @@ main() {
             fi
         done
 
+        EXITCODE=0
+        trap "EXITCODE=1" ERR
+        set +e
+
         TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib}
+        set -e
 
         rapids-logger "Test script exiting with value: ${EXITCODE}"
+        if [[ ${EXITCODE} != 0 ]]; then
+            ANY_FAILURES=1
+        fi
     done
 
-    exit ${EXITCODE}
+    exit ${ANY_FAILURES}
 }
 
 main "$@"
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
index e726b7fdca1..3891110e9d3 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
@@ -76,13 +76,6 @@ files:
       - py_version
       - test_base
       - test_xgboost
-  test_catboost:
-    output: none
-    includes:
-      - cuda_version
-      - py_version
-      - test_base
-      - test_catboost
   test_cuml:
     output: none
     includes:
@@ -251,14 +244,6 @@ dependencies:
         - pip
         - pip:
             - xgboost>=2.0.1
-  test_catboost:
-    common:
-      - output_types: conda
-        packages:
-          - numpy
-          - scipy
-          - scikit-learn
-          - catboost
   test_cuml:
     common:
       - output_types: conda
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py
deleted file mode 100644
index 04cc69231fe..00000000000
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
- -import numpy as np -import pandas as pd -import pytest -from catboost import CatBoostClassifier, CatBoostRegressor, Pool -from sklearn.datasets import make_classification, make_regression - -rng = np.random.default_rng(seed=42) - - -def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0): - if isinstance(expect, (tuple, list)): - assert len(expect) == len(got) - for e, g in zip(expect, got): - assert_catboost_equal(e, g, rtol, atol) - elif isinstance(expect, np.ndarray): - np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol) - elif isinstance(expect, pd.DataFrame): - pd.testing.assert_frame_equal(expect, got) - elif isinstance(expect, pd.Series): - pd.testing.assert_series_equal(expect, got) - else: - assert expect == got - - -pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal) - - -@pytest.fixture -def regression_data(): - X, y = make_regression(n_samples=100, n_features=10, random_state=42) - return pd.DataFrame(X), pd.Series(y) - - -@pytest.fixture -def classification_data(): - X, y = make_classification( - n_samples=100, n_features=10, n_classes=2, random_state=42 - ) - return pd.DataFrame(X), pd.Series(y) - - -def test_catboost_regressor_with_dataframe(regression_data): - X, y = regression_data - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -def test_catboost_regressor_with_numpy(regression_data): - X, y = regression_data - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(X.values, y.values) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_classifier_with_dataframe(classification_data): - X, y = classification_data - model = CatBoostClassifier(iterations=10, verbose=0) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -def test_catboost_classifier_with_numpy(classification_data): - X, y = classification_data - model = CatBoostClassifier(iterations=10, verbose=0) - model.fit(X.values, y.values) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_with_pool_and_dataframe(regression_data): - X, y = regression_data - train_pool = Pool(X, y) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(train_pool) - predictions = model.predict(X) - return predictions - - -def test_catboost_with_pool_and_numpy(regression_data): - X, y = regression_data - train_pool = Pool(X.values, y.values) - model = CatBoostRegressor(iterations=10, verbose=0) - model.fit(train_pool) - predictions = model.predict(X.values) - return predictions - - -def test_catboost_with_categorical_features(): - data = { - "numerical_feature": rng.standard_normal(100), - "categorical_feature": rng.choice(["A", "B", "C"], size=100), - "target": rng.integers(0, 2, size=100), - } - df = pd.DataFrame(data) - X = df[["numerical_feature", "categorical_feature"]] - y = df["target"] - cat_features = ["categorical_feature"] - model = CatBoostClassifier( - iterations=10, verbose=0, cat_features=cat_features - ) - model.fit(X, y) - predictions = model.predict(X) - return predictions - - -@pytest.mark.parametrize( - "X, y", - [ - ( - pd.DataFrame(rng.standard_normal((100, 5))), - pd.Series(rng.standard_normal(100)), - ), - (rng.standard_normal((100, 5)), rng.standard_normal(100)), - ], -) -def test_catboost_train_test_split(X, y): - from sklearn.model_selection import train_test_split - - X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) - model = CatBoostRegressor(iterations=10, verbose=0) - 
model.fit(X_train, y_train) - predictions = model.predict(X_test) - return len(X_train), len(X_test), len(y_train), len(y_test), predictions diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py index bef02c86355..8be48953974 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py @@ -71,6 +71,9 @@ def test_holoviews_heatmap(df): ) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_holoviews_histogram(df): return get_plot_info(hv.Histogram(df.values)) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py index 1909392b9f7..c91808021e8 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py @@ -33,6 +33,9 @@ def assert_plots_equal(expect, got): pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_line(): df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]}) (data,) = plt.plot(df["x"], df["y"], marker="o", linestyle="-") @@ -40,6 +43,9 @@ def test_line(): return plt.gca() +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_bar(): data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) ax = data.plot(kind="bar") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py index 472f1889354..4d35d9e8946 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py @@ -37,6 +37,9 @@ def test_numpy_dot(df): return np.dot(df, df.T) +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_numpy_fft(sr): fft = np.fft.fft(sr) return fft diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py index ad287471aa0..7cea635afc4 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py @@ -116,6 +116,9 @@ def test_torch_train(data): return model(test_x1, test_x2) +@pytest.mark.skip( + reason="AssertionError: The values for attribute 'device' do not match: cpu != cuda:0." 
+) def test_torch_tensor_ctor(): s = pd.Series(range(5)) return torch.tensor(s.values) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py index 021c5bac9b7..f6a8a96ae3c 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py @@ -54,6 +54,9 @@ def test_scatter(df): return ax +@pytest.mark.skip( + reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'" +) def test_lineplot_with_sns_data(): df = sns.load_dataset("flights") ax = sns.lineplot(data=df, x="month", y="passengers") diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 0777d982ac2..f275659288e 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -41,7 +41,7 @@ def test_multidimensional_distributed_timeseries(dask_client): rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = rng.random(3, 1000) + your_time_series = rng.random((3, 1000)) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py index ba1f518cbfd..b4fad3024e7 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py @@ -271,6 +271,7 @@ def call(self, values): return tf.concat(values, axis=-1) +@pytest.mark.xfail(reason="ValueError: Invalid dtype: object") def test_full_example_train_with_df(df, target): # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example # Inputs are directly passed as dictionary of series diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py index 70f1e6a4250..0fd632507a6 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py @@ -113,6 +113,9 @@ def test_with_external_memory( return predt +@pytest.mark.skip( + reason="TypeError: Implicit conversion to a NumPy array is not allowed. Please use `.get()` to construct a NumPy array explicitly." 
+) @pytest.mark.parametrize("device", ["cpu", "cuda"]) def test_predict(device: str) -> np.ndarray: reg = xgb.XGBRegressor(n_estimators=2, device=device) From 48aa08f6dca0d60da421adb4b1735f075881541d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 12 Dec 2024 20:46:26 -0800 Subject: [PATCH 71/78] Remove cudf._lib.reduce in favor of inlining pylibcudf (#17574) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17574 --- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/copying.pyx | 4 +- python/cudf/cudf/_lib/reduce.pyx | 135 ------------------ python/cudf/cudf/core/column/column.py | 122 +++++++++++++--- python/cudf/cudf/core/column/interval.py | 14 -- python/cudf/cudf/core/column/numerical.py | 27 +--- .../cudf/cudf/core/column/numerical_base.py | 6 +- python/cudf/cudf/core/column/struct.py | 7 +- python/cudf/cudf/core/copy_types.py | 5 +- python/cudf/cudf/core/dataframe.py | 11 +- python/cudf/cudf/core/multiindex.py | 6 +- python/cudf/cudf/core/window/ewm.py | 10 +- 13 files changed, 120 insertions(+), 230 deletions(-) delete mode 100644 python/cudf/cudf/_lib/reduce.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index b402db0443d..8cec8af3c67 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx reduce.pyx scalar.pyx sort.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx sort.pyx stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 0299b264189..001e5cbb676 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ copying, groupby, interop, - reduce, sort, stream_compaction, string_casting, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index a7ea9c25a86..ef544dc89eb 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -12,8 +12,6 @@ from cudf._lib.scalar import as_device_scalar from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.reduce import minmax - from pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table @@ -34,7 +32,7 @@ def _gather_map_is_valid( """ if not check_bounds or nullify or len(gather_map) == 0: return True - gm_min, gm_max = minmax(gather_map) + gm_min, gm_max = gather_map.minmax() return gm_min >= -nrows and gm_max < nrows diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx deleted file mode 100644 index 2850cab93a1..00000000000 --- a/python/cudf/cudf/_lib/reduce.pyx +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-import warnings - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id - -import pylibcudf - -from cudf.core._internals.aggregation import make_aggregation - - -@acquire_spill_lock() -def reduce(reduction_op, Column incol, dtype=None, **kwargs): - """ - Top level Cython reduce function wrapping libcudf reductions. - - Parameters - ---------- - reduction_op : string - A string specifying the operation, e.g. sum, prod - incol : Column - A cuDF Column object - dtype: numpy.dtype, optional - A numpy data type to use for the output, defaults - to the same type as the input column - """ - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. .astype) after the operation instead.", - FutureWarning - ) - col_dtype = dtype - else: - col_dtype = incol._reduction_result_dtype(reduction_op) - - # check empty case - if len(incol) <= incol.null_count: - if reduction_op == 'sum' or reduction_op == 'sum_of_squares': - return incol.dtype.type(0) - if reduction_op == 'product': - return incol.dtype.type(1) - if reduction_op == "any": - return False - - return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) - - result = pylibcudf.reduce.reduce( - incol.to_pylibcudf(mode="read"), - make_aggregation(reduction_op, kwargs).c_obj, - dtype_to_pylibcudf_type(col_dtype), - ) - - if is_decimal_type_id(result.type().id()): - scale = -result.type().scale() - precision = _reduce_precision(col_dtype, reduction_op, len(incol)) - return DeviceScalar.from_pylibcudf( - result, - dtype=col_dtype.__class__(precision, scale), - ).value - scalar = DeviceScalar.from_pylibcudf(result).value - if isinstance(col_dtype, cudf.StructDtype): - # TODO: Utilize column_metadata in libcudf to maintain field labels - return dict(zip(col_dtype.fields.keys(), scalar.values())) - return scalar - - -@acquire_spill_lock() -def scan(scan_op, Column incol, inclusive, **kwargs): - """ - Top level Cython scan function wrapping libcudf scans. - - Parameters - ---------- - incol : Column - A cuDF Column object - scan_op : string - A string specifying the operation, e.g. cumprod - inclusive: bool - Flag for including nulls in relevant scan - """ - return Column.from_pylibcudf( - pylibcudf.reduce.scan( - incol.to_pylibcudf(mode="read"), - make_aggregation(scan_op, kwargs).c_obj, - pylibcudf.reduce.ScanType.INCLUSIVE if inclusive - else pylibcudf.reduce.ScanType.EXCLUSIVE, - ) - ) - - -@acquire_spill_lock() -def minmax(Column incol): - """ - Top level Cython minmax function wrapping libcudf minmax. - - Parameters - ---------- - incol : Column - A cuDF Column object - - Returns - ------- - A pair of ``(min, max)`` values of ``incol`` - """ - min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read")) - return ( - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)), - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)), - ) - - -def _reduce_precision(dtype, op, nrows): - """ - Returns the result precision when performing the reduce - operation `op` for the given dtype and column size. 
- - See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 - p = dtype.precision - if op in ("min", "max"): - new_p = p - elif op == "sum": - new_p = p + nrows - 1 - elif op == "product": - new_p = p * nrows + nrows - 1 - elif op == "sum_of_squares": - new_p = 2 * p + nrows - else: - raise NotImplementedError() - return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 68307f0e109..42b4fda8be2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,6 +2,7 @@ from __future__ import annotations +import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -31,7 +32,7 @@ drop_duplicates, drop_nulls, ) -from cudf._lib.types import size_type_dtype +from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -41,7 +42,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import unary +from cudf.core._internals import aggregation, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -259,21 +260,17 @@ def all(self, skipna: bool = True) -> bool: # The skipna argument is only used for numerical columns. # If all entries are null the result is True, including when the column # is empty. - if self.null_count == self.size: return True - - return libcudf.reduce.reduce("all", self) + return self.reduce("all") def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. - if not skipna and self.has_nulls(): return True elif skipna and self.null_count == self.size: return False - - return libcudf.reduce.reduce("any", self) + return self.reduce("any") def dropna(self) -> Self: if self.has_nulls(): @@ -1393,33 +1390,35 @@ def _reduce( ) if isinstance(preprocessed, ColumnBase): dtype = kwargs.pop("dtype", None) - return libcudf.reduce.reduce( - op, preprocessed, dtype=dtype, **kwargs - ) + return preprocessed.reduce(op, dtype, **kwargs) return preprocessed + def _can_return_nan(self, skipna: bool | None = None) -> bool: + return not skipna and self.has_nulls(include_nan=False) + def _process_for_reduction( self, skipna: bool | None = None, min_count: int = 0 ) -> ColumnBase | ScalarLike: - if skipna is None: - skipna = True + skipna = True if skipna is None else skipna - if self.has_nulls(): + if self._can_return_nan(skipna=skipna): + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + + col = self.nans_to_nulls() if skipna else self + if col.has_nulls(): if skipna: - result_col = self.dropna() + col = col.dropna() else: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - result_col = self - # TODO: If and when pandas decides to validate that `min_count` >= 0 we # should insert comparable behavior. 
# https://github.com/pandas-dev/pandas/issues/50022 if min_count > 0: - valid_count = len(result_col) - result_col.null_count + valid_count = len(col) - col.null_count if valid_count < min_count: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return result_col + return col def _reduction_result_dtype(self, reduction_op: str) -> Dtype: """ @@ -1529,6 +1528,91 @@ def one_hot_encode( for col in plc_table.columns() ) + @acquire_spill_lock() + def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.reduce.scan( + self.to_pylibcudf(mode="read"), + aggregation.make_aggregation(scan_op, kwargs).c_obj, + plc.reduce.ScanType.INCLUSIVE + if inclusive + else plc.reduce.ScanType.EXCLUSIVE, + ) + ) + + def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: + if dtype is not None: + warnings.warn( + "dtype is deprecated and will be remove in a future release. " + "Cast the result (e.g. .astype) after the operation instead.", + FutureWarning, + ) + col_dtype = dtype + else: + col_dtype = self._reduction_result_dtype(reduction_op) + + # check empty case + if len(self) <= self.null_count: + if reduction_op == "sum" or reduction_op == "sum_of_squares": + return self.dtype.type(0) + if reduction_op == "product": + return self.dtype.type(1) + if reduction_op == "any": + return False + + return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) + + with acquire_spill_lock(): + plc_scalar = plc.reduce.reduce( + self.to_pylibcudf(mode="read"), + aggregation.make_aggregation(reduction_op, kwargs).c_obj, + dtype_to_pylibcudf_type(col_dtype), + ) + result_col = type(self).from_pylibcudf( + plc.Column.from_scalar(plc_scalar, 1) + ) + if plc_scalar.type().id() in { + plc.TypeId.DECIMAL128, + plc.TypeId.DECIMAL64, + plc.TypeId.DECIMAL32, + }: + scale = -plc_scalar.type().scale() + # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql + p = col_dtype.precision + nrows = len(self) + if reduction_op in {"min", "max"}: + new_p = p + elif reduction_op == "sum": + new_p = p + nrows - 1 + elif reduction_op == "product": + new_p = p * nrows + nrows - 1 + elif reduction_op == "sum_of_squares": + new_p = 2 * p + nrows + else: + raise NotImplementedError( + f"{reduction_op} not implemented for decimal types." 
+ ) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + new_dtype = type(col_dtype)(precision, scale) + result_col = result_col.astype(new_dtype) + elif isinstance(col_dtype, cudf.IntervalDtype): + result_col = type(self).from_struct_column( # type: ignore[attr-defined] + result_col, closed=col_dtype.closed + ) + return result_col.element_indexing(0) + + @acquire_spill_lock() + def minmax(self) -> tuple[ScalarLike, ScalarLike]: + min_val, max_val = plc.reduce.minmax(self.to_pylibcudf(mode="read")) + return ( + type(self) + .from_pylibcudf(plc.Column.from_scalar(min_val, 1)) + .element_indexing(0), + type(self) + .from_pylibcudf(plc.Column.from_scalar(max_val, 1)) + .element_indexing(0), + ) + def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: """Check if an object dtype Series or array contains NaN.""" diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 34975fc94f4..dd8f58a118e 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -14,7 +14,6 @@ if TYPE_CHECKING: from typing_extensions import Self - from cudf._typing import ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -211,16 +210,3 @@ def element_indexing(self, index: int): if cudf.get_option("mode.pandas_compatible"): return pd.Interval(**result, closed=self.dtype.closed) return result - - def _reduce( - self, - op: str, - skipna: bool | None = None, - min_count: int = 0, - *args, - **kwargs, - ) -> ScalarLike: - result = super()._reduce(op, skipna, min_count, *args, **kwargs) - if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self.dtype.closed) - return result diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 790cd6ea9bb..28a2bd7fa6c 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -420,22 +420,12 @@ def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column # is empty. result_col = self.nans_to_nulls() if skipna else self - - if result_col.null_count == result_col.size: - return True - - return libcudf.reduce.reduce("all", result_col) + return super(type(self), result_col).all(skipna=skipna) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
result_col = self.nans_to_nulls() if skipna else self - - if not skipna and result_col.has_nulls(): - return True - elif skipna and result_col.null_count == result_col.size: - return False - - return libcudf.reduce.reduce("any", result_col) + return super(type(self), result_col).any(skipna=skipna) @functools.cached_property def nan_count(self) -> int: @@ -483,19 +473,6 @@ def _process_values_for_isin( def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) - def _process_for_reduction( - self, skipna: bool | None = None, min_count: int = 0 - ) -> NumericalColumn | ScalarLike: - skipna = True if skipna is None else skipna - - if self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - col = self.nans_to_nulls() if skipna else self - return super(NumericalColumn, col)._process_for_reduction( - skipna=skipna, min_count=min_count - ) - def find_and_replace( self, to_replace: ColumnLike, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 3f9abdabc2f..e06a0447f5c 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -263,6 +263,6 @@ def round( ) def _scan(self, op: str) -> ColumnBase: - return libcudf.reduce.scan( - op.replace("cum", ""), self, True - )._with_type_metadata(self.dtype) + return self.scan(op.replace("cum", ""), True)._with_type_metadata( + self.dtype + ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index db6ad72ab56..ba765b50729 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -107,12 +107,9 @@ def memory_usage(self) -> int: return n - def element_indexing(self, index: int): + def element_indexing(self, index: int) -> dict: result = super().element_indexing(index) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } + return dict(zip(self.dtype.fields, result.values())) def __setitem__(self, key, value): if isinstance(value, dict): diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 16d8964f083..4b6ad59c8e1 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -5,7 +5,6 @@ from typing_extensions import Self import cudf -import cudf._lib as libcudf from cudf._lib.types import size_type_dtype if TYPE_CHECKING: @@ -70,8 +69,8 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): if self.column.dtype.kind not in {"i", "u"}: raise TypeError("Gather map must have integer dtype") if not nullify: - lo, hi = libcudf.reduce.minmax(self.column) - if lo.value < -nrows or hi.value >= nrows: + lo, hi = self.column.minmax() + if lo < -nrows or hi >= nrows: raise IndexError( f"Gather map is out of bounds for [0, {nrows})" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b74128a8a61..8cdc45e12da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2505,16 +2505,7 @@ def scatter_by_map( ) if map_index.size > 0: - plc_lo, plc_hi = plc.reduce.minmax( - map_index.to_pylibcudf(mode="read") - ) - # TODO: Use pylibcudf Scalar once APIs are more developed - lo = libcudf.column.Column.from_pylibcudf( - plc.Column.from_scalar(plc_lo, 1) - ).element_indexing(0) - hi = libcudf.column.Column.from_pylibcudf( - plc.Column.from_scalar(plc_hi, 1) - ).element_indexing(0) + lo, hi 
= map_index.minmax()
             if lo < 0 or hi >= map_size:
                 raise ValueError("Partition map has invalid values")

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 5a41a33e583..f5ee36f851c 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -191,12 +191,12 @@ def __init__(
         source_data = {}
         for i, (code, level) in enumerate(zip(new_codes, new_levels)):
             if len(code):
-                lo, hi = libcudf.reduce.minmax(code)
-                if lo.value < -1 or hi.value > len(level) - 1:
+                lo, hi = code.minmax()
+                if lo < -1 or hi > len(level) - 1:
                     raise ValueError(
                         f"Codes must be -1 <= codes <= {len(level) - 1}"
                     )
-                if lo.value == -1:
+                if lo == -1:
                     # Now we can gather and insert null automatically
                     code[code == -1] = np.iinfo(size_type_dtype).min
                 result_col = libcudf.copying.gather(
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index 094df955273..c4a063a50e8 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 
-from cudf._lib.reduce import scan
 from cudf.api.types import is_numeric_dtype
 from cudf.core.window.rolling import _RollingBase
 
@@ -194,13 +193,8 @@ def _apply_agg_column(
         # as such we need to convert the nans to nulls before
         # passing them in.
         to_libcudf_column = source_column.astype("float64").nans_to_nulls()
-
-        return scan(
-            agg_name,
-            to_libcudf_column,
-            True,
-            com=self.com,
-            adjust=self.adjust,
+        return to_libcudf_column.scan(
+            agg_name, True, com=self.com, adjust=self.adjust
         )

From f3f159ae166426125347e7d6f8dd7210d4075179 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 13 Dec 2024 08:46:57 -0500
Subject: [PATCH 72/78] Use no-sync copy for fixed-width types in cudf::concatenate (#17584)

Replacing `thrust::copy` with `cudaMemcpyAsync` improves performance up to 2x
in specific cases in `cudf::concatenate`.

`thrust::copy` performs a synchronization for a device-to-device copy even
though none is necessary. Using `rmm::exec_policy_nosync` had no effect. Will
work with CCCL to determine if this is a bug in `thrust::copy`, since
computing the return value does not require a sync.

Also moved the benchmark for concatenate from googlebench to nvbench.
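For context (not part of this patch), a minimal standalone sketch of the two
copy paths is shown below; the element type, buffer size, and stream handling
are illustrative assumptions, not code taken from this change:

```cpp
#include <cuda_runtime.h>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

int main()
{
  thrust::device_vector<int> src(1 << 20, 7);  // illustrative size and value
  thrust::device_vector<int> dst(src.size());

  // thrust::copy returns an output iterator; per the observation above, it
  // synchronizes the host even for a pure device-to-device copy.
  thrust::copy(src.begin(), src.end(), dst.begin());

  // cudaMemcpyAsync only enqueues the same copy on a stream and returns;
  // the host blocks only at the explicit synchronize below.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMemcpyAsync(thrust::raw_pointer_cast(dst.data()),
                  thrust::raw_pointer_cast(src.data()),
                  src.size() * sizeof(int),
                  cudaMemcpyDeviceToDevice,
                  stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}
```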
Closes #17172 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17584 --- cpp/benchmarks/CMakeLists.txt | 5 +- cpp/benchmarks/column/concatenate.cpp | 169 ------------------------- cpp/benchmarks/copying/concatenate.cpp | 84 ++++++++++++ cpp/src/copying/concatenate.cu | 6 +- 4 files changed, 92 insertions(+), 172 deletions(-) delete mode 100644 cpp/benchmarks/column/concatenate.cpp create mode 100644 cpp/benchmarks/copying/concatenate.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8e5ea900efa..b1456600c95 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? 
std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, 
cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + 
state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d8419760120..6fc49afd7ac 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -308,7 +308,11 @@ std::unique_ptr for_each_concatenate(host_span views, auto count = 0; for (auto& v : views) { - thrust::copy(rmm::exec_policy(stream), v.begin(), v.end(), m_view.begin() + count); + cudaMemcpyAsync(m_view.begin() + count, + v.begin(), + v.size() * sizeof(T), + cudaMemcpyDeviceToDevice, + stream.value()); count += v.size(); } From a0957273a686875c8c3da19dfb80f4048e472e19 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 13 Dec 2024 08:47:35 -0500 Subject: [PATCH 73/78] Allow large strings in nvtext benchmarks (#17579) Removes the 2GB limit check from the nvtext benchmarks and adjusts the parameters to be consistent across the benchmarks. Also converts the subword-tokenizer to nvbench and removes the unused `word_minhash.cpp` source file. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17579 --- cpp/benchmarks/CMakeLists.txt | 15 ++++-- cpp/benchmarks/text/edit_distance.cpp | 15 +++--- cpp/benchmarks/text/hash_ngrams.cpp | 15 +++--- cpp/benchmarks/text/jaccard.cpp | 13 ++--- cpp/benchmarks/text/normalize.cpp | 15 +++--- cpp/benchmarks/text/replace.cpp | 9 +--- cpp/benchmarks/text/subword.cpp | 58 +++++++++----------- cpp/benchmarks/text/tokenize.cpp | 15 +++--- cpp/benchmarks/text/vocab.cpp | 17 +++--- cpp/benchmarks/text/word_minhash.cpp | 77 --------------------------- 10 files changed, 74 insertions(+), 175 deletions(-) delete mode 100644 cpp/benchmarks/text/word_minhash.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b1456600c95..749e1b628ee 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -352,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ 
b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = 
static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + 
.add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. */ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip 
benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); From 62669e04cc11bd53dab1102e83aba76804f4dbde Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:10:02 -0500 Subject: [PATCH 74/78] Fix ctest fail running libcudf tests in a Debug build (#17576) Fixes libcudf gtest failures when running with ctest on a Debug build. The error from `LastTest.log` indicates: ``` 1/106 Testing: COLUMN_TEST 1/106 Test: COLUMN_TEST Command: "/conda/envs/rapids/bin/cmake" "-Dcommand_to_run=/cudf/cpp/build/gtests/COLUMN_TEST" "-Dcommand_args=" "-P=/cudf/cpp/build/rapids-cmake/./run_gpu_test.cmake" Directory: /cudf/cpp/build/tests "COLUMN_TEST" start time: Dec 11 15:46 UTC Output: ---------------------------------------------------------- /conda/envs/rapids/bin/cmake: symbol lookup error: /cudf/cpp/build/libcudf_identify_stream_usage_mode_cudf.so: undefined symbol: _ZN3rmm6loggerD1Ev Test time = 0.00 sec ---------------------------------------------------------- Test Failed. 
"COLUMN_TEST" end time: Dec 11 15:46 UTC "COLUMN_TEST" time elapsed: 00:00:00 ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17576 --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2f17b57b0a4..78f529a44d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1105,7 +1105,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() From 4d6925ce1b83e10ea249346436ff8fdc4d28d73d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:30:45 -0800 Subject: [PATCH 75/78] Remove unused masked keyword in column_empty (#17530) Follow up to https://github.com/rapidsai/cudf/pull/16715. Now that the usages of the `masked` keyword in RAPIDS have been address (https://github.com/rapidsai/cuspatial/pull/1496 is the only one I could find), I think we can remove this keyword all together in this method Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17530 --- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 12 ++---- python/cudf/cudf/core/column/datetime.py | 6 +-- .../cudf/cudf/core/column/numerical_base.py | 2 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 39 +++++++------------ python/cudf/cudf/core/dtypes.py | 4 +- python/cudf/cudf/core/groupby/groupby.py | 7 ++-- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 1 - python/cudf/cudf/io/parquet.py | 1 - 12 files changed, 28 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 71ec11e75af..a0cf38c6f51 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1193,7 +1193,7 @@ def _concat( f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - codes_col = column.column_empty(0, head.codes.dtype, masked=True) + codes_col = column.column_empty(0, head.codes.dtype) else: codes_col = column.concat_columns(codes) # type: ignore[arg-type] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 42b4fda8be2..624a3ac95ed 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -551,7 +551,7 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self: if stop < 0 and not (stride < 0 and stop == -1): stop = stop + len(self) if (stride > 0 and start >= stop) or (stride < 0 and start <= stop): - return cast(Self, column_empty(0, self.dtype, masked=True)) + return cast(Self, column_empty(0, self.dtype)) # compute mask slice if stride == 1: return libcudf.copying.column_slice(self, [start, stop])[ @@ -1054,7 +1054,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: if self.dtype == dtype: result = self else: - 
result = column_empty(0, dtype=dtype, masked=self.nullable) + result = column_empty(0, dtype=dtype) elif dtype == "category": # TODO: Figure out why `cudf.dtype("category")` # astype's different than just the string @@ -1625,7 +1625,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: def column_empty( row_count: int, dtype: Dtype = "object", - masked: bool = False, for_numba: bool = False, ) -> ColumnBase: """ @@ -1642,9 +1641,6 @@ def column_empty( dtype : Dtype Type of the column. - masked : bool - Unused. - for_numba : bool, default False If True, don't allocate a mask as it's not supported by numba. """ @@ -2420,7 +2416,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: dtype = cudf.dtype(None) - return column_empty(0, dtype=dtype, masked=True) + return column_empty(0, dtype=dtype) # If all columns are `NumericalColumn` with different dtypes, # we cast them to a common dtype. @@ -2467,7 +2463,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: f"size > {libcudf.MAX_COLUMN_SIZE_STR}" ) elif newsize == 0: - return column_empty(0, head.dtype, masked=True) + return column_empty(0, head.dtype) # Filter out inputs that have 0 length, then concatenate. objs_with_len = [o for o in objs if len(o)] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b526a6efa51..81b82040b8d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -598,14 +598,12 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: return cast( cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), + column.column_empty(0, dtype="object"), ) if format in _DATETIME_SPECIAL_FORMATS: names = as_column(_DATETIME_NAMES) else: - names = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + names = column.column_empty(0, dtype="object") return string._datetime_to_str_typecast_functions[self.dtype]( self, format, names ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index e06a0447f5c..7a39355dd50 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -139,7 +139,7 @@ def quantile( result = cast( NumericalBaseColumn, cudf.core.column.column_empty( - row_count=len(q), dtype=self.dtype, masked=True + row_count=len(q), dtype=self.dtype ), ) else: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c021554f3bd..d76caa5c3b8 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5855,7 +5855,7 @@ def strptime( f"dtype must be datetime or timedelta type, not {dtype}" ) elif self.null_count == len(self): - return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] + return column.column_empty(len(self), dtype=dtype) # type: ignore[return-value] elif (self == "None").any(): raise ValueError( "Cannot convert `None` value to datetime or timedelta." 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index f3a7916aa35..8b1515acae2 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -294,7 +294,7 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: if len(self) == 0: return cast( cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), + column.column_empty(0, dtype="object"), ) else: return string._timedelta_to_str_typecast_functions[self.dtype]( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8cdc45e12da..fce361e18ea 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -774,9 +774,7 @@ def __init__( label_dtype = getattr(columns, "dtype", None) self._data = ColumnAccessor( { - k: column.column_empty( - len(self), dtype="object", masked=True - ) + k: column_empty(len(self), dtype="object") for k in columns }, level_names=tuple(columns.names) @@ -979,8 +977,8 @@ def _init_from_series_list(self, data, columns, index): if columns is not None: for col_name in columns: if col_name not in self._data: - self._data[col_name] = column.column_empty( - row_count=len(self), dtype=None, masked=True + self._data[col_name] = column_empty( + row_count=len(self), dtype=None ) self._data._level_names = ( tuple(columns.names) @@ -1031,11 +1029,7 @@ def _init_from_list_like(self, data, index=None, columns=None): data = list(itertools.zip_longest(*data)) if columns is not None and len(data) == 0: - data = [ - cudf.core.column.column_empty(row_count=0, dtype=None) - for _ in columns - ] - + data = [column_empty(row_count=0, dtype=None) for _ in columns] for col_name, col in enumerate(data): self._data[col_name] = column.as_column(col) self._data.rangeindex = True @@ -1074,9 +1068,8 @@ def _init_from_dict_like( # the provided index, so we need to return a masked # array of nulls if an index is given. 
empty_column = functools.partial( - cudf.core.column.column_empty, - row_count=(0 if index is None else len(index)), - masked=index is not None, + column_empty, + row_count=0 if index is None else len(index), ) data = { @@ -1421,7 +1414,7 @@ def __setitem__(self, arg, value): new_columns = ( value if key == arg - else column.column_empty( + else column_empty( row_count=length, dtype=col.dtype ) for key, col in self._column_labels_and_values @@ -3373,7 +3366,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if num_cols != 0: ca = self._data._from_columns_like_self( ( - column.column_empty(row_count=length, dtype=dtype) + column_empty(row_count=length, dtype=dtype) for _, dtype in self._dtypes ), verify=False, @@ -3479,7 +3472,7 @@ def diff(self, periods=1, axis=0): if abs(periods) > len(self): df = cudf.DataFrame._from_data( { - name: column_empty(len(self), dtype=dtype, masked=True) + name: column_empty(len(self), dtype=dtype) for name, dtype in zip(self._column_names, self.dtypes) } ) @@ -3859,9 +3852,7 @@ def agg(self, aggs, axis=None): result = DataFrame(index=idxs, columns=cols) for key in aggs.keys(): col = self[key] - col_empty = column_empty( - len(idxs), dtype=col.dtype, masked=True - ) + col_empty = column_empty(len(idxs), dtype=col.dtype) ans = cudf.Series._from_column( col_empty, index=cudf.Index(idxs) ) @@ -6177,9 +6168,7 @@ def quantile( quant_index=False, )._column if len(res) == 0: - res = column.column_empty( - row_count=len(qs), dtype=ser.dtype - ) + res = column_empty(row_count=len(qs), dtype=ser.dtype) result[k] = res result = DataFrame._from_data(result) @@ -7333,9 +7322,7 @@ def unnamed_group_generator(): ) all_nulls = functools.cache( - functools.partial( - column_empty, self.shape[0], common_type, masked=True - ) + functools.partial(column_empty, self.shape[0], common_type) ) # homogenize the dtypes of the columns @@ -8582,7 +8569,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): # If column not in this df, fill with an all-null column if idx >= len(cols) or cols[idx] is None: n = len(next(x for x in cols if x is not None)) - cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) + cols[idx] = column_empty(row_count=n, dtype=dtype) else: # If column is categorical, rebase the codes with the # combined categories, and cast the new codes to the diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 9bb29f1920a..971f0be77f8 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -189,9 +189,7 @@ def categories(self) -> cudf.Index: Index(['b', 'a'], dtype='object') """ if self._categories is None: - col = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) + col = cudf.core.column.column_empty(0, dtype="object") else: col = self._categories return cudf.Index._from_column(col) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d4f3394833a..a8d82f977d5 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -493,9 +493,7 @@ def size(self): """ Return the size of each group. 
""" - col = cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) + col = cudf.core.column.column_empty(len(self.obj), "int8") result = ( cudf.Series._from_column(col, name=getattr(self.obj, "name", None)) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) @@ -523,7 +521,8 @@ def cumcount(self, ascending: bool = True): return ( cudf.Series._from_column( cudf.core.column.column_empty( - len(self.obj), "int8", masked=False + len(self.obj), + "int8", ), index=self.obj.index, ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eeb6e3bd547..8d3ef1036d1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -336,7 +336,7 @@ def _values(self) -> ColumnBase: if len(self) > 0: return column.as_column(self._range, dtype=self.dtype) else: - return column.column_empty(0, masked=False, dtype=self.dtype) + return column.column_empty(0, dtype=self.dtype) def _clean_nulls_from_index(self) -> Self: return self diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 0e6a5e03ea6..81d954960e2 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3851,7 +3851,6 @@ def _reindex( if name in df._data else cudf.core.column.column.column_empty( dtype=dtypes.get(name, np.float64), - masked=True, row_count=len(index), ) ) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 66095d4a155..153ee0fa01a 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1139,7 +1139,6 @@ def _parquet_to_frame( dfs[-1][name] = column_empty( row_count=_len, dtype=_dtype, - masked=True, ) else: dfs[-1][name] = as_column( From 1a67646fa3998788757b05a08eae1c8d1ee73eb2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:23:30 -0800 Subject: [PATCH 76/78] Move cudf._lib.sort to cudf.core._internals (#17488) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17488 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/sort.pyx | 365 ------------------ python/cudf/cudf/core/_internals/sorting.py | 205 ++++++++++ python/cudf/cudf/core/column/column.py | 23 +- python/cudf/cudf/core/column/numerical.py | 65 ++-- .../cudf/cudf/core/column/numerical_base.py | 4 +- python/cudf/cudf/core/frame.py | 3 +- python/cudf/cudf/core/groupby/groupby.py | 25 +- python/cudf/cudf/core/indexed_frame.py | 44 ++- python/cudf/cudf/core/join/join.py | 5 +- python/cudf/cudf/core/multiindex.py | 3 +- python/cudf/cudf/core/series.py | 7 +- 13 files changed, 324 insertions(+), 430 deletions(-) delete mode 100644 python/cudf/cudf/_lib/sort.pyx create mode 100644 python/cudf/cudf/core/_internals/sorting.py diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8cec8af3c67..427ffcc8c12 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. 
# ============================================================================= -set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx sort.pyx - stream_compaction.pyx string_casting.pyx strings_udf.pyx types.pyx utils.pyx +set(cython_sources column.pyx copying.pyx groupby.pyx interop.pyx scalar.pyx stream_compaction.pyx + string_casting.pyx strings_udf.pyx types.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 001e5cbb676..26afdd62caf 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -5,7 +5,6 @@ copying, groupby, interop, - sort, stream_compaction, string_casting, strings_udf, diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx deleted file mode 100644 index eefe37d9880..00000000000 --- a/python/cudf/cudf/_lib/sort.pyx +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from itertools import repeat - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.aggregation cimport rank_method -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def is_sorted( - list source_columns, object ascending=None, object null_position=None -): - """ - Checks whether the rows of a `table` are sorted in lexicographical order. - - Parameters - ---------- - source_columns : list of columns - columns to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order of - each column. If list-like, size of list-like must be len(columns). If - None, all columns expected sort order is set to ascending. False (0) - - descending, True (1) - ascending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of nulls - compared to other elements. If list-like, size of list-like must be - len(columns). If None, null order is set to before. False (0) - after, - True (1) - before. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. 
- """ - - if ascending is None: - column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns) - else: - if len(ascending) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(ascending)} for `ascending`" - ) - column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = pylibcudf.types.Order.ASCENDING - - if null_position is None: - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - else: - if len(null_position) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(null_position)} for `null_position`" - ) - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE - - return pylibcudf.sorting.is_sorted( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - column_order, - null_precedence - ) - - -def ordering(column_order, null_precedence): - """ - Construct order and null order vectors - - Parameters - ---------- - column_order - Iterable of bool (True for ascending order, False for descending) - null_precedence - Iterable string for null positions ("first" for start, "last" for end) - - Both iterables must be the same length (not checked) - - Returns - ------- - pair of vectors (order, and null_order) - """ - c_column_order = [] - c_null_precedence = [] - for asc, null in zip(column_order, null_precedence): - c_column_order.append( - pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING - ) - if asc ^ (null == "first"): - c_null_precedence.append(pylibcudf.types.NullOrder.AFTER) - elif asc ^ (null == "last"): - c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE) - else: - raise ValueError(f"Invalid null precedence {null}") - return c_column_order, c_null_precedence - - -@acquire_spill_lock() -def order_by( - list columns_from_table, - object ascending, - str na_position, - *, - bool stable -): - """ - Get index to sort the table in ascending/descending order. - - Parameters - ---------- - columns_from_table : list[Column] - Columns from the table which will be sorted - ascending : sequence[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : str - Whether null values should show up at the "first" or "last" - position of **all** sorted column. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - Column of indices that sorts the table - """ - order = ordering(ascending, repeat(na_position)) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order") - - return Column.from_pylibcudf( - func( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in columns_from_table], - ), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort( - list values, - list column_order=None, - list null_precedence=None, -): - """ - Sort the table in ascending/descending order. - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. 
- null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - return columns_from_pylibcudf_table( - pylibcudf.sorting.sort( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort_by_key( - list values, - list keys, - object ascending, - object na_position, - *, - bool stable, -): - """ - Sort a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - ascending : list[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : list[str] - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - order = ordering(ascending, na_position) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key") - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def segmented_sort_by_key( - list values, - list keys, - Column segment_offsets, - list column_order=None, - list null_precedence=None, - *, - bool stable, -): - """ - Sort segments of a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - offsets : Column - Segment offsets - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - func = getattr( - pylibcudf.sorting, - f"{'stable_' if stable else ''}segmented_sort_by_key" - ) - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - segment_offsets.to_pylibcudf(mode="read"), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def digitize(list source_columns, list bins, bool right=False): - """ - Return the indices of the bins to which each value in source_table belongs. - - Parameters - ---------- - source_columns : Input columns to be binned. - bins : List containing columns of bins - right : Indicating whether the intervals include the - right or the left bin edge. 
- """ - return Column.from_pylibcudf( - getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in bins] - ), - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - [pylibcudf.types.Order.ASCENDING]*len(bins), - [pylibcudf.types.NullOrder.BEFORE]*len(bins) - ) - ) - - -@acquire_spill_lock() -def rank_columns(list source_columns, rank_method method, str na_option, - bool ascending, bool pct - ): - """ - Compute numerical data ranks (1 through n) of each column in the dataframe - """ - column_order = ( - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ) - # ascending - # #top = na_is_smallest - # #bottom = na_is_largest - # #keep = na_is_largest - # descending - # #top = na_is_largest - # #bottom = na_is_smallest - # #keep = na_is_smallest - if ascending: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.BEFORE - else: - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - null_precedence = pylibcudf.types.NullOrder.BEFORE - c_null_handling = ( - pylibcudf.types.NullPolicy.EXCLUDE - if na_option == 'keep' - else pylibcudf.types.NullPolicy.INCLUDE - ) - - return [ - Column.from_pylibcudf( - pylibcudf.sorting.rank( - col.to_pylibcudf(mode="read"), - method, - column_order, - c_null_handling, - null_precedence, - pct, - ) - ) - for col in source_columns - ] diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py new file mode 100644 index 00000000000..69f9e7664b1 --- /dev/null +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -0,0 +1,205 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations + +import itertools +from typing import TYPE_CHECKING, Literal + +import pylibcudf as plc + +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock + +if TYPE_CHECKING: + from collections.abc import Iterable + + from cudf.core.column import ColumnBase + + +@acquire_spill_lock() +def is_sorted( + source_columns: list[ColumnBase], + ascending: list[bool] | None = None, + null_position: list[bool] | None = None, +) -> bool: + """ + Checks whether the rows of a `table` are sorted in lexicographical order. + + Parameters + ---------- + source_columns : list of columns + columns to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order of + each column. If list-like, size of list-like must be len(columns). If + None, all columns expected sort order is set to ascending. False (0) - + descending, True (1) - ascending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of nulls + compared to other elements. If list-like, size of list-like must be + len(columns). If None, null order is set to before. False (0) - after, + True (1) - before. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. 
+ """ + if ascending is None: + column_order = [plc.types.Order.ASCENDING] * len(source_columns) + else: + if len(ascending) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(ascending)} for `ascending`" + ) + column_order = [ + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + for asc in ascending + ] + + if null_position is None: + null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns) + else: + if len(null_position) != len(source_columns): + raise ValueError( + f"Expected a list-like of length {len(source_columns)}, " + f"got length {len(null_position)} for `null_position`" + ) + null_precedence = [ + plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER + for null in null_position + ] + + return plc.sorting.is_sorted( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + column_order, + null_precedence, + ) + + +def ordering( + column_order: list[bool], + null_precedence: Iterable[Literal["first", "last"]], +) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]: + """ + Construct order and null order vectors + + Parameters + ---------- + column_order + Iterable of bool (True for ascending order, False for descending) + null_precedence + Iterable string for null positions ("first" for start, "last" for end) + + Both iterables must be the same length (not checked) + + Returns + ------- + pair of vectors (order, and null_order) + """ + c_column_order = [] + c_null_precedence = [] + for asc, null in zip(column_order, null_precedence): + c_column_order.append( + plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING + ) + if asc ^ (null == "first"): + c_null_precedence.append(plc.types.NullOrder.AFTER) + elif asc ^ (null == "last"): + c_null_precedence.append(plc.types.NullOrder.BEFORE) + else: + raise ValueError(f"Invalid null precedence {null}") + return c_column_order, c_null_precedence + + +@acquire_spill_lock() +def order_by( + columns_from_table: list[ColumnBase], + ascending: list[bool], + na_position: Literal["first", "last"], + *, + stable: bool, +): + """ + Get index to sort the table in ascending/descending order. + + Parameters + ---------- + columns_from_table : list[Column] + Columns from the table which will be sorted + ascending : sequence[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : str + Whether null values should show up at the "first" or "last" + position of **all** sorted column. + stable : bool + Should the sort be stable? 
(no default) + + Returns + ------- + Column of indices that sorts the table + """ + order = ordering(ascending, itertools.repeat(na_position)) + func = ( + plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order + ) + return Column.from_pylibcudf( + func( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_from_table], + ), + order[0], + order[1], + ) + ) + + +@acquire_spill_lock() +def sort_by_key( + values: list[ColumnBase], + keys: list[ColumnBase], + ascending: list[bool], + na_position: list[Literal["first", "last"]], + *, + stable: bool, +) -> list[ColumnBase]: + """ + Sort a table by given keys + + Parameters + ---------- + values : list[Column] + Columns of the table which will be sorted + keys : list[Column] + Columns making up the sort key + ascending : list[bool] + Sequence of boolean values which correspond to each column + in the table to be sorted signifying the order of each column + True - Ascending and False - Descending + na_position : list[str] + Sequence of "first" or "last" values (default "first") + indicating the position of null values when sorting the keys. + stable : bool + Should the sort be stable? (no default) + + Returns + ------- + list[Column] + list of value columns sorted by keys + """ + order = ordering(ascending, na_position) + func = ( + plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + ) + return [ + Column.from_pylibcudf(col) + for col in func( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + plc.Table([col.to_pylibcudf(mode="read") for col in keys]), + order[0], + order[1], + ).columns() + ] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 624a3ac95ed..cc07af0f669 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -42,7 +42,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import aggregation, unary +from cudf.core._internals import aggregation, sorting, unary from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable from cudf.core.buffer import ( @@ -996,13 +996,13 @@ def is_unique(self) -> bool: @cached_property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [True], None ) @cached_property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( + return not self.has_nulls(include_nan=True) and sorting.is_sorted( [self], [False], None ) @@ -1026,15 +1026,20 @@ def contains(self, other: ColumnBase) -> ColumnBase: def sort_values( self: Self, ascending: bool = True, - na_position: str = "last", + na_position: Literal["first", "last"] = "last", ) -> Self: if (not ascending and self.is_monotonic_decreasing) or ( ascending and self.is_monotonic_increasing ): return self.copy() - return libcudf.sort.sort( - [self], column_order=[ascending], null_precedence=[na_position] - )[0] + order = sorting.ordering([ascending], [na_position]) + with acquire_spill_lock(): + plc_table = plc.sorting.sort( + plc.Table([self.to_pylibcudf(mode="read")]), + order[0], + order[1], + ) + return type(self).from_pylibcudf(plc_table.columns()[0]) # type: ignore[return-value] def distinct_count(self, dropna: bool = True) -> int: try: @@ -1204,7 +1209,7 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - 
return libcudf.sort.order_by( + return sorting.order_by( [self], [ascending], na_position, stable=True ) @@ -1511,7 +1516,7 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = libcudf.sort.sort_by_key( + (codes,) = sorting.sort_by_key( codes, [left_gather_map], [True], ["last"], stable=True ) return codes.fillna(na_sentinel.value) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 28a2bd7fa6c..f099cef3331 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -718,6 +718,40 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: return super()._reduction_result_dtype(reduction_op) + @acquire_spill_lock() + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: + """Return the indices of the bins to which each value in column belongs. + + Parameters + ---------- + bins : np.ndarray + 1-D column-like object of bins with same type as `column`, should be + monotonically increasing. + right : bool + Indicates whether interval contains the right or left bin edge. + + Returns + ------- + A column containing the indices + """ + if self.dtype != bins.dtype: + raise ValueError( + "digitize() expects bins and input column have the same dtype." + ) + + bin_col = as_column(bins, dtype=bins.dtype) + if bin_col.nullable: + raise ValueError("`bins` cannot contain null entries.") + + return type(self).from_pylibcudf( # type: ignore[return-value] + getattr(plc.search, "lower_bound" if right else "upper_bound")( + plc.Table([bin_col.to_pylibcudf(mode="read")]), + plc.Table([self.to_pylibcudf(mode="read")]), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.BEFORE], + ) + ) + def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list @@ -772,34 +806,3 @@ def _normalize_find_and_replace_input( if not normalized_column.can_cast_safely(input_column_dtype): return normalized_column return normalized_column.astype(input_column_dtype) - - -def digitize( - column: ColumnBase, bins: np.ndarray, right: bool = False -) -> ColumnBase: - """Return the indices of the bins to which each value in column belongs. - - Parameters - ---------- - column : Column - Input column. - bins : Column-like - 1-D column-like object of bins with same type as `column`, should be - monotonically increasing. - right : bool - Indicates whether interval contains the right or left bin edge. - - Returns - ------- - A column containing the indices - """ - if not column.dtype == bins.dtype: - raise ValueError( - "Digitize() expects bins and input column have the same dtype." 
- ) - - bin_col = as_column(bins, dtype=bins.dtype) - if bin_col.nullable: - raise ValueError("`bins` cannot contain null entries.") - - return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 7a39355dd50..aaf2239a71e 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -10,7 +10,7 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf +from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -144,7 +144,7 @@ def quantile( ) else: # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( + indices = sorting.order_by( [self], [True], "first", stable=True ).slice(self.null_count, len(self)) with acquire_spill_lock(): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 00199cca828..4f40ba0bd92 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -22,6 +22,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core._internals.search import search_sorted from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock @@ -1476,7 +1477,7 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return libcudf.sort.order_by( + return sorting.order_by( list(to_sort), ascending_lst, na_position, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index a8d82f977d5..b772d35846d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -18,11 +18,11 @@ import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby -from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 +from cudf.core._internals import sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column @@ -792,7 +792,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. 
- (right_order,) = libcudf.sort.sort_by_key( + (right_order,) = sorting.sort_by_key( [right_order], [left_order], [True], @@ -1248,15 +1248,20 @@ def sample( for off, size in zip(group_offsets, size_per_group): rs.shuffle(indices[off : off + size]) else: - rng = cp.random.default_rng(seed=random_state) - (indices,) = segmented_sort_by_key( - [as_column(indices)], - [as_column(rng.random(size=nrows))], - as_column(group_offsets), - [], - [], - stable=True, + keys = cp.random.default_rng(seed=random_state).random( + size=nrows ) + with acquire_spill_lock(): + plc_table = plc.sorting.stable_segmented_sort_by_key( + plc.Table( + [as_column(indices).to_pylibcudf(mode="read")] + ), + plc.Table([as_column(keys).to_pylibcudf(mode="read")]), + as_column(group_offsets).to_pylibcudf(mode="read"), + [plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER], + ) + indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 81d954960e2..1a667e24bef 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6367,9 +6367,49 @@ def rank( elif source._num_columns != num_cols: dropped_cols = True - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct + column_order = ( + plc.types.Order.ASCENDING + if ascending + else plc.types.Order.DESCENDING ) + # ascending + # #top = na_is_smallest + # #bottom = na_is_largest + # #keep = na_is_largest + # descending + # #top = na_is_largest + # #bottom = na_is_smallest + # #keep = na_is_smallest + if ascending: + if na_option == "top": + null_precedence = plc.types.NullOrder.BEFORE + else: + null_precedence = plc.types.NullOrder.AFTER + else: + if na_option == "top": + null_precedence = plc.types.NullOrder.AFTER + else: + null_precedence = plc.types.NullOrder.BEFORE + c_null_handling = ( + plc.types.NullPolicy.EXCLUDE + if na_option == "keep" + else plc.types.NullPolicy.INCLUDE + ) + + with acquire_spill_lock(): + result_columns = [ + libcudf.column.Column.from_pylibcudf( + plc.sorting.rank( + col.to_pylibcudf(mode="read"), + method_enum, + column_order, + c_null_handling, + null_precedence, + pct, + ) + ) + for col in source._columns + ] if dropped_cols: result = type(source)._from_data( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 5c224176730..e7ea91c1f21 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -9,6 +9,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib.types import size_type_dtype +from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( @@ -256,7 +257,7 @@ def _gather_maps(self, left_cols, right_cols): for map_, n, null in zip(maps, lengths, nullify) ) ) - return libcudf.sort.sort_by_key( + return sorting.sort_by_key( list(maps), # If how is right, right map is primary sort key. 
key_order[:: -1 if self.how == "right" else 1], @@ -426,7 +427,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: else: to_sort = [*result._columns] index_names = None - result_columns = libcudf.sort.sort_by_key( + result_columns = sorting.sort_by_key( to_sort, by, [True] * len(by), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index f5ee36f851c..a99e06e4a8e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -22,6 +22,7 @@ from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result +from cudf.core._internals import sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock from cudf.core.column_accessor import ColumnAccessor @@ -1677,7 +1678,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" ) - return libcudf.sort.is_sorted( + return sorting.is_sorted( [*self._columns], ascending=ascending, null_position=null_position ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 647e20fc16b..961e5e11bc0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3410,7 +3410,7 @@ def describe( ) @_performance_tracking - def digitize(self, bins, right=False): + def digitize(self, bins: np.ndarray, right: bool = False) -> Self: """Return the indices of the bins to which each value belongs. Notes @@ -3441,9 +3441,8 @@ def digitize(self, bins, right=False): 3 2 dtype: int32 """ - return Series._from_column( - cudf.core.column.numerical.digitize(self._column, bins, right), - name=self.name, + return type(self)._from_column( + self._column.digitize(bins, right), name=self.name ) @_performance_tracking From 34e20451cf5452ecea74092dae3c6f5078ade0bd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 13 Dec 2024 15:36:55 -0800 Subject: [PATCH 77/78] Mark more constexpr functions as device-available (#17545) Contributes to #7795. Also contributes to https://github.com/rapidsai/build-planning/issues/76. 
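
The changes are mechanical: each `constexpr` utility that device code needs
gains a `CUDF_HOST_DEVICE` annotation, and host-only standard-library
facilities (`std::min`/`std::max`, `std::pair`, `std::optional`,
`std::numeric_limits`) are swapped for their `cuda::std` counterparts from
libcu++. A minimal sketch of why the annotation matters, assuming
`CUDF_HOST_DEVICE` expands to `__host__ __device__` under nvcc (toy function,
not taken from this patch):

```
#ifdef __CUDACC__
#define CUDF_HOST_DEVICE __host__ __device__
#else
#define CUDF_HOST_DEVICE
#endif

// Host-only: calling this from a __global__ or __device__ function is a
// compile error unless nvcc's --expt-relaxed-constexpr escape hatch is on.
constexpr int round_up_host(int n, int m) { return ((n + m - 1) / m) * m; }

// Annotated: callable from both host and device code.
CUDF_HOST_DEVICE constexpr int round_up(int n, int m) { return ((n + m - 1) / m) * m; }

__global__ void demo_kernel(int* out) { *out = round_up(10, 4); }
```
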
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17545 --- ci/build_docs.sh | 6 + .../cudf/column/column_device_view.cuh | 18 ++- .../cudf/detail/aggregation/aggregation.cuh | 2 +- cpp/include/cudf/detail/utilities/cuda.cuh | 11 +- .../detail/utilities/device_operators.cuh | 30 ++-- .../cudf/detail/utilities/integer_utils.hpp | 4 +- .../detail/floating_conversion.hpp | 7 +- .../cudf/hashing/detail/hash_functions.cuh | 5 +- cpp/include/cudf/hashing/detail/hashing.hpp | 2 +- cpp/include/cudf/strings/detail/utf8.hpp | 21 +-- cpp/include/cudf/strings/string_view.cuh | 8 +- .../cudf/table/experimental/row_operators.cuh | 74 +++++----- cpp/include/cudf/types.hpp | 9 +- cpp/include/cudf/utilities/span.hpp | 138 ++++++++++++------ cpp/include/cudf/utilities/traits.hpp | 42 +++--- cpp/src/binaryop/compiled/binary_ops.cuh | 6 +- cpp/src/copying/contiguous_split.cu | 3 +- cpp/src/groupby/sort/group_rank_scan.cu | 3 +- cpp/src/hash/murmurhash3_x64_128.cu | 4 +- cpp/src/hash/sha_hash.cuh | 4 +- cpp/src/hash/xxhash_64.cu | 3 +- cpp/src/io/avro/avro_common.hpp | 2 +- cpp/src/io/comp/unsnap.cu | 3 +- cpp/src/io/fst/agent_dfa.cuh | 14 +- cpp/src/io/statistics/byte_array_view.cuh | 33 +++-- .../io/statistics/typed_statistics_chunk.cuh | 5 +- cpp/src/io/utilities/parsing_utils.cuh | 19 ++- cpp/src/io/utilities/trie.cuh | 4 +- cpp/src/quantiles/quantiles_util.hpp | 9 +- cpp/src/strings/search/find.cu | 3 +- cpp/src/strings/slice.cu | 7 +- docs/cudf/source/conf.py | 2 + 32 files changed, 302 insertions(+), 199 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4290d013fe4..52d8f659611 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -35,6 +35,10 @@ rapids-mamba-retry install \ export RAPIDS_DOCS_DIR="$(mktemp -d)" +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + rapids-logger "Build CPP docs" pushd cpp/doxygen aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag" @@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs + +exit ${EXITCODE} diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index db6d5255616..ea480b133dc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -33,11 +33,13 @@ #include #include +#include #include #include #include #include +#include /** * @file column_device_view.cuh @@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf { * */ struct nullate { - struct YES : std::bool_constant {}; - struct NO : std::bool_constant {}; + struct YES : cuda::std::bool_constant {}; + struct NO : cuda::std::bool_constant {}; /** * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than * compile time. 
The calling code is responsible for specifying whether or not nulls are @@ -80,7 +82,7 @@ struct nullate { * @return `true` if nulls are expected in the operation in which this object is applied, * otherwise false */ - constexpr operator bool() const noexcept { return value; } + CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; } bool value; ///< True if nulls are expected }; }; @@ -319,14 +321,14 @@ class alignas(16) column_device_view_base { } template - struct has_element_accessor_impl : std::false_type {}; + struct has_element_accessor_impl : cuda::std::false_type {}; template struct has_element_accessor_impl< C, T, - void_t().template element(std::declval()))>> - : std::true_type {}; + void_t().template element(cuda::std::declval()))>> + : cuda::std::true_type {}; }; // @cond // Forward declaration @@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return `true` if `column_device_view::element()` has a valid overload, `false` otherwise */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return `true` if `mutable_column_device_view::element()` has a valid overload, `false` */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index de53e7586cd..c30c3d6f4bd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -36,7 +36,7 @@ namespace cudf { namespace detail { template -constexpr bool is_product_supported() +CUDF_HOST_DEVICE constexpr bool is_product_supported() { return is_numeric(); } diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 61a8e9f7ec3..72cdc3d8067 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -74,9 +74,10 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type global_thread_id(thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) { return thread_id + block_id * num_threads_per_block; } @@ -114,8 +115,8 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) { return num_threads_per_block * num_blocks_per_grid; } diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index d16be5e22dd..923cd04479d 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ 
b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf { @@ -42,7 +44,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { - return std::min(lhs, rhs); + return cuda::std::min(lhs, rhs); } /** @@ -53,7 +55,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { - return std::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } } // namespace detail @@ -68,20 +70,20 @@ struct DeviceSum { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{typename T::duration{0}}; } template () && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{0}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); @@ -109,7 +111,7 @@ struct DeviceCount { } template - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{}; } @@ -129,7 +131,7 @@ struct DeviceMin { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() // https://eel.is/c++draft/numeric.limits.general#6 @@ -143,7 +145,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); @@ -161,7 +163,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::max_value()); } @@ -181,7 +183,7 @@ struct DeviceMax { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() // https://eel.is/c++draft/numeric.limits.general#6 @@ -195,7 +197,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); @@ -212,7 +214,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::lowest_value()); } @@ -229,13 +231,13 @@ struct DeviceProduct { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{1}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { #ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 957b6b70fe2..2e3d71815c0 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept * `modulus` is positive and does not check for overflow. 
*/ template -constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -187,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr auto absolute_value(T value) -> T +CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index fce08b4a5c4..9e68bafb09a 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct floating_converter { * @param integer_rep The bit-casted floating value to extract the exponent from * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + CUDF_HOST_DEVICE inline static cuda::std::pair get_significand_and_pow2( IntegralType integer_rep) { // Extract the significand @@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** @@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 0ec41a20ef1..fd3455e761d 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -18,7 +18,8 @@ #include -#include +#include +#include namespace cudf::hashing::detail { @@ -29,7 +30,7 @@ template T __device__ inline normalize_nans(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + if (cuda::std::isnan(key)) { return cuda::std::numeric_limits::quiet_NaN(); } } return key; } diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index a978e54a1b9..7cb80081a95 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -82,7 +82,7 @@ std::unique_ptr xxhash_64(table_view const& input, * @param rhs The second hash value * @return Combined hash value */ -constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) { return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2)); } diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 85349a421b1..84957ab9f1d 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -31,7 +31,7 @@ namespace strings::detail { * @param chr Any single byte from a valid UTF-8 character * @return true if this is not the first byte of the character */ -constexpr bool 
is_utf8_continuation_char(unsigned char chr) +CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr) { // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. return (chr & 0xC0) == 0x80; @@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr) * @param chr Any single byte from a valid UTF-8 character * @return true if this the first byte of the character */ -constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); } +CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr) +{ + return not is_utf8_continuation_char(chr); +} /** * @brief This will return true if the passed in byte could be the start of @@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin * @param byte The byte to be tested * @return true if this can be the first byte of a character */ -constexpr bool is_valid_begin_utf8_char(uint8_t byte) +CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte) { // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of: // 0b0vvvvvvv a 1 byte character @@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte) * @param character Single character * @return Number of bytes */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) +CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character) * @param byte Byte from an encoded character. * @return Number of bytes. */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) +CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte) { return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix @@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(char const* str, char_utf8& character) +CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character) * @param[out] str Output array. * @return The number of bytes in the character */ -constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { @@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) * @param utf8_char Single UTF-8 character to convert. * @return Code-point for the UTF-8 character. */ -constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) +CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; if (utf8_char < 0x0000'0080) // single-byte pass thru @@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) * @param unchr Character code-point to convert. * @return Single UTF-8 character. 
*/ -constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) +CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) // single byte utf8 diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 1ae4c3703b2..f0040e069d8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -31,6 +31,8 @@ #include #endif +#include + #include // This file should only include device code logic. @@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte * @param pos Character position to count to * @return The number of bytes and the left over non-counted position value */ -__device__ inline std::pair bytes_to_character_position(string_view d_str, - size_type pos) +__device__ inline cuda::std::pair bytes_to_character_position( + string_view d_str, size_type pos) { size_type bytes = 0; auto ptr = d_str.data(); @@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { if (length() == size_bytes()) return pos; - return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); + return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(string_view const& in) const diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3f33c70c29a..8214ea6e83b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -33,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -48,11 +50,8 @@ #include #include -#include #include -#include #include -#include namespace CUDF_EXPORT cudf { @@ -287,15 +286,16 @@ class device_row_comparator { * `null_order::BEFORE` for all columns. * @param comparator Physical element relational comparison functor. 
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index 3f33c70c29a..8214ea6e83b 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -33,6 +33,8 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
+#include <cuda/std/optional>
 #include <cuda/std/tuple>
 #include <cuda/std/utility>
 #include <thrust/detail/use_default.h>
@@ -48,11 +50,8 @@
 #include <thrust/swap.h>
 #include <thrust/transform_reduce.h>
 
-#include <limits>
 #include <memory>
-#include <optional>
 #include <type_traits>
-#include <utility>
 
 namespace CUDF_EXPORT cudf {
@@ -287,15 +286,16 @@ class device_row_comparator {
    * `null_order::BEFORE` for all columns.
    * @param comparator Physical element relational comparison functor.
    */
-  device_row_comparator(Nullate check_nulls,
-                        table_device_view lhs,
-                        table_device_view rhs,
-                        device_span<detail::dremel_device_view const> l_dremel_device_views,
-                        device_span<detail::dremel_device_view const> r_dremel_device_views,
-                        std::optional<device_span<int const>> depth = std::nullopt,
-                        std::optional<device_span<order const>> column_order = std::nullopt,
-                        std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-                        PhysicalElementComparator comparator = {}) noexcept
+  device_row_comparator(
+    Nullate check_nulls,
+    table_device_view lhs,
+    table_device_view rhs,
+    device_span<detail::dremel_device_view const> l_dremel_device_views,
+    device_span<detail::dremel_device_view const> r_dremel_device_views,
+    cuda::std::optional<device_span<int const>> depth = cuda::std::nullopt,
+    cuda::std::optional<device_span<order const>> column_order = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel(l_dremel_device_views),
@@ -331,9 +331,9 @@ class device_row_comparator {
     Nullate check_nulls,
     table_device_view lhs,
     table_device_view rhs,
-    std::optional<device_span<order const>> column_order = std::nullopt,
-    std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-    PhysicalElementComparator comparator = {}) noexcept
+    cuda::std::optional<device_span<order const>> column_order = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel{},
@@ -410,7 +410,7 @@ class device_row_comparator {
     return cuda::std::pair(_comparator(_lhs.element<Element>(lhs_element_index),
                                        _rhs.element<Element>(rhs_element_index)),
-                           std::numeric_limits<int>::max());
+                           cuda::std::numeric_limits<int>::max());
   }
 
   /**
@@ -455,7 +455,7 @@ class device_row_comparator {
     }
 
     if (lcol.num_child_columns() == 0) {
-      return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits<int>::max());
+      return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits<int>::max());
     }
 
     // Non-empty structs have been modified to only have 1 child when using this.
@@ -607,7 +607,7 @@ class device_row_comparator {
   __device__ constexpr weak_ordering operator()(size_type const lhs_index,
                                                 size_type const rhs_index) const noexcept
   {
-    int last_null_depth = std::numeric_limits<int>::max();
+    int last_null_depth = cuda::std::numeric_limits<int>::max();
     size_type list_column_index{-1};
     for (size_type i = 0; i < _lhs.num_columns(); ++i) {
       if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; }
@@ -626,9 +626,9 @@ class device_row_comparator {
       // here, otherwise the current code would be failing.
       auto const [l_dremel_i, r_dremel_i] =
         _lhs.column(i).type().id() == type_id::LIST
-          ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
-                            optional_dremel_view(_r_dremel[list_column_index]))
-          : std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
+          ? cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
+                                  optional_dremel_view(_r_dremel[list_column_index]))
+          : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
       auto element_comp = element_comparator{_check_nulls,
                                              _lhs.column(i),
@@ -658,9 +658,9 @@ class device_row_comparator {
   device_span<detail::dremel_device_view const> const _l_dremel;
   device_span<detail::dremel_device_view const> const _r_dremel;
   Nullate const _check_nulls;
-  std::optional<device_span<int const>> const _depth;
-  std::optional<device_span<order const>> const _column_order;
-  std::optional<device_span<null_order const>> const _null_precedence;
+  cuda::std::optional<device_span<int const>> const _depth;
+  cuda::std::optional<device_span<order const>> const _column_order;
+  cuda::std::optional<device_span<null_order const>> const _null_precedence;
   PhysicalElementComparator const _comparator;
 };  // class device_row_comparator
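The same rule drives the row_operators.cuh changes: std::optional, std::numeric_limits, and std::make_tuple are not usable inside __device__ member functions, so the comparator switches to the cuda::std counterparts from <cuda/std/optional>, <cuda/std/limits>, and <cuda/std/tuple>. A minimal sketch of the optional-with-sentinel idiom used above, with illustrative names rather than cudf API:

#include <cuda/std/limits>
#include <cuda/std/optional>

// An optional parameter with a sentinel fallback, callable from device code.
__device__ int depth_or_sentinel(cuda::std::optional<int> depth)
{
  // cuda::std::numeric_limits mirrors std::numeric_limits on device.
  return depth.has_value() ? *depth : cuda::std::numeric_limits<int>::max();
}

__global__ void kernel(int* out)
{
  out[0] = depth_or_sentinel(cuda::std::optional<int>{3});  // 3
  out[1] = depth_or_sentinel(cuda::std::nullopt);           // INT_MAX sentinel
}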
@@ -882,10 +882,10 @@ struct preprocessed_table {
    * @return Device array containing respective column orders. If no explicit column orders were
    *         specified during the creation of this object then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<order const>> column_order() const
+  [[nodiscard]] cuda::std::optional<device_span<order const>> column_order() const
   {
-    return _column_order.size() ? std::optional<device_span<order const>>(_column_order)
-                                : std::nullopt;
+    return _column_order.size() ? cuda::std::optional<device_span<order const>>(_column_order)
+                                : cuda::std::nullopt;
   }
 
   /**
@@ -895,10 +895,11 @@ struct preprocessed_table {
    * @return Device array containing respective column null precedence. If no explicit column null
    *         precedences were specified during the creation of this object then this will be
    *         `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<null_order const>> null_precedence() const
+  [[nodiscard]] cuda::std::optional<device_span<null_order const>> null_precedence() const
   {
-    return _null_precedence.size() ? std::optional<device_span<null_order const>>(_null_precedence)
-                                   : std::nullopt;
+    return _null_precedence.size()
+             ? cuda::std::optional<device_span<null_order const>>(_null_precedence)
+             : cuda::std::nullopt;
   }
 
   /**
@@ -909,9 +910,10 @@ struct preprocessed_table {
    * @return std::optional<device_span<int const>> Device array containing respective column depths.
    *         If there are no nested columns in the table then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<int const>> depths() const
+  [[nodiscard]] cuda::std::optional<device_span<int const>> depths() const
   {
-    return _depths.size() ? std::optional<device_span<int const>>(_depths) : std::nullopt;
+    return _depths.size() ? cuda::std::optional<device_span<int const>>(_depths)
+                          : cuda::std::nullopt;
   }
 
   [[nodiscard]] device_span<detail::dremel_device_view const> dremel_device_views() const
@@ -940,8 +942,8 @@ struct preprocessed_table {
   rmm::device_uvector<int> const _depths;
 
   // Dremel encoding of list columns used for the comparison algorithm
-  std::optional<std::vector<detail::dremel_data>> _dremel_data;
-  std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
+  cuda::std::optional<std::vector<detail::dremel_data>> _dremel_data;
+  cuda::std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
 
   // Intermediate columns generated from transforming nested children columns into
   // integers columns using `cudf::rank()`, need to be kept alive.
@@ -1808,7 +1810,7 @@ class element_hasher {
   __device__ element_hasher(
     Nullate nulls,
     uint32_t seed             = DEFAULT_HASH_SEED,
-    hash_value_type null_hash = std::numeric_limits<hash_value_type>::max()) noexcept
+    hash_value_type null_hash = cuda::std::numeric_limits<hash_value_type>::max()) noexcept
     : _check_nulls(nulls), _seed(seed), _null_hash(null_hash)
   {
   }
@@ -1892,7 +1894,7 @@ class device_row_hasher {
    */
   template