From 79a986067688c18b3d431c7a3acc23e2307fb668 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 22:02:06 -0800 Subject: [PATCH] Remove cudf._lib.reshape in favor of inlining pylibcudf (#17368) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17368 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/reshape.pyx | 35 -------------- python/cudf/cudf/core/dataframe.py | 61 +++++++++++++++++------- python/cudf/cudf/core/groupby/groupby.py | 15 +++++- python/cudf/cudf/core/indexed_frame.py | 22 +++++++-- 6 files changed, 75 insertions(+), 60 deletions(-) delete mode 100644 python/cudf/cudf/_lib/reshape.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 8d3af5205fa..61d3bcbe24e 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -30,7 +30,6 @@ set(cython_sources parquet.pyx reduce.pyx replace.pyx - reshape.pyx round.pyx scalar.pyx sort.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 1e0bf931c97..efa437eebb7 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -18,7 +18,6 @@ parquet, reduce, replace, - reshape, round, sort, stream_compaction, diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx deleted file mode 100644 index 6cebeb2bc16..00000000000 --- a/python/cudf/cudf/_lib/reshape.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def interleave_columns(list source_columns): - return Column.from_pylibcudf( - plc.reshape.interleave_columns( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]) - ) - ) - - -@acquire_spill_lock() -def tile(list source_columns, size_type count): - cdef size_type c_count = count - - return columns_from_pylibcudf_table( - plc.reshape.tile( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - c_count - ) - ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d523d2c5ad..73c0af45293 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7251,13 +7251,22 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): repeated_index = self.index.repeat(len(unique_named_levels)) # Each column name should tile itself by len(df) times - tiled_index = libcudf.reshape.tile( - [ - as_column(unique_named_levels.get_level_values(i)) - for i in range(unique_named_levels.nlevels) - ], - self.shape[0], - ) + with acquire_spill_lock(): + plc_table = plc.reshape.tile( + plc.Table( + [ + as_column( + unique_named_levels.get_level_values(i) + ).to_pylibcudf(mode="read") + for i in range(unique_named_levels.nlevels) + ] + ), + self.shape[0], + ) + tiled_index = [ + libcudf.column.Column.from_pylibcudf(plc) + for plc in plc_table.columns() + ] # Assemble the final index new_index_columns = [*repeated_index._columns, *tiled_index] @@ -7271,7 +7280,6 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): data=range(self._num_columns), index=named_levels ) - column_indices: list[list[int]] = [] if has_unnamed_levels: unnamed_level_values = list( map(column_name_idx.get_level_values, unnamed_levels_indices) @@ -7307,13 +7315,11 @@ def unnamed_group_generator(): else: yield column_idx_df.sort_index().values - column_indices = list(unnamed_group_generator()) - # For each of the group constructed from the unnamed levels, # invoke `interleave_columns` to stack the values. stacked = [] - for column_idx in column_indices: + for column_idx in unnamed_group_generator(): # Collect columns based on indices, append None for -1 indices. columns = [ None if i == -1 else self._data.select_by_index(i).columns[0] @@ -7332,12 +7338,23 @@ def unnamed_group_generator(): ) # homogenize the dtypes of the columns - homogenized = [ + homogenized = ( col.astype(common_type) if col is not None else all_nulls() for col in columns - ] + ) - stacked.append(libcudf.reshape.interleave_columns(homogenized)) + with acquire_spill_lock(): + interleaved_col = libcudf.column.Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in homogenized + ] + ) + ) + ) + stacked.append(interleaved_col) # Construct the resulting dataframe / series if not has_unnamed_levels: @@ -7838,10 +7855,18 @@ def interleave_columns(self): raise ValueError( "interleave_columns does not support 'category' dtype." ) - - return self._constructor_sliced._from_column( - libcudf.reshape.interleave_columns([*self._columns]) - ) + with acquire_spill_lock(): + result_col = libcudf.column.Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in self._columns + ] + ) + ) + ) + return self._constructor_sliced._from_column(result_col) @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e59b948aba9..b274bdea76d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,17 +14,19 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf from cudf import _lib as libcudf from cudf._lib import groupby as libgroupby from cudf._lib.null_mask import bitmask_or -from cudf._lib.reshape import interleave_columns from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable +from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, StructDtype, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import GatherMap @@ -2201,6 +2203,17 @@ def _cov_or_corr(self, func, method_name): # interleave: combines the correlation or covariance results for each # column-pair into a single column + + @acquire_spill_lock() + def interleave_columns(source_columns): + return libcudf.column.Column.from_pylibcudf( + plc.reshape.interleave_columns( + plc.Table( + [c.to_pylibcudf(mode="read") for c in source_columns] + ) + ) + ) + res = cudf.DataFrame._from_data( { x: interleave_columns([gb_cov_corr._data[y] for y in ys]) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4e839aaeb6a..2f8c2587937 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -5349,7 +5349,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): ) @_performance_tracking - def tile(self, count): + def tile(self, count: int): """Repeats the rows `count` times to form a new Frame. Parameters @@ -5373,10 +5373,24 @@ def tile(self, count): ------- The indexed frame containing the tiled "rows". """ + with acquire_spill_lock(): + plc_table = plc.reshape.tile( + plc.Table( + [ + col.to_pylibcudf(mode="read") + for col in itertools.chain( + self.index._columns, self._columns + ) + ] + ), + count, + ) + tiled = [ + libcudf.column.Column.from_pylibcudf(plc) + for plc in plc_table.columns() + ] return self._from_columns_like_self( - libcudf.reshape.tile( - [*self.index._columns, *self._columns], count - ), + tiled, column_names=self._column_names, index_names=self._index_names, )