Skip to content

Commit

Permalink
Remove cudf._lib.reshape in favor of inlining pylibcudf (#17368)
Browse files Browse the repository at this point in the history
Contributes to #17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Murray (https://github.com/Matt711)

URL: #17368
  • Loading branch information
mroeschke authored Nov 26, 2024
1 parent ab36fc6 commit 79a9860
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 60 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ set(cython_sources
parquet.pyx
reduce.pyx
replace.pyx
reshape.pyx
round.pyx
scalar.pyx
sort.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
parquet,
reduce,
replace,
reshape,
round,
sort,
stream_compaction,
Expand Down
35 changes: 0 additions & 35 deletions python/cudf/cudf/_lib/reshape.pyx

This file was deleted.

61 changes: 43 additions & 18 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7251,13 +7251,22 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
repeated_index = self.index.repeat(len(unique_named_levels))

# Each column name should be tiled len(df) times
tiled_index = libcudf.reshape.tile(
[
as_column(unique_named_levels.get_level_values(i))
for i in range(unique_named_levels.nlevels)
],
self.shape[0],
)
with acquire_spill_lock():
plc_table = plc.reshape.tile(
plc.Table(
[
as_column(
unique_named_levels.get_level_values(i)
).to_pylibcudf(mode="read")
for i in range(unique_named_levels.nlevels)
]
),
self.shape[0],
)
tiled_index = [
libcudf.column.Column.from_pylibcudf(plc)
for plc in plc_table.columns()
]

# Assemble the final index
new_index_columns = [*repeated_index._columns, *tiled_index]
Expand All @@ -7271,7 +7280,6 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
data=range(self._num_columns), index=named_levels
)

column_indices: list[list[int]] = []
if has_unnamed_levels:
unnamed_level_values = list(
map(column_name_idx.get_level_values, unnamed_levels_indices)
Expand Down Expand Up @@ -7307,13 +7315,11 @@ def unnamed_group_generator():
else:
yield column_idx_df.sort_index().values

column_indices = list(unnamed_group_generator())

# For each of the groups constructed from the unnamed levels,
# invoke `interleave_columns` to stack the values.
stacked = []

for column_idx in column_indices:
for column_idx in unnamed_group_generator():
# Collect columns based on indices, append None for -1 indices.
columns = [
None if i == -1 else self._data.select_by_index(i).columns[0]
Expand All @@ -7332,12 +7338,23 @@ def unnamed_group_generator():
)

# homogenize the dtypes of the columns
homogenized = [
homogenized = (
col.astype(common_type) if col is not None else all_nulls()
for col in columns
]
)

stacked.append(libcudf.reshape.interleave_columns(homogenized))
with acquire_spill_lock():
interleaved_col = libcudf.column.Column.from_pylibcudf(
plc.reshape.interleave_columns(
plc.Table(
[
col.to_pylibcudf(mode="read")
for col in homogenized
]
)
)
)
stacked.append(interleaved_col)

# Construct the resulting dataframe / series
if not has_unnamed_levels:
Expand Down Expand Up @@ -7838,10 +7855,18 @@ def interleave_columns(self):
raise ValueError(
"interleave_columns does not support 'category' dtype."
)

return self._constructor_sliced._from_column(
libcudf.reshape.interleave_columns([*self._columns])
)
with acquire_spill_lock():
result_col = libcudf.column.Column.from_pylibcudf(
plc.reshape.interleave_columns(
plc.Table(
[
col.to_pylibcudf(mode="read")
for col in self._columns
]
)
)
)
return self._constructor_sliced._from_column(result_col)

@_performance_tracking
def eval(self, expr: str, inplace: bool = False, **kwargs):
Expand Down
15 changes: 14 additions & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,19 @@
import numpy as np
import pandas as pd

import pylibcudf as plc

import cudf
from cudf import _lib as libcudf
from cudf._lib import groupby as libgroupby
from cudf._lib.null_mask import bitmask_or
from cudf._lib.reshape import interleave_columns
from cudf._lib.sort import segmented_sort_by_key
from cudf._lib.types import size_type_dtype
from cudf.api.extensions import no_default
from cudf.api.types import is_list_like, is_numeric_dtype
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column.column import ColumnBase, StructDtype, as_column
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.copy_types import GatherMap
Expand Down Expand Up @@ -2201,6 +2203,17 @@ def _cov_or_corr(self, func, method_name):

# interleave: combines the correlation or covariance results for each
# column-pair into a single column

@acquire_spill_lock()
def interleave_columns(source_columns):
    """Combine ``source_columns`` into a single column.

    Every input column is converted with ``to_pylibcudf(mode="read")``,
    the converted columns are wrapped in a ``plc.Table`` and handed to
    ``plc.reshape.interleave_columns``, and the pylibcudf result is
    turned back into a cudf column via ``Column.from_pylibcudf``.

    The function is decorated with ``acquire_spill_lock()``, so the
    whole call runs under that lock.
    """
    plc_inputs = [
        column.to_pylibcudf(mode="read") for column in source_columns
    ]
    interleaved = plc.reshape.interleave_columns(plc.Table(plc_inputs))
    return libcudf.column.Column.from_pylibcudf(interleaved)

res = cudf.DataFrame._from_data(
{
x: interleave_columns([gb_cov_corr._data[y] for y in ys])
Expand Down
22 changes: 18 additions & 4 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5349,7 +5349,7 @@ def _explode(self, explode_column: Any, ignore_index: bool):
)

@_performance_tracking
def tile(self, count):
def tile(self, count: int):
"""Repeats the rows `count` times to form a new Frame.
Parameters
Expand All @@ -5373,10 +5373,24 @@ def tile(self, count):
-------
The indexed frame containing the tiled "rows".
"""
with acquire_spill_lock():
plc_table = plc.reshape.tile(
plc.Table(
[
col.to_pylibcudf(mode="read")
for col in itertools.chain(
self.index._columns, self._columns
)
]
),
count,
)
tiled = [
libcudf.column.Column.from_pylibcudf(plc)
for plc in plc_table.columns()
]
return self._from_columns_like_self(
libcudf.reshape.tile(
[*self.index._columns, *self._columns], count
),
tiled,
column_names=self._column_names,
index_names=self._index_names,
)
Expand Down

0 comments on commit 79a9860

Please sign in to comment.