Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to modify and propagate names of columns object #17597

Merged
merged 9 commits into from
Dec 19, 2024
Merged
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1446,7 +1446,7 @@ def _union(self, other, sort=None):
other_df["order"] = other_df.index
res = self_df.merge(other_df, on=[0], how="outer")
res = res.sort_values(
by=res._data.to_pandas_index()[1:], ignore_index=True
by=res._data.to_pandas_index[1:], ignore_index=True
)
union_result = cudf.core.index._index_from_data({0: res._data[0]})

Expand Down
17 changes: 14 additions & 3 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,11 +207,16 @@ def _from_columns_like_self(

@property
def level_names(self) -> tuple[abc.Hashable, ...]:
if self.is_cached("to_pandas_index"):
return self.to_pandas_index.names
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any particular reason we want/need to use the cached to_pandas_index here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes — when updates are made to the pandas object created by to_pandas_index, we need the updated names; that is what this piece of code handles.

if self._level_names is None or len(self._level_names) == 0:
return tuple((None,) * max(1, self.nlevels))
else:
return self._level_names

def is_cached(self, attr_name: str) -> bool:
return attr_name in self.__dict__

@property
def nlevels(self) -> int:
if len(self) == 0:
Expand Down Expand Up @@ -262,7 +267,12 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
new_ncols: int
len(self) after self._data was modified
"""
cached_properties = ("columns", "names", "_grouped_data")
cached_properties = (
"columns",
"names",
"_grouped_data",
"to_pandas_index",
)
for attr in cached_properties:
try:
self.__delattr__(attr)
Expand All @@ -276,6 +286,7 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
except AttributeError:
pass

@cached_property
def to_pandas_index(self) -> pd.Index:
"""Convert the keys of the ColumnAccessor to a Pandas Index object."""
if self.multiindex and len(self.level_names) > 0:
Expand Down Expand Up @@ -726,10 +737,10 @@ def droplevel(self, level: int) -> None:
}
new_ncols = len(self)
self._level_names = (
self._level_names[:level] + self._level_names[level + 1 :]
self.level_names[:level] + self.level_names[level + 1 :]
)

if len(self._level_names) == 1:
if len(self.level_names) == 1:
# can't use nlevels, as it depends on multiindex
self.multiindex = False
self._clear_cache(old_ncols, new_ncols)
Expand Down
46 changes: 23 additions & 23 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,7 +961,7 @@ def _init_from_series_list(self, data, columns, index):
warnings.simplefilter("ignore", FutureWarning)
concat_df = cudf.concat(data, axis=1)

cols = concat_df._data.to_pandas_index()
cols = concat_df._data.to_pandas_index
if cols.dtype == "object":
concat_df.columns = cols.astype("str")

Expand Down Expand Up @@ -2068,7 +2068,7 @@ def _make_operands_and_index_for_binop(
equal_columns = True
elif isinstance(other, Series):
if (
not (self_pd_columns := self._data.to_pandas_index()).equals(
not (self_pd_columns := self._data.to_pandas_index).equals(
other_pd_index := other.index.to_pandas()
)
and not can_reindex
Expand All @@ -2093,8 +2093,8 @@ def _make_operands_and_index_for_binop(
and fn in cudf.utils.utils._EQUALITY_OPS
and (
not self.index.equals(other.index)
or not self._data.to_pandas_index().equals(
other._data.to_pandas_index()
or not self._data.to_pandas_index.equals(
other._data.to_pandas_index
)
)
):
Expand Down Expand Up @@ -2138,11 +2138,11 @@ def _make_operands_and_index_for_binop(

if not equal_columns:
if isinstance(other, DataFrame):
column_names_list = self._data.to_pandas_index().join(
other._data.to_pandas_index(), how="outer"
column_names_list = self._data.to_pandas_index.join(
other._data.to_pandas_index, how="outer"
)
elif isinstance(other, Series):
column_names_list = self._data.to_pandas_index().join(
column_names_list = self._data.to_pandas_index.join(
other.index.to_pandas(), how="outer"
)
else:
Expand Down Expand Up @@ -2602,8 +2602,8 @@ def update(
if not isinstance(other, DataFrame):
other = DataFrame(other)

self_cols = self._data.to_pandas_index()
if not self_cols.equals(other._data.to_pandas_index()):
self_cols = self._data.to_pandas_index
if not self_cols.equals(other._data.to_pandas_index):
other = other.reindex(self_cols, axis=1)
if not self.index.equals(other.index):
other = other.reindex(self.index, axis=0)
Expand Down Expand Up @@ -2639,7 +2639,7 @@ def __iter__(self):
def __contains__(self, item):
# This must check against containment in the pandas Index and not
# self._column_names to handle NA, None, nan, etc. correctly.
return item in self._data.to_pandas_index()
return item in self._data.to_pandas_index

@_performance_tracking
def items(self):
Expand Down Expand Up @@ -2676,14 +2676,14 @@ def at(self):

@property # type: ignore
@_external_only_api(
"Use _column_names instead, or _data.to_pandas_index() if a pandas "
"Use _column_names instead, or _data.to_pandas_index if a pandas "
"index is absolutely necessary. For checking if the columns are a "
"MultiIndex, use _data.multiindex."
)
@_performance_tracking
def columns(self):
"""Returns a tuple of columns"""
return self._data.to_pandas_index()
return self._data.to_pandas_index

@columns.setter # type: ignore
@_performance_tracking
Expand Down Expand Up @@ -2892,7 +2892,7 @@ def reindex(
df = self
else:
columns = cudf.Index(columns)
intersection = self._data.to_pandas_index().intersection(
intersection = self._data.to_pandas_index.intersection(
columns.to_pandas()
)
df = self.loc[:, intersection]
Expand Down Expand Up @@ -3409,7 +3409,7 @@ def axes(self):
Index(['key', 'k2', 'val', 'temp'], dtype='object')]

"""
return [self.index, self._data.to_pandas_index()]
return [self.index, self._data.to_pandas_index]

def diff(self, periods=1, axis=0):
"""
Expand Down Expand Up @@ -4108,7 +4108,7 @@ def transpose(self):
Not supporting *copy* because default and only behavior is
copy=True
"""
index = self._data.to_pandas_index()
index = self._data.to_pandas_index
columns = self.index.copy(deep=False)
if self._num_columns == 0 or self._num_rows == 0:
return DataFrame(index=index, columns=columns)
Expand Down Expand Up @@ -5514,7 +5514,7 @@ def to_pandas(
}

out_df = pd.DataFrame(out_data, index=out_index)
out_df.columns = self._data.to_pandas_index()
out_df.columns = self._data.to_pandas_index

return out_df

Expand Down Expand Up @@ -6466,7 +6466,7 @@ def _reduce(
source = self._get_columns_by_label(numeric_cols)
if source.empty:
return Series(
index=self._data.to_pandas_index()[:0]
index=self._data.to_pandas_index[:0]
if axis == 0
else source.index,
dtype="float64",
Expand Down Expand Up @@ -6519,7 +6519,7 @@ def _reduce(
"Columns must all have the same dtype to "
f"perform {op=} with {axis=}"
)
pd_index = source._data.to_pandas_index()
pd_index = source._data.to_pandas_index
if source._data.multiindex:
idx = MultiIndex.from_pandas(pd_index)
else:
Expand Down Expand Up @@ -7221,7 +7221,7 @@ def stack(
]
has_unnamed_levels = len(unnamed_levels_indices) > 0

column_name_idx = self._data.to_pandas_index()
column_name_idx = self._data.to_pandas_index
# Construct new index from the levels specified by `level`
named_levels = pd.MultiIndex.from_arrays(
[column_name_idx.get_level_values(lv) for lv in level_indices]
Expand Down Expand Up @@ -7411,7 +7411,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False):
)

cov = cupy.cov(self.values, ddof=ddof, rowvar=False)
cols = self._data.to_pandas_index()
cols = self._data.to_pandas_index
df = DataFrame(cupy.asfortranarray(cov), index=cols)
df._set_columns_like(self._data)
return df
Expand Down Expand Up @@ -7454,7 +7454,7 @@ def corr(
)

corr = cupy.corrcoef(values, rowvar=False)
cols = self._data.to_pandas_index()
cols = self._data.to_pandas_index
df = DataFrame(cupy.asfortranarray(corr), index=cols)
df._set_columns_like(self._data)
return df
Expand Down Expand Up @@ -7523,7 +7523,7 @@ def keys(self):
>>> df.keys()
Index([0, 1, 2, 3], dtype='int64')
"""
return self._data.to_pandas_index()
return self._data.to_pandas_index

def itertuples(self, index=True, name="Pandas"):
"""
Expand Down Expand Up @@ -7757,7 +7757,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series:
raise NotImplementedError("axis parameter is not supported yet.")
counts = [col.distinct_count(dropna=dropna) for col in self._columns]
return self._constructor_sliced(
counts, index=self._data.to_pandas_index()
counts, index=self._data.to_pandas_index
)

def _sample_axis_1(
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3084,7 +3084,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):

# drop the first level if we have a multiindex
if result._data.nlevels > 1:
result.columns = result._data.to_pandas_index().droplevel(0)
result.columns = result._data.to_pandas_index.droplevel(0)

return result

Expand Down
12 changes: 5 additions & 7 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,13 +1104,11 @@ def dot(self, other, reflect=False):
lhs = self.reindex(index=common, copy=False).values
rhs = other.reindex(index=common, copy=False).values
if isinstance(other, cudf.DataFrame):
result_index = other._data.to_pandas_index()
result_index = other._data.to_pandas_index
elif isinstance(self, cudf.DataFrame) and isinstance(
other, (cudf.Series, cudf.DataFrame)
):
common = self._data.to_pandas_index().union(
other.index.to_pandas()
)
common = self._data.to_pandas_index.union(other.index.to_pandas())
if len(common) > self._num_columns or len(common) > len(
other.index
):
Expand All @@ -1122,7 +1120,7 @@ def dot(self, other, reflect=False):
rhs = other.reindex(index=common, copy=False).values
lhs = lhs.values
if isinstance(other, cudf.DataFrame):
result_cols = other._data.to_pandas_index()
result_cols = other._data.to_pandas_index

elif isinstance(
other, (cp.ndarray, np.ndarray)
Expand Down Expand Up @@ -2242,7 +2240,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True):
if not copy:
raise ValueError("Truncating with copy=False is not supported.")
axis = self._get_axis_from_axis_arg(axis)
ax = self.index if axis == 0 else self._data.to_pandas_index()
ax = self.index if axis == 0 else self._data.to_pandas_index

if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
raise ValueError("truncate requires a sorted index")
Expand Down Expand Up @@ -6735,7 +6733,7 @@ def _drop_rows_by_labels(
return obj.__class__._from_data(
join_res.iloc[:, idx_nlv:]._data,
index=midx,
columns=obj._data.to_pandas_index(),
columns=obj._data.to_pandas_index,
)

else:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,7 +1125,7 @@ def _concat(cls, objs) -> Self:
# TODO: Verify if this is really necessary or if we can rely on
# DataFrame._concat.
if len(source_data) > 1:
colnames = source_data[0]._data.to_pandas_index()
colnames = source_data[0]._data.to_pandas_index
for obj in source_data[1:]:
obj.columns = colnames

Expand Down Expand Up @@ -2070,7 +2070,7 @@ def _union(self, other, sort=None) -> Self:

result_df = self_df.merge(other_df, on=col_names, how="outer")
result_df = result_df.sort_values(
by=result_df._data.to_pandas_index()[self.nlevels :],
by=result_df._data.to_pandas_index[self.nlevels :],
ignore_index=True,
)

Expand Down
10 changes: 6 additions & 4 deletions python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,8 +431,9 @@ def concat(

result_columns = (
objs[0]
._data.to_pandas_index()
.append([obj._data.to_pandas_index() for obj in objs[1:]])
._data.to_pandas_index.append(
[obj._data.to_pandas_index for obj in objs[1:]]
)
.unique()
)

Expand Down Expand Up @@ -689,7 +690,7 @@ def _tile(A, reps):
if not value_vars:
# TODO: Use frame._data.label_dtype when it's more consistently set
var_data = cudf.Series(
value_vars, dtype=frame._data.to_pandas_index().dtype
value_vars, dtype=frame._data.to_pandas_index.dtype
)
else:
var_data = (
Expand Down Expand Up @@ -1096,6 +1097,7 @@ def pivot(
2 <NA> <NA> three

"""
# import pdb;pdb.set_trace()
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
values_is_list = True
if values is no_default:
already_selected = set(
Expand Down Expand Up @@ -1272,7 +1274,7 @@ def unstack(df, level, fill_value=None, sort: bool = True):
res = df.T.stack(future_stack=False)
# Result's index is a multiindex
res.index.names = (
tuple(df._data.to_pandas_index().names) + df.index.names
tuple(df._data.to_pandas_index.names) + df.index.names
)
return res
else:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/testing/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,8 +692,8 @@ def assert_frame_equal(
)

pd.testing.assert_index_equal(
left._data.to_pandas_index(),
right._data.to_pandas_index(),
left._data.to_pandas_index,
right._data.to_pandas_index,
exact=check_column_type,
check_names=check_names,
check_exact=check_exact,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_to_pandas_simple(simple_data):
# Index([], dtype='object'), and `integer` for RangeIndex()
# to ignore this `inferred_type` comparison, we pass exact=False.
assert_eq(
ca.to_pandas_index(),
ca.to_pandas_index,
pd.DataFrame(
{key: value.values_host for key, value in simple_data.items()}
).columns,
Expand All @@ -75,7 +75,7 @@ def test_to_pandas_simple(simple_data):
def test_to_pandas_multiindex(mi_data):
ca = ColumnAccessor(mi_data, multiindex=True)
assert_eq(
ca.to_pandas_index(),
ca.to_pandas_index,
pd.DataFrame(
{key: value.values_host for key, value in mi_data.items()}
).columns,
Expand All @@ -89,7 +89,7 @@ def test_to_pandas_multiindex_names():
level_names=("foo", "bar"),
)
assert_eq(
ca.to_pandas_index(),
ca.to_pandas_index,
pd.MultiIndex.from_tuples(
(("a", "b"), ("c", "d")), names=("foo", "bar")
),
Expand Down
Loading
Loading