Skip to content

Commit

Permalink
Make Frame._dtypes an iterator instead of a dict (#15920)
Browse files Browse the repository at this point in the history
Many usages of `Frame._dtypes` didn't require the previous `dict` return type, since the result was simply iterated over again anyway.

Also removed the redundant `tuple` calls in `Frame._column_names` and `Frame._columns`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15920
  • Loading branch information
mroeschke authored Jun 5, 2024
1 parent db1b365 commit 57aeeb7
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 35 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1231,7 +1231,7 @@ def dtypes(self):
string object
dtype: object
"""
return pd.Series(self._dtypes, dtype="object")
return pd.Series(dict(self._dtypes), dtype="object")

@property
def ndim(self) -> int:
Expand Down Expand Up @@ -2834,7 +2834,7 @@ def reindex(

return df._reindex(
column_names=columns,
dtypes=self._dtypes,
dtypes=dict(self._dtypes),
deep=copy,
index=index,
inplace=False,
Expand Down
16 changes: 7 additions & 9 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,18 +79,16 @@ def _num_rows(self) -> int:
return self._data.nrows

@property
def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]?
return tuple(self._data.names)
def _column_names(self) -> Tuple[Any, ...]:
return self._data.names

@property
def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]?
return tuple(self._data.columns)
def _columns(self) -> Tuple[ColumnBase, ...]:
return self._data.columns

@property
def _dtypes(self):
return dict(
zip(self._data.names, (col.dtype for col in self._data.columns))
)
def _dtypes(self) -> abc.Iterator:
return zip(self._data.names, (col.dtype for col in self._data.columns))

@property
def ndim(self) -> int:
Expand Down Expand Up @@ -1969,7 +1967,7 @@ def __dask_tokenize__(self):

return [
type(self),
str(self._dtypes),
str(dict(self._dtypes)),
normalize_token(self.to_pandas()),
]

Expand Down
16 changes: 3 additions & 13 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,7 @@
from cudf._lib.types import size_type_dtype
from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
from cudf.api.extensions import no_default
from cudf.api.types import (
is_bool_dtype,
is_float_dtype,
is_list_like,
is_numeric_dtype,
)
from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype
from cudf.core._compat import PANDAS_LT_300
from cudf.core.abc import Serializable
from cudf.core.column.column import ColumnBase, StructDtype, as_column
Expand Down Expand Up @@ -335,12 +330,8 @@ def dtypes(self):
FutureWarning,
)
index = self.grouping.keys.unique().sort_values().to_pandas()
obj_dtypes = self.obj._dtypes
return pd.DataFrame(
{
name: [obj_dtypes[name]] * len(index)
for name in self.obj._data.names
},
{name: [dtype] * len(index) for name, dtype in self.obj._dtypes},
index=index,
)

Expand Down Expand Up @@ -499,8 +490,7 @@ def rank(
# treats NaNs the way we treat nulls.
if cudf.get_option("mode.pandas_compatible"):
if any(
is_float_dtype(typ)
for typ in self.grouping.values._dtypes.values()
col.dtype.kind == "f" for col in self.grouping.values._columns
):
raise NotImplementedError(
"NaNs are not supported in groupby.rank."
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,7 +891,7 @@ def replace(
) = _get_replacement_values_for_columns(
to_replace=to_replace,
value=value,
columns_dtype_map=self._dtypes,
columns_dtype_map=dict(self._dtypes),
)

for name, col in self._data.items():
Expand Down Expand Up @@ -6313,11 +6313,11 @@ def __dask_tokenize__(self):

return [
type(self),
str(self._dtypes),
str(dict(self._dtypes)),
*[
normalize_token(cat.categories)
for cat in self._dtypes.values()
if cat == "category"
normalize_token(col.dtype.categories)
for col in self._columns
if col.dtype == "category"
],
normalize_token(self.index),
normalize_token(self.hash_values().values_host),
Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,9 @@ def read_csv(
# There exists some dtypes in the result columns that is inferred.
# Find them and map them to the default dtypes.
specified_dtypes = {} if dtype is None else dtype
df_dtypes = df._dtypes
unspecified_dtypes = {
name: df_dtypes[name]
for name in df._column_names
name: dtype
for name, dtype in df._dtypes
if name not in specified_dtypes
}
default_dtypes = {}
Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,9 @@ def read_json(
# There exists some dtypes in the result columns that is inferred.
# Find them and map them to the default dtypes.
specified_dtypes = {} if dtype is True else dtype
df_dtypes = df._dtypes
unspecified_dtypes = {
name: df_dtypes[name]
for name in df._column_names
name: dtype
for name, dtype in df._dtypes
if name not in specified_dtypes
}
default_dtypes = {}
Expand Down

0 comments on commit 57aeeb7

Please sign in to comment.