From 47e49d04281da3f488bc0d954b366b272c08d316 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Dec 2024 12:55:18 -0800 Subject: [PATCH] Fix groupby(as_index=False).size not resetting index (#17499) closes #17478 Also fixes a bug where the `Series.name` attribute wasn't preserved with `size` Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17499 --- python/cudf/cudf/core/groupby/groupby.py | 7 +++++-- python/cudf/cudf/tests/test_groupby.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 29ab3b60d9d..0f12f266a95 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -497,11 +497,14 @@ def size(self): col = cudf.core.column.column_empty( len(self.obj), "int8", masked=False ) - return ( - cudf.Series._from_column(col) + result = ( + cudf.Series._from_column(col, name=getattr(self.obj, "name", None)) .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) + if not self._as_index: + result = result.rename("size").reset_index() + return result @_performance_tracking def cumcount(self, ascending: bool = True): diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index eae0fd23ef8..d8a2528230e 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -4074,3 +4074,17 @@ def test_get_group_list_like(): with pytest.raises(KeyError): df.groupby(["a"]).get_group([1]) + + +def test_size_as_index_false(): + df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 2, 3]}, columns=["a", "b"]) + expected = 
df.groupby("a", as_index=False).size() + result = cudf.from_pandas(df).groupby("a", as_index=False).size() + assert_eq(result, expected) + + +def test_size_series_with_name(): + ser = pd.Series(range(3), name="foo") + expected = ser.groupby(ser).size() + result = cudf.from_pandas(ser).groupby(ser).size() + assert_eq(result, expected)