From 79cd473f8ec18d1f0abed3faa6dd8d61f54bf384 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Fri, 26 Apr 2024 20:51:03 +0200 Subject: [PATCH] Minor fixups for future NumPy 2 compatibility (#15590) These are some small fixes for compatibility with the Python API changes in NumPy 2, as pointed out by `ruff` with the `NPY201` rule. I am not really happy with `_NUMPY_SCTYPES` (it reaches into what is now private, but I figured that others will also do so for a while; it feels like we should add a better way to do this in NumPy before removing it). Listing the full set explicitly is also a bit ugly/convoluted, but I am happy to do so instead. (I was hoping to get a bit further with testing against the NumPy 2rc, but unfortunately the `numba` dependency makes that at least difficult.) Authors: - Sebastian Berg (https://github.com/seberg) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15590 --- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/frame.py | 12 ++++---- python/cudf/cudf/core/index.py | 6 +++- python/cudf/cudf/tests/test_api_types.py | 33 ---------------------- python/cudf/cudf/tests/test_categorical.py | 4 +-- python/cudf/cudf/tests/test_dataframe.py | 16 +++++------ python/cudf/cudf/tests/test_parquet.py | 2 +- python/cudf/cudf/tests/test_stats.py | 4 +-- python/cudf/cudf/utils/dtypes.py | 8 ++++-- 9 files changed, 31 insertions(+), 56 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 0862995bc46..8143e7919a7 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -692,7 +692,7 @@ def contains( Returning an Index of booleans using only a literal pattern. 
- >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN] + >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.nan] >>> idx = cudf.Index(data) >>> idx Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object') diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 01842b5f0a9..cd42bf52ea1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1077,7 +1077,7 @@ def isna(self): >>> import cudf >>> import numpy as np >>> import pandas as pd - >>> df = cudf.DataFrame({'age': [5, 6, np.NaN], + >>> df = cudf.DataFrame({'age': [5, 6, np.nan], ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... 'name': ['Alfred', 'Batman', ''], @@ -1095,7 +1095,7 @@ def isna(self): Show which entries in a Series are NA. - >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf]) + >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf]) >>> ser 0 5.0 1 6.0 @@ -1113,7 +1113,7 @@ def isna(self): Show which entries in an Index are NA. - >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) + >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf]) >>> idx Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.isna() @@ -1156,7 +1156,7 @@ def notna(self): >>> import cudf >>> import numpy as np >>> import pandas as pd - >>> df = cudf.DataFrame({'age': [5, 6, np.NaN], + >>> df = cudf.DataFrame({'age': [5, 6, np.nan], ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... 'name': ['Alfred', 'Batman', ''], @@ -1174,7 +1174,7 @@ def notna(self): Show which entries in a Series are NA. - >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf]) + >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf]) >>> ser 0 5.0 1 6.0 @@ -1192,7 +1192,7 @@ def notna(self): Show which entries in an Index are NA. 
- >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf]) + >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf]) >>> idx Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') >>> idx.notna() diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 6c0acdc5fb0..f55fa4c05b5 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -60,6 +60,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -344,7 +345,10 @@ def _data(self): @_cudf_nvtx_annotate def __contains__(self, item): if isinstance(item, bool) or not isinstance( - item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float]) + item, + tuple( + _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] + ), ): return False try: diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py index 9436d65e0b7..4abe210c6ea 100644 --- a/python/cudf/cudf/tests/test_api_types.py +++ b/python/cudf/cudf/tests/test_api_types.py @@ -33,7 +33,6 @@ (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -42,7 +41,6 @@ (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. 
@@ -61,7 +59,6 @@ (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -142,7 +139,6 @@ def test_is_categorical_dtype(obj, expect): (np.float64, True), (np.complex128, True), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -151,7 +147,6 @@ def test_is_categorical_dtype(obj, expect): (np.float64(), True), (np.complex128(), True), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -170,7 +165,6 @@ def test_is_categorical_dtype(obj, expect): (np.array([], dtype=np.float64), True), (np.array([], dtype=np.complex128), True), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -247,7 +241,6 @@ def test_is_numeric_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -256,7 +249,6 @@ def test_is_numeric_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. 
@@ -275,7 +267,6 @@ def test_is_numeric_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -352,7 +343,6 @@ def test_is_integer_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -361,7 +351,6 @@ def test_is_integer_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -380,7 +369,6 @@ def test_is_integer_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -458,7 +446,6 @@ def test_is_integer(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, True), - (np.unicode_, True), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -467,7 +454,6 @@ def test_is_integer(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), True), - (np.unicode_(), True), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. 
@@ -486,7 +472,6 @@ def test_is_integer(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), True), - (np.array([], dtype=np.unicode_), True), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), # (np.array([], dtype=object), False), @@ -577,7 +562,6 @@ def test_is_string_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, True), (np.timedelta64, False), # NumPy scalars. @@ -586,7 +570,6 @@ def test_is_string_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), True), (np.timedelta64(), False), # NumPy dtype objects. @@ -605,7 +588,6 @@ def test_is_string_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), True), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -682,7 +664,6 @@ def test_is_datetime_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -691,7 +672,6 @@ def test_is_datetime_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. 
@@ -710,7 +690,6 @@ def test_is_datetime_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -787,7 +766,6 @@ def test_is_list_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -796,7 +774,6 @@ def test_is_list_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. @@ -815,7 +792,6 @@ def test_is_list_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -895,7 +871,6 @@ def test_is_struct_dtype(obj, expect): (np.float64, False), (np.complex128, False), (np.str_, False), - (np.unicode_, False), (np.datetime64, False), (np.timedelta64, False), # NumPy scalars. @@ -904,7 +879,6 @@ def test_is_struct_dtype(obj, expect): (np.float64(), False), (np.complex128(), False), (np.str_(), False), - (np.unicode_(), False), (np.datetime64(), False), (np.timedelta64(), False), # NumPy dtype objects. 
@@ -923,7 +897,6 @@ def test_is_struct_dtype(obj, expect): (np.array([], dtype=np.float64), False), (np.array([], dtype=np.complex128), False), (np.array([], dtype=np.str_), False), - (np.array([], dtype=np.unicode_), False), (np.array([], dtype=np.datetime64), False), (np.array([], dtype=np.timedelta64), False), (np.array([], dtype=object), False), @@ -1004,7 +977,6 @@ def test_is_decimal_dtype(obj, expect): np.float64, np.complex128, np.str_, - np.unicode_, np.datetime64, np.timedelta64, # NumPy scalars. @@ -1013,7 +985,6 @@ def test_is_decimal_dtype(obj, expect): np.float64(), np.complex128(), np.str_(), - np.unicode_(), np.datetime64(), np.timedelta64(), # NumPy dtype objects. @@ -1032,7 +1003,6 @@ def test_is_decimal_dtype(obj, expect): np.array([], dtype=np.float64), np.array([], dtype=np.complex128), np.array([], dtype=np.str_), - np.array([], dtype=np.unicode_), np.array([], dtype=np.datetime64), np.array([], dtype=np.timedelta64), np.array([], dtype=object), @@ -1088,7 +1058,6 @@ def test_pandas_agreement(obj): np.float64, np.complex128, np.str_, - np.unicode_, np.datetime64, np.timedelta64, # NumPy scalars. @@ -1097,7 +1066,6 @@ def test_pandas_agreement(obj): np.float64(), np.complex128(), np.str_(), - np.unicode_(), np.datetime64(), np.timedelta64(), # NumPy dtype objects. 
@@ -1116,7 +1084,6 @@ def test_pandas_agreement(obj): np.array([], dtype=np.float64), np.array([], dtype=np.complex128), np.array([], dtype=np.str_), - np.array([], dtype=np.unicode_), np.array([], dtype=np.datetime64), np.array([], dtype=np.timedelta64), np.array([], dtype=object), diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index e21fd53bee4..7aba2e45532 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -460,7 +460,7 @@ def test_categorical_dataframe_slice_copy(): pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"), pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"), + pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), @@ -493,7 +493,7 @@ def test_categorical_typecast(data, cat_type): pd.Series([1, 2, 3, 89]), pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"), + pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), pd.Series([1, 2, 3, 89], dtype="float64"), pd.Series([1, 2.5, 3.001, 89], dtype="float64"), pd.Series([None, None, None]), diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 59e8b41e51a..e287603de07 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -5199,20 +5199,20 @@ def test_df_constructor_dtype(dtype): cudf.DataFrame( { "a": [1, 2, 3, 4], - "b": [7, np.NaN, 9, 10], + "b": [7, np.nan, 9, 10], "c": cudf.Series( - [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False + [np.nan, np.nan, np.nan, np.nan], nan_as_null=False ), 
"d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), } ), cudf.DataFrame( { "a": [10, 11, 12, 13, 14, 15], "b": cudf.Series( - [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False + [10, None, np.nan, 2234, None, np.nan], nan_as_null=False ), } ), @@ -5264,11 +5264,11 @@ def test_rowwise_ops_nullable_dtypes_all_null(op): gdf = cudf.DataFrame( { "a": [1, 2, 3, 4], - "b": [7, np.NaN, 9, 10], - "c": cudf.Series([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float), + "b": [7, np.nan, 9, 10], + "c": cudf.Series([np.nan, np.nan, np.nan, np.nan], dtype=float), "d": cudf.Series([None, None, None, None], dtype="int64"), "e": [100, None, 200, None], - "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False), + "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), } ) @@ -5300,7 +5300,7 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op): { "a": [10, 11, 12, 13, 14, 15], "b": cudf.Series( - [10, None, np.NaN, 2234, None, np.NaN], + [10, None, np.nan, 2234, None, np.nan], nan_as_null=False, ), } diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 56a4281aad9..6fb1d3d8ba5 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -211,7 +211,7 @@ def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): # Randomly but reproducibly mark subset of rows as invalid random.seed(1337) mask = random.sample(range(nrows), nvalids) - test_pdf[test_pdf.index.isin(mask)] = np.NaN + test_pdf[test_pdf.index.isin(mask)] = np.nan if dtype: test_pdf = test_pdf.astype(dtype) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index b9eb42906e8..27811d0fcde 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -507,7 +507,7 @@ def test_df_corr(method): 
@pytest.mark.parametrize( "data", [ - [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100], + [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], [np.nan] * 3, [1, 5, 3], [], @@ -555,7 +555,7 @@ def test_nans_stats(data, ops, skipna): @pytest.mark.parametrize( "data", [ - [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100], + [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], [np.nan] * 3, [1, 5, 3], ], diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index a33b5ca139c..2aa3129ab30 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -91,6 +91,10 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES +# The NumPy scalar types are a bit of a mess as they align with the C types +# so for now we use the `sctypes` dict (although it was made private in 2.0) +_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes + def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -335,7 +339,7 @@ def min_signed_type(x, min_size=8): Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in np.sctypes["int"]: + for int_dtype in _NUMPY_SCTYPES["int"]: if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: return int_dtype @@ -348,7 +352,7 @@ def min_unsigned_type(x, min_size=8): Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in np.sctypes["uint"]: + for int_dtype in _NUMPY_SCTYPES["uint"]: if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: return int_dtype