Skip to content

Commit

Permalink
Handle mixed-like homogeneous types in isin (#15771)
Browse files Browse the repository at this point in the history
Fixes: #15768 

A host array can have `object` dtype yet contain only values of a single homogeneous type. Column constructors still cannot accept such input because `cudf` doesn't have a true `object` type, so this PR introduces a workaround for the problem in the `isin` API.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #15771
  • Loading branch information
galipremsagar authored May 17, 2024
1 parent 6d5f965 commit d10b8e4
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
29 changes: 25 additions & 4 deletions python/cudf/cudf/core/column/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
)
from cudf.core.dtypes import CategoricalDtype
from cudf.core.mixins import BinaryOperand
from cudf.errors import MixedTypeError
from cudf.utils.dtypes import (
min_column_type,
min_signed_type,
Expand Down Expand Up @@ -404,10 +405,30 @@ def _process_values_for_isin(
self, values: Sequence
) -> Tuple[ColumnBase, ColumnBase]:
lhs = cast("cudf.core.column.ColumnBase", self)
rhs = as_column(values, nan_as_null=False)

if isinstance(rhs, NumericalColumn):
rhs = rhs.astype(dtype=self.dtype)
try:
rhs = as_column(values, nan_as_null=False)
except (MixedTypeError, TypeError) as e:
# There is a corner case where `values` can be of `object` dtype
# but contain only values of a single homogeneous type.
inferred_dtype = cudf.api.types.infer_dtype(values)
if (
self.dtype.kind in {"i", "u"} and inferred_dtype == "integer"
) or (
self.dtype.kind == "f"
and inferred_dtype in {"floating", "integer"}
):
rhs = as_column(values, nan_as_null=False, dtype=self.dtype)
elif self.dtype.kind == "f" and inferred_dtype == "integer":
rhs = as_column(values, nan_as_null=False, dtype="int")
elif (
self.dtype.kind in {"i", "u"} and inferred_dtype == "floating"
):
rhs = as_column(values, nan_as_null=False, dtype="float")
else:
raise e
else:
if isinstance(rhs, NumericalColumn):
rhs = rhs.astype(dtype=self.dtype)

if lhs.null_count == len(lhs):
lhs = lhs.astype(rhs.dtype)
Expand Down
22 changes: 22 additions & 0 deletions python/dask_cudf/dask_cudf/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,3 +971,25 @@ def func(x):
# NOTE: The calculation here doesn't need to make sense.
# We just need to make sure we get the right type back.
assert type(result) == type(expect)


@pytest.mark.parametrize("data", [[1, 2, 3], [1.1, 2.3, 4.5]])
@pytest.mark.parametrize("values", [[1, 5], [1.1, 2.4, 2.3]])
def test_series_isin(data, values):
    """Compare ``isin`` on a dask_cudf series against dask-on-pandas.

    Every combination of integer/floating ``data`` and integer/floating
    ``values`` is exercised through the parametrization above.
    """
    gpu_series = cudf.Series(data)

    # Reference result: the same data routed through a pandas-backed
    # dask collection.
    expected = dd.from_pandas(gpu_series.to_pandas(), 1).isin(values)

    # Result under test: the cudf-backed dask collection.
    actual = dask_cudf.from_cudf(gpu_series, 1).isin(values)

    dd.assert_eq(actual, expected)


def test_series_isin_error():
    """``isin`` raises ``TypeError`` when the values mix ints and a string.

    The check is made both on the plain cudf series and on the dask_cudf
    collection built from it (the latter once ``compute`` is called).
    """
    mixed_values = [1, 5, "a"]
    gpu_series = cudf.Series([1, 2, 3])
    lazy_series = dask_cudf.from_cudf(gpu_series, 1)

    # Eager path: the error is expected from the isin call itself.
    with pytest.raises(TypeError):
        gpu_series.isin(mixed_values)

    # Dask path: the graph is executed inside the raises block.
    with pytest.raises(TypeError):
        lazy_series.isin(mixed_values).compute()

0 comments on commit d10b8e4

Please sign in to comment.