Skip to content

Commit

Permalink
Remove cudf._lib.utils in favor of python APIs (#17625)
Browse files: browse the repository at this point in the history
Contributes to #17317

Dependent on #17582

I searched across RAPIDS and Morpheus and did not find any usage of these methods.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #17625
  • Loading branch information
mroeschke authored Dec 20, 2024
1 parent fb62d0e commit 69d62cb
Show file tree
Hide file tree
Showing 10 changed files with 85 additions and 142 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx utils.pyx)
set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx)
set(linked_libraries cudf::cudf)

rapids_cython_create_modules(
Expand Down
Empty file removed python/cudf/cudf/_lib/__init__.pxd
Empty file.
6 changes: 0 additions & 6 deletions python/cudf/cudf/_lib/utils.pxd

This file was deleted.

94 changes: 0 additions & 94 deletions python/cudf/cudf/_lib/utils.pyx

This file was deleted.

4 changes: 3 additions & 1 deletion python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,9 @@ def _quantile_table(
column_order,
null_precedence,
)
columns = libcudf.utils.columns_from_pylibcudf_table(plc_table)
columns = [
ColumnBase.from_pylibcudf(col) for col in plc_table.columns()
]
return self._from_columns_like_self(
columns,
column_names=self._column_names,
Expand Down
13 changes: 10 additions & 3 deletions python/cudf/cudf/io/avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pylibcudf as plc

import cudf
from cudf._lib.utils import data_from_pylibcudf_io
from cudf._lib.column import Column
from cudf.utils import ioutils


Expand Down Expand Up @@ -46,5 +46,12 @@ def read_avro(
options.set_columns(columns)

plc_result = plc.io.avro.read_avro(options)

return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result))
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(
plc_result.column_names(include_children=False),
plc_result.columns,
strict=True,
)
}
return cudf.DataFrame._from_data(data)
16 changes: 12 additions & 4 deletions python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
import pylibcudf as plc

import cudf
from cudf._lib.column import Column
from cudf._lib.types import dtype_to_pylibcudf_type
from cudf._lib.utils import data_from_pylibcudf_io
from cudf.api.types import is_hashable, is_scalar
from cudf.core.buffer import acquire_spill_lock
from cudf.utils import ioutils
Expand Down Expand Up @@ -251,9 +251,17 @@ def read_csv(
if na_values is not None:
options.set_na_values([str(val) for val in na_values])

df = cudf.DataFrame._from_data(
*data_from_pylibcudf_io(plc.io.csv.read_csv(options))
)
table_w_meta = plc.io.csv.read_csv(options)
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(
table_w_meta.column_names(include_children=False),
table_w_meta.columns,
strict=True,
)
}

df = cudf.DataFrame._from_data(data)

if isinstance(dtype, abc.Mapping):
for k, v in dtype.items():
Expand Down
26 changes: 14 additions & 12 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import cudf
from cudf._lib.column import Column
from cudf._lib.types import dtype_to_pylibcudf_type
from cudf._lib.utils import _data_from_columns, data_from_pylibcudf_io
from cudf.core.buffer import acquire_spill_lock
from cudf.utils import ioutils
from cudf.utils.dtypes import _maybe_convert_to_default_type
Expand Down Expand Up @@ -178,13 +177,11 @@ def read_json(
)
)
)
df = cudf.DataFrame._from_data(
*_data_from_columns(
columns=[Column.from_pylibcudf(col) for col in res_cols],
column_names=res_col_names,
index_names=None,
)
)
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(res_col_names, res_cols, strict=True)
}
df = cudf.DataFrame._from_data(data)
ioutils._add_df_col_struct_names(df, res_child_names)
return df
else:
Expand All @@ -207,10 +204,15 @@ def read_json(
extra_parameters=kwargs,
)
)

df = cudf.DataFrame._from_data(
*data_from_pylibcudf_io(table_w_meta)
)
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(
table_w_meta.column_names(include_children=False),
table_w_meta.columns,
strict=True,
)
}
df = cudf.DataFrame._from_data(data)

# Post-processing to add in struct column names
ioutils._add_df_col_struct_names(df, table_w_meta.child_names)
Expand Down
37 changes: 31 additions & 6 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
import pylibcudf as plc

import cudf
from cudf._lib.column import Column
from cudf._lib.types import dtype_to_pylibcudf_type
from cudf._lib.utils import data_from_pylibcudf_io
from cudf.api.types import is_list_like
from cudf.core.buffer import acquire_spill_lock
from cudf.core.index import _index_from_data
from cudf.utils import ioutils

try:
Expand Down Expand Up @@ -323,11 +324,35 @@ def read_orc(
actual_index_names = list(index_col_names.values())
col_names = names[len(actual_index_names) :]

data, index = data_from_pylibcudf_io(
tbl_w_meta,
col_names if columns is None else names,
actual_index_names,
)
result_col_names = col_names if columns is None else names
if actual_index_names is None:
index = None
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(
result_col_names, tbl_w_meta.columns, strict=True
)
}
else:
result_columns = [
Column.from_pylibcudf(col) for col in tbl_w_meta.columns
]
index = _index_from_data(
dict(
zip(
actual_index_names,
result_columns[: len(actual_index_names)],
strict=True,
)
)
)
data = dict(
zip(
result_col_names,
result_columns[len(actual_index_names) :],
strict=True,
)
)

if is_range_index:
index = range_idx
Expand Down
29 changes: 14 additions & 15 deletions python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@

import cudf
from cudf._lib.column import Column
from cudf._lib.utils import (
_data_from_columns,
data_from_pylibcudf_io,
)
from cudf.api.types import is_list_like
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import as_column, column_empty
Expand Down Expand Up @@ -1238,16 +1234,11 @@ def _read_parquet(
# Drop residual columns to save memory
tbl._columns[i] = None

df = cudf.DataFrame._from_data(
*_data_from_columns(
columns=[
Column.from_pylibcudf(plc)
for plc in concatenated_columns
],
column_names=column_names,
index_names=None,
)
)
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(column_names, concatenated_columns)
}
df = cudf.DataFrame._from_data(data)
df = _process_metadata(
df,
column_names,
Expand Down Expand Up @@ -1287,8 +1278,16 @@ def _read_parquet(
options.set_filter(filters)

tbl_w_meta = plc.io.parquet.read_parquet(options)
data = {
name: Column.from_pylibcudf(col)
for name, col in zip(
tbl_w_meta.column_names(include_children=False),
tbl_w_meta.columns,
strict=True,
)
}

df = cudf.DataFrame._from_data(*data_from_pylibcudf_io(tbl_w_meta))
df = cudf.DataFrame._from_data(data)

df = _process_metadata(
df,
Expand Down

0 comments on commit 69d62cb

Please sign in to comment.