Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Avro Reader options classes to pylibcudf #17599

Merged
merged 2 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions python/cudf/cudf/io/avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,18 @@ def read_avro(
if not isinstance(skip_rows, int) or skip_rows < 0:
raise TypeError("skip_rows must be an int >= 0")

plc_result = plc.io.avro.read_avro(
plc.io.types.SourceInfo([filepath_or_buffer]),
columns,
skip_rows,
num_rows,
options = (
plc.io.avro.AvroReaderOptions.builder(
plc.io.types.SourceInfo([filepath_or_buffer])
)
.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)

if columns is not None and len(columns) > 0:
options.set_columns(columns)

plc_result = plc.io.avro.read_avro(options)

return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result))
25 changes: 18 additions & 7 deletions python/pylibcudf/pylibcudf/io/avro.pxd
Original file line number Diff line number Diff line change
@@ -1,12 +1,23 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from pylibcudf.libcudf.io.avro cimport avro_reader_options
from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder
from pylibcudf.libcudf.types cimport size_type


cpdef TableWithMetadata read_avro(
SourceInfo source_info,
list columns = *,
size_type skip_rows = *,
size_type num_rows = *
)
from pylibcudf.libcudf.types cimport size_type

cdef class AvroReaderOptions:
cdef avro_reader_options c_obj
cdef SourceInfo source
cpdef void set_columns(self, list col_names)


cdef class AvroReaderOptionsBuilder:
cdef avro_reader_options_builder c_obj
cdef SourceInfo source
cpdef AvroReaderOptionsBuilder columns(self, list col_names)
cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows)
cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows)
cpdef AvroReaderOptions build(self)

cpdef TableWithMetadata read_avro(AvroReaderOptions options)
21 changes: 13 additions & 8 deletions python/pylibcudf/pylibcudf/io/avro.pyi
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from pylibcudf.io.types import SourceInfo, TableWithMetadata

__all__ = ["read_avro"]

def read_avro(
source_info: SourceInfo,
columns: list[str] | None = None,
skip_rows: int = 0,
num_rows: int = -1,
) -> TableWithMetadata: ...
__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see __all__'s in other type stub files. Should it be included?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO yes, ideally the stubs should also have __all__

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, will follow up to add __all__


class AvroReaderOptions:
@staticmethod
def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...

class AvroReaderOptionsBuilder:
def columns(col_names: list[str]) -> AvroReaderOptionsBuilder: ...
def skip_rows(skip_rows: int) -> AvroReaderOptionsBuilder: ...
def num_rows(num_rows: int) -> AvroReaderOptionsBuilder: ...
def build(self) -> AvroReaderOptions: ...

def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ...
156 changes: 121 additions & 35 deletions python/pylibcudf/pylibcudf/io/avro.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport (
)
from pylibcudf.libcudf.types cimport size_type

__all__ = ["read_avro"]
__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"]


cdef class AvroReaderOptions:
"""
The settings to use for ``read_avro``
For details, see :cpp:class:`cudf::io::avro_reader_options`
"""
@staticmethod
def builder(SourceInfo source):
"""
Create a AvroWriterOptionsBuilder object

For details, see :cpp:func:`cudf::io::avro_reader_options::builder`

Parameters
----------
sink : SourceInfo
The source to read the Avro file from.

Returns
-------
AvroReaderOptionsBuilder
Builder to build AvroReaderOptions
"""
cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__(
AvroReaderOptionsBuilder
)
avro_builder.c_obj = avro_reader_options.builder(source.c_obj)
avro_builder.source = source
return avro_builder

cpdef void set_columns(self, list col_names):
"""
Set names of the column to be read.

Parameters
----------
col_names : list[str]
List of column names

Returns
-------
None
"""
cdef vector[string] vec
vec.reserve(len(col_names))
for name in col_names:
vec.push_back(str(name).encode())
self.c_obj.set_columns(vec)


cdef class AvroReaderOptionsBuilder:
cpdef AvroReaderOptionsBuilder columns(self, list col_names):
"""
Set names of the column to be read.

Parameters
----------
col_names : list
List of column names

Returns
-------
AvroReaderOptionsBuilder
"""
cdef vector[string] vec
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
vec.reserve(len(col_names))
for name in col_names:
vec.push_back(str(name).encode())
self.c_obj.columns(vec)
return self

cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows):
"""
Sets number of rows to skip.

Parameters
----------
skip_rows : size_type
Number of rows to skip from start

Returns
-------
AvroReaderOptionsBuilder
"""
self.c_obj.skip_rows(skip_rows)
return self

cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows):
"""
Sets number of rows to read.

Parameters
----------
num_rows : size_type
Number of rows to read after skip

Returns
-------
AvroReaderOptionsBuilder
"""
self.c_obj.num_rows(num_rows)
return self

cpdef AvroReaderOptions build(self):
"""Create a AvroReaderOptions object"""
cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__(
AvroReaderOptions
)
avro_options.c_obj = move(self.c_obj.build())
avro_options.source = self.source
return avro_options


cpdef TableWithMetadata read_avro(
SourceInfo source_info,
list columns = None,
size_type skip_rows = 0,
size_type num_rows = -1
AvroReaderOptions options
):
"""
Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
Read from Avro format.

The source to read from and options are encapsulated
by the `options` object.

For details, see :cpp:func:`read_avro`.

Parameters
----------
source_info: SourceInfo
The SourceInfo object to read the avro dataset from.
columns: list, default None
Optional columns to read, if not provided, reads all columns in the file.
skip_rows: size_type, default 0
The number of rows to skip.
num_rows: size_type, default -1
The number of rows to read, after skipping rows.
If -1 is passed, all rows will be read.

Returns
-------
TableWithMetadata
The Table and its corresponding metadata (column names) that were read in.
options: AvroReaderOptions
Settings for controlling reading behavior
"""
cdef vector[string] c_columns
if columns is not None and len(columns) > 0:
c_columns.reserve(len(columns))
for col in columns:
c_columns.push_back(str(col).encode())

cdef avro_reader_options avro_opts = (
avro_reader_options.builder(source_info.c_obj)
.columns(c_columns)
.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)

with nogil:
c_result = move(cpp_read_avro(avro_opts))
c_result = move(cpp_read_avro(options.c_obj))

return TableWithMetadata.from_libcudf(c_result)
13 changes: 9 additions & 4 deletions python/pylibcudf/pylibcudf/tests/io/test_avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
buffer.seek(0)

res = plc.io.avro.read_avro(
plc.io.types.SourceInfo([buffer]),
columns=columns,
skip_rows=skip_rows,
num_rows=num_rows,
(
plc.io.avro.AvroReaderOptions.builder(
plc.io.types.SourceInfo([buffer])
)
.columns(columns)
.skip_rows(skip_rows)
.num_rows(num_rows)
.build()
)
)

expected = pa.Table.from_arrays(
Expand Down
Loading