Skip to content

Commit

Permalink
Add JSON Writer options classes to pylibcudf (#17606)
Browse files Browse the repository at this point in the history
Part of #17565

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #17606
  • Loading branch information
Matt711 authored Dec 18, 2024
1 parent b096182 commit f3caf09
Show file tree
Hide file tree
Showing 5 changed files with 262 additions and 100 deletions.
30 changes: 17 additions & 13 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,21 +287,25 @@ def _plc_write_json(
rows_per_chunk: int = 1024 * 64, # 64K rows
) -> None:
try:
plc.io.json.write_json(
plc.io.SinkInfo([path_or_buf]),
plc.io.TableWithMetadata(
plc.Table(
[col.to_pylibcudf(mode="read") for col in table._columns]
),
colnames,
tbl_w_meta = plc.io.TableWithMetadata(
plc.Table(
[col.to_pylibcudf(mode="read") for col in table._columns]
),
na_rep,
include_nulls,
lines,
rows_per_chunk,
true_value="true",
false_value="false",
colnames,
)
options = (
plc.io.json.JsonWriterOptions.builder(
plc.io.SinkInfo([path_or_buf]), tbl_w_meta.tbl
)
.metadata(tbl_w_meta)
.na_rep(na_rep)
.include_nulls(include_nulls)
.lines(lines)
.build()
)
if rows_per_chunk != np.iinfo(np.int32).max:
options.set_rows_per_chunk(rows_per_chunk)
plc.io.json.write_json(options)
except OverflowError as err:
raise OverflowError(
f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
Expand Down
35 changes: 24 additions & 11 deletions python/pylibcudf/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,13 @@ from pylibcudf.io.types cimport (
TableWithMetadata,
compression_type,
)
from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
from pylibcudf.libcudf.io.json cimport (
json_recovery_mode_t,
json_writer_options,
json_writer_options_builder,
)
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.table cimport Table


cpdef TableWithMetadata read_json(
Expand All @@ -24,17 +29,25 @@ cpdef TableWithMetadata read_json(
dict extra_parameters = *,
)

# Declaration of the extension type holding the settings passed to
# ``write_json``.
cdef class JsonWriterOptions:
# The wrapped libcudf ``json_writer_options`` value.
cdef json_writer_options c_obj
# Python-level sink and table stored next to ``c_obj``.
# NOTE(review): presumably held so they outlive the C++ options that were
# built from them -- confirm.
cdef SinkInfo sink
cdef Table table
# Setters applied to an already-built options object.
cpdef void set_rows_per_chunk(self, size_type val)
cpdef void set_true_value(self, str val)
cpdef void set_false_value(self, str val)

cpdef void write_json(
SinkInfo sink_info,
TableWithMetadata tbl,
str na_rep = *,
bool include_nulls = *,
bool lines = *,
size_type rows_per_chunk = *,
str true_value = *,
str false_value = *
)
# Declaration of the builder extension type used to assemble a
# ``JsonWriterOptions``.
cdef class JsonWriterOptionsBuilder:
# The wrapped libcudf ``json_writer_options_builder`` value.
cdef json_writer_options_builder c_obj
# Python-level sink and table stored next to ``c_obj``.
# NOTE(review): presumably held so they outlive the C++ builder that was
# created from them -- confirm.
cdef SinkInfo sink
cdef Table table
# Each option method returns a JsonWriterOptionsBuilder so calls can be
# chained; ``build`` returns the resulting JsonWriterOptions.
cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta)
cpdef JsonWriterOptionsBuilder na_rep(self, str val)
cpdef JsonWriterOptionsBuilder include_nulls(self, bool val)
cpdef JsonWriterOptionsBuilder lines(self, bool val)
cpdef JsonWriterOptions build(self)

# Write JSON output; every setting is carried by the ``options`` object.
cpdef void write_json(JsonWriterOptions options)

cpdef tuple chunked_read_json(
SourceInfo source_info,
Expand Down
29 changes: 19 additions & 10 deletions python/pylibcudf/pylibcudf/io/json.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from collections.abc import Mapping
from typing import TypeAlias

from typing_extensions import Self

from pylibcudf.column import Column
from pylibcudf.io.types import (
CompressionType,
Expand All @@ -10,6 +12,7 @@ from pylibcudf.io.types import (
SourceInfo,
TableWithMetadata,
)
from pylibcudf.table import Table
from pylibcudf.types import DataType

ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap]
Expand All @@ -28,16 +31,22 @@ def read_json(
prune_columns: bool = False,
recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
) -> TableWithMetadata: ...
def write_json(
sink_info: SinkInfo,
table_w_meta: TableWithMetadata,
na_rep: str = "",
include_nulls: bool = False,
lines: bool = False,
rows_per_chunk: int = 2**32 - 1,
true_value: str = "true",
false_value: str = "false",
) -> None: ...

# Settings object accepted by ``write_json``. Instances come from
# ``JsonWriterOptions.builder(sink, table)`` followed by ``.build()``.
class JsonWriterOptions:
# Entry point: returns a builder bound to the output sink and the table.
@staticmethod
def builder(sink: SinkInfo, table: Table) -> JsonWriterOptionsBuilder: ...
# Setters available on the built options object.
def set_rows_per_chunk(self, val: int) -> None: ...
# String written for values != 0.
def set_true_value(self, val: str) -> None: ...
# String written for values == 0.
def set_false_value(self, val: str) -> None: ...

# Fluent builder for ``JsonWriterOptions``: each option method returns
# ``Self`` so calls can be chained, and ``build()`` produces the options.
class JsonWriterOptionsBuilder:
# Optional metadata (with column names).
def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ...
# String used for null entries.
def na_rep(self, val: str) -> Self: ...
# Enable/disable output of nulls as 'null'.
def include_nulls(self, val: bool) -> Self: ...
# Enable/disable JSON lines for records format.
def lines(self, val: bool) -> Self: ...
def build(self) -> JsonWriterOptions: ...

# Write JSON output using the settings carried by ``options``.
def write_json(options: JsonWriterOptions) -> None: ...
def chunked_read_json(
source_info: SourceInfo,
dtypes: list[NameAndType] | None = None,
Expand Down
217 changes: 168 additions & 49 deletions python/pylibcudf/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp cimport bool
from libcpp.limits cimport numeric_limits
from libcpp.map cimport map
from libcpp.string cimport string
from libcpp.utility cimport move
Expand All @@ -17,13 +16,18 @@ from pylibcudf.libcudf.io.json cimport (
)
from pylibcudf.libcudf.io.types cimport (
compression_type,
table_metadata,
table_with_metadata,
)
from pylibcudf.libcudf.types cimport data_type, size_type
from pylibcudf.types cimport DataType

__all__ = ["chunked_read_json", "read_json", "write_json"]
__all__ = [
"chunked_read_json",
"read_json",
"write_json",
"JsonWriterOptions",
"JsonWriterOptionsBuilder"
]

cdef map[string, schema_element] _generate_schema_map(list dtypes):
cdef map[string, schema_element] schema_map
Expand Down Expand Up @@ -294,56 +298,171 @@ cpdef TableWithMetadata read_json(
return TableWithMetadata.from_libcudf(c_result)


cpdef void write_json(
SinkInfo sink_info,
TableWithMetadata table_w_meta,
str na_rep = "",
bool include_nulls = False,
bool lines = False,
size_type rows_per_chunk = numeric_limits[size_type].max(),
str true_value = "true",
str false_value = "false"
):
cdef class JsonWriterOptions:
"""
Writes a :py:class:`~pylibcudf.table.Table` to JSON format.
The settings to use for ``write_json``
Parameters
----------
sink_info: SinkInfo
The SinkInfo object to write the JSON to.
table_w_meta: TableWithMetadata
The TableWithMetadata object containing the Table to write
na_rep: str, default ""
The string representation for null values.
include_nulls: bool, default False
For details, see :cpp:class:`cudf::io::json_writer_options`
"""
@staticmethod
def builder(SinkInfo sink, Table table):
    """
    Create a JsonWriterOptionsBuilder object

    Parameters
    ----------
    sink : SinkInfo
        The sink used for writer output
    table : Table
        Table to be written to output

    Returns
    -------
    JsonWriterOptionsBuilder
        Builder to build JsonWriterOptions
    """
    cdef JsonWriterOptionsBuilder new_builder = (
        JsonWriterOptionsBuilder.__new__(JsonWriterOptionsBuilder)
    )
    # Store the Python-level sink and table on the builder alongside the
    # libcudf builder that is constructed from them.
    new_builder.sink = sink
    new_builder.table = table
    new_builder.c_obj = json_writer_options.builder(sink.c_obj, table.view())
    return new_builder

cpdef void set_rows_per_chunk(self, size_type val):
    """
    Sets the maximum number of rows to write at a time.

    Parameters
    ----------
    val : size_type
        Maximum number of rows per chunk

    Returns
    -------
    None
    """
    self.c_obj.set_rows_per_chunk(val)

cpdef void set_true_value(self, str val):
    """
    Sets the string written for values that are != 0.

    Parameters
    ----------
    val : str
        String to represent values != 0

    Returns
    -------
    None
    """
    # Encode once into a C++ string, then hand it to the libcudf options.
    cdef string c_val = val.encode()
    self.c_obj.set_true_value(c_val)

cpdef void set_false_value(self, str val):
    """
    Sets the string written for values that are == 0.

    Parameters
    ----------
    val : str
        String to represent values == 0

    Returns
    -------
    None
    """
    # Encode once into a C++ string, then hand it to the libcudf options.
    cdef string c_val = val.encode()
    self.c_obj.set_false_value(c_val)


cdef class JsonWriterOptionsBuilder:
cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta):
    """
    Attaches optional metadata (with column names) to the builder.

    Parameters
    ----------
    tbl_w_meta : TableWithMetadata
        Object whose ``metadata`` is passed to the underlying builder

    Returns
    -------
    Self
    """
    # Only the metadata member is forwarded; the table itself was supplied
    # when the builder was created.
    self.c_obj.metadata(tbl_w_meta.metadata)
    return self

cpdef JsonWriterOptionsBuilder na_rep(self, str val):
    """
    Sets the string to use for null entries.

    Parameters
    ----------
    val : str
        String to represent null value

    Returns
    -------
    Self
    """
    # Encode once into a C++ string, then hand it to the libcudf builder.
    cdef string c_val = val.encode()
    self.c_obj.na_rep(c_val)
    return self

cpdef JsonWriterOptionsBuilder include_nulls(self, bool val):
"""
Enables/Disables output of nulls as 'null'.
lines: bool, default False
If `True`, write output in the JSON lines format.
rows_per_chunk: size_type, defaults to length of the input table
The maximum number of rows to write at a time.
true_value: str, default "true"
The string representation for values != 0 in INT8 types.
false_value: str, default "false"
The string representation for values == 0 in INT8 types.
Parameters
----------
val : bool
Boolean value to enable/disable
Returns
-------
Self
"""
self.c_obj.include_nulls(val)
return self

cpdef JsonWriterOptionsBuilder lines(self, bool val):
    """
    Turns JSON lines for records format on or off.

    Parameters
    ----------
    val : bool
        Boolean value to enable/disable

    Returns
    -------
    Self
    """
    self.c_obj.lines(val)
    # Return the builder itself so further options can be chained.
    return self

cpdef JsonWriterOptions build(self):
    """
    Create a JsonWriterOptions object

    Returns
    -------
    JsonWriterOptions
        Options built from this builder, carrying the same sink and table
        references that were stored on the builder.
    """
    cdef JsonWriterOptions built = JsonWriterOptions.__new__(
        JsonWriterOptions
    )
    # Hand over the Python-level sink/table references together with the
    # libcudf options produced by the C++ builder.
    built.sink = self.sink
    built.table = self.table
    built.c_obj = move(self.c_obj.build())
    return built


cpdef void write_json(JsonWriterOptions options):
"""
cdef table_metadata tbl_meta = table_w_meta.metadata
cdef string na_rep_c = na_rep.encode()

cdef json_writer_options options = (
json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
.metadata(tbl_meta)
.na_rep(na_rep_c)
.include_nulls(include_nulls)
.lines(lines)
.build()
)
Writes a set of columns to JSON format.
if rows_per_chunk != numeric_limits[size_type].max():
options.set_rows_per_chunk(rows_per_chunk)
if true_value != "true":
options.set_true_value(<string>true_value.encode())
if false_value != "false":
options.set_false_value(<string>false_value.encode())
Parameters
----------
options : JsonWriterOptions
Settings for controlling writing behavior
Returns
-------
None
"""
with nogil:
cpp_write_json(options)
cpp_write_json(options.c_obj)
Loading

0 comments on commit f3caf09

Please sign in to comment.