Skip to content

Commit

Permalink
Support compression= in DataFrame.to_json (#17634)
Browse files Browse the repository at this point in the history
closes #17564

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: #17634
  • Loading branch information
mroeschke authored Dec 19, 2024
1 parent 253b0d8 commit 550ea35
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 11 deletions.
30 changes: 19 additions & 11 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ def _get_cudf_schema_element_from_dtype(
return lib_type, child_types


def _to_plc_compression(
compression: Literal["infer", "gzip", "bz2", "zip", "xz", None],
) -> plc.io.types.CompressionType:
if compression is not None:
if compression == "gzip":
return plc.io.types.CompressionType.GZIP
elif compression == "bz2":
return plc.io.types.CompressionType.BZIP2
elif compression == "zip":
return plc.io.types.CompressionType.ZIP
else:
return plc.io.types.CompressionType.AUTO
else:
return plc.io.types.CompressionType.NONE


@ioutils.doc_read_json()
def read_json(
path_or_buf,
Expand Down Expand Up @@ -115,17 +131,7 @@ def read_json(
if isinstance(source, str) and not os.path.isfile(source):
filepaths_or_buffers[idx] = source.encode()

if compression is not None:
if compression == "gzip":
c_compression = plc.io.types.CompressionType.GZIP
elif compression == "bz2":
c_compression = plc.io.types.CompressionType.BZIP2
elif compression == "zip":
c_compression = plc.io.types.CompressionType.ZIP
else:
c_compression = plc.io.types.CompressionType.AUTO
else:
c_compression = plc.io.types.CompressionType.NONE
c_compression = _to_plc_compression(compression)

if on_bad_lines.lower() == "error":
c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL
Expand Down Expand Up @@ -291,6 +297,7 @@ def _plc_write_json(
include_nulls: bool = True,
lines: bool = False,
rows_per_chunk: int = 1024 * 64, # 64K rows
compression: Literal["infer", "gzip", "bz2", "zip", "xz", None] = None,
) -> None:
try:
tbl_w_meta = plc.io.TableWithMetadata(
Expand All @@ -307,6 +314,7 @@ def _plc_write_json(
.na_rep(na_rep)
.include_nulls(include_nulls)
.lines(lines)
.compression(_to_plc_compression(compression))
.build()
)
if rows_per_chunk != np.iinfo(np.int32).max:
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -1453,3 +1453,12 @@ def test_chunked_json_reader():
with cudf.option_context("io.json.low_memory", True):
gdf = cudf.read_json(buf, lines=True)
assert_eq(df, gdf)


@pytest.mark.parametrize("compression", ["gzip", None])
def test_roundtrip_compression(compression, tmp_path):
expected = cudf.DataFrame({"a": 1, "b": "2"})
fle = BytesIO()
expected.to_json(fle, engine="cudf", compression=compression)
result = cudf.read_json(fle, engine="cudf", compression=compression)
assert_eq(result, expected)
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ cdef class JsonWriterOptions:
cpdef void set_rows_per_chunk(self, size_type val)
cpdef void set_true_value(self, str val)
cpdef void set_false_value(self, str val)
cpdef void set_compression(self, compression_type comptype)

cdef class JsonWriterOptionsBuilder:
cdef json_writer_options_builder c_obj
Expand All @@ -71,6 +72,7 @@ cdef class JsonWriterOptionsBuilder:
cpdef JsonWriterOptionsBuilder na_rep(self, str val)
cpdef JsonWriterOptionsBuilder include_nulls(self, bool val)
cpdef JsonWriterOptionsBuilder lines(self, bool val)
cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype)
cpdef JsonWriterOptions build(self)

cpdef void write_json(JsonWriterOptions options)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/io/json.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ class JsonWriterOptions:
def set_rows_per_chunk(self, val: int) -> None: ...
def set_true_value(self, val: str) -> None: ...
def set_false_value(self, val: str) -> None: ...
def set_compression(self, comptype: CompressionType) -> None: ...

class JsonWriterOptionsBuilder:
def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ...
def na_rep(self, val: str) -> Self: ...
def include_nulls(self, val: bool) -> Self: ...
def lines(self, val: bool) -> Self: ...
def compression(self, comptype: CompressionType) -> Self: ...
def build(self) -> JsonWriterOptions: ...

def write_json(options: JsonWriterOptions) -> None: ...
Expand Down
30 changes: 30 additions & 0 deletions python/pylibcudf/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,20 @@ cdef class JsonWriterOptions:
"""
self.c_obj.set_false_value(val.encode())

cpdef void set_compression(self, compression_type comptype):
"""
Sets compression type to be used
Parameters
----------
comptype : CompressionType
Compression type for sink
Returns
-------
None
"""
self.c_obj.set_compression(comptype)

cdef class JsonWriterOptionsBuilder:
cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta):
Expand Down Expand Up @@ -653,6 +667,22 @@ cdef class JsonWriterOptionsBuilder:
self.c_obj.lines(val)
return self

cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype):
"""
Sets compression type of output sink.
Parameters
----------
comptype : CompressionType
Compression type used
Returns
-------
Self
"""
self.c_obj.compression(comptype)
return self

cpdef JsonWriterOptions build(self):
"""Create a JsonWriterOptions object"""
cdef JsonWriterOptions json_options = JsonWriterOptions.__new__(
Expand Down
8 changes: 8 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ cdef extern from "cudf/io/json.hpp" \
size_type get_rows_per_chunk() except +libcudf_exception_handler
string get_true_value() except +libcudf_exception_handler
string get_false_value() except +libcudf_exception_handler
cudf_io_types.compression_type get_compression()\
except +libcudf_exception_handler

# setter
void set_table(
Expand All @@ -181,6 +183,9 @@ cdef extern from "cudf/io/json.hpp" \
void set_rows_per_chunk(size_type val) except +libcudf_exception_handler
void set_true_value(string val) except +libcudf_exception_handler
void set_false_value(string val) except +libcudf_exception_handler
void set_compression(
cudf_io_types.compression_type comptype
) except +libcudf_exception_handler

@staticmethod
json_writer_options_builder builder(
Expand Down Expand Up @@ -218,6 +223,9 @@ cdef extern from "cudf/io/json.hpp" \
json_writer_options_builder& false_value(
string val
) except +libcudf_exception_handler
json_writer_options_builder& compression(
cudf_io_types.compression_type comptype
) except +libcudf_exception_handler

json_writer_options build() except +libcudf_exception_handler

Expand Down

0 comments on commit 550ea35

Please sign in to comment.