From 550ea35d55390ddbf4f128c6fdf2e877d9aa0306 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Dec 2024 13:52:02 -0800 Subject: [PATCH] Support compression= in DataFrame.to_json (#17634) closes https://github.com/rapidsai/cudf/issues/17564 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17634 --- python/cudf/cudf/io/json.py | 30 ++++++++++++------- python/cudf/cudf/tests/test_json.py | 9 ++++++ python/pylibcudf/pylibcudf/io/json.pxd | 2 ++ python/pylibcudf/pylibcudf/io/json.pyi | 2 ++ python/pylibcudf/pylibcudf/io/json.pyx | 30 +++++++++++++++++++ .../pylibcudf/pylibcudf/libcudf/io/json.pxd | 8 +++++ 6 files changed, 70 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 39a85465deb..4be556e1d67 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -54,6 +54,22 @@ def _get_cudf_schema_element_from_dtype( return lib_type, child_types +def _to_plc_compression( + compression: Literal["infer", "gzip", "bz2", "zip", "xz", None], +) -> plc.io.types.CompressionType: + if compression is not None: + if compression == "gzip": + return plc.io.types.CompressionType.GZIP + elif compression == "bz2": + return plc.io.types.CompressionType.BZIP2 + elif compression == "zip": + return plc.io.types.CompressionType.ZIP + else: + return plc.io.types.CompressionType.AUTO + else: + return plc.io.types.CompressionType.NONE + + @ioutils.doc_read_json() def read_json( path_or_buf, @@ -115,17 +131,7 @@ def read_json( if isinstance(source, str) and not os.path.isfile(source): filepaths_or_buffers[idx] = source.encode() - if compression is not None: - if compression == "gzip": - c_compression = plc.io.types.CompressionType.GZIP - elif compression == "bz2": - c_compression = plc.io.types.CompressionType.BZIP2 - elif compression == "zip": - c_compression = plc.io.types.CompressionType.ZIP - else: - c_compression = plc.io.types.CompressionType.AUTO - else: - c_compression = plc.io.types.CompressionType.NONE + c_compression = _to_plc_compression(compression) if on_bad_lines.lower() == "error": c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL @@ -291,6 +297,7 @@ def _plc_write_json( include_nulls: bool = True, lines: bool = False, rows_per_chunk: int = 1024 * 64, # 64K rows + compression: Literal["infer", "gzip", "bz2", "zip", "xz", None] = None, ) -> None: try: tbl_w_meta = plc.io.TableWithMetadata( @@ -307,6 +314,7 @@ def _plc_write_json( .na_rep(na_rep) .include_nulls(include_nulls) .lines(lines) + .compression(_to_plc_compression(compression)) .build() ) if rows_per_chunk != np.iinfo(np.int32).max: diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index aaa8d7d07ee..db34329261f 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1453,3 +1453,12 @@ def test_chunked_json_reader(): with cudf.option_context("io.json.low_memory", True): gdf = cudf.read_json(buf, lines=True) assert_eq(df, gdf) + + +@pytest.mark.parametrize("compression", ["gzip", None]) +def test_roundtrip_compression(compression, tmp_path): + expected = cudf.DataFrame({"a": 1, "b": "2"}) + fle = BytesIO() + expected.to_json(fle, engine="cudf", compression=compression) + result = cudf.read_json(fle, engine="cudf", compression=compression) + assert_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index 7e446298ba9..7ce3cb859a5 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -62,6 +62,7 @@ cdef class JsonWriterOptions: cpdef void set_rows_per_chunk(self, size_type val) cpdef void set_true_value(self, str val) cpdef void set_false_value(self, str val) + cpdef void set_compression(self, compression_type comptype) cdef class JsonWriterOptionsBuilder: cdef json_writer_options_builder c_obj @@ -71,6 +72,7 @@ cdef class JsonWriterOptionsBuilder: cpdef JsonWriterOptionsBuilder na_rep(self, str val) cpdef JsonWriterOptionsBuilder include_nulls(self, bool val) cpdef JsonWriterOptionsBuilder lines(self, bool val) + cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype) cpdef JsonWriterOptions build(self) cpdef void write_json(JsonWriterOptions options) diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index b84b437a3a2..db4546f138d 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -60,12 +60,14 @@ class JsonWriterOptions: def set_rows_per_chunk(self, val: int) -> None: ... def set_true_value(self, val: str) -> None: ... def set_false_value(self, val: str) -> None: ... + def set_compression(self, comptype: CompressionType) -> None: ... class JsonWriterOptionsBuilder: def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ... def na_rep(self, val: str) -> Self: ... def include_nulls(self, val: bool) -> Self: ... def lines(self, val: bool) -> Self: ... + def compression(self, comptype: CompressionType) -> Self: ... def build(self) -> JsonWriterOptions: ... def write_json(options: JsonWriterOptions) -> None: ... diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 1d8a559afad..cf286378902 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -587,6 +587,20 @@ cdef class JsonWriterOptions: """ self.c_obj.set_false_value(val.encode()) + cpdef void set_compression(self, compression_type comptype): + """ + Sets compression type to be used + + Parameters + ---------- + comptype : CompressionType + Compression type for sink + + Returns + ------- + None + """ + self.c_obj.set_compression(comptype) cdef class JsonWriterOptionsBuilder: cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta): @@ -653,6 +667,22 @@ cdef class JsonWriterOptionsBuilder: self.c_obj.lines(val) return self + cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype): + """ + Sets compression type of output sink. + + Parameters + ---------- + comptype : CompressionType + Compression type used + + Returns + ------- + Self + """ + self.c_obj.compression(comptype) + return self + cpdef JsonWriterOptions build(self): """Create a JsonWriterOptions object""" cdef JsonWriterOptions json_options = JsonWriterOptions.__new__( diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index c241c478f25..d23dd0685d1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -167,6 +167,8 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_rows_per_chunk() except +libcudf_exception_handler string get_true_value() except +libcudf_exception_handler string get_false_value() except +libcudf_exception_handler + cudf_io_types.compression_type get_compression()\ + except +libcudf_exception_handler # setter void set_table( @@ -181,6 +183,9 @@ cdef extern from "cudf/io/json.hpp" \ void set_rows_per_chunk(size_type val) except +libcudf_exception_handler void set_true_value(string val) except +libcudf_exception_handler void set_false_value(string val) except +libcudf_exception_handler + void set_compression( + cudf_io_types.compression_type comptype + ) except +libcudf_exception_handler @staticmethod json_writer_options_builder builder( @@ -218,6 +223,9 @@ cdef extern from "cudf/io/json.hpp" \ json_writer_options_builder& false_value( string val ) except +libcudf_exception_handler + json_writer_options_builder& compression( + cudf_io_types.compression_type comptype + ) except +libcudf_exception_handler json_writer_options build() except +libcudf_exception_handler