From 78e5c0d6c5a5c876421d1ab2308b14f8c7ecb9f7 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 11 Dec 2024 17:53:36 -0800 Subject: [PATCH] Use batched memcpy when writing ORC statistics (#17572) This PR replaces a set of per-column, per-rowgroup D2D memcopies with a single call to the `batched_memcpy_async` utility. Should improve performance when writing wide tables. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17572 --- cpp/src/io/orc/writer_impl.cu | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 0906017ee61..8e532b01788 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -1386,29 +1387,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, // we know the size of each array. The number of stripes per column in a chunk array can // be calculated by dividing the number of chunks by the number of columns. // That many chunks need to be copied at a time to the proper destination. - size_t num_entries_seen = 0; + size_t num_entries_seen = 0; + auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2; + auto h_srcs = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_dsts = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + auto h_lens = cudf::detail::make_empty_host_vector(num_buffers_to_copy, stream); + for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) { auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns; - auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk); - auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group); for (size_t col = 0; col < num_columns; ++col) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col, - chunk_bytes, - cudaMemcpyDefault, - stream.value())); - CUDF_CUDA_TRY( - cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen, - per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col, - merge_bytes, - cudaMemcpyDefault, - stream.value())); + h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col); + h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_chunk)); + + h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col); + h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen); + h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group)); } num_entries_seen += stripes_per_col; } + auto const& mr = cudf::get_current_device_resource_ref(); + auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr); + auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr); + auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr); + cudf::detail::batched_memcpy_async( + d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream); + auto file_stats_merge = cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) {