From 16f8173d0f13dcd857e82e2900041fe84087fdec Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 10 Dec 2024 14:18:08 -0500 Subject: [PATCH] Complete the initial fix --- cpp/CMakeLists.txt | 28 +++++++++--------- cpp/examples/orc_io/debug/breakpoints.txt | 2 +- cpp/src/io/orc/stripe_data.cu | 36 +++++++++++++++++++---- 3 files changed, 46 insertions(+), 20 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f2abfa4fd8a..f2988b6f323 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -799,20 +799,20 @@ set_source_files_properties( PROPERTIES COMPILE_DEFINITIONS "_FILE_OFFSET_BITS=64" ) -set_property( - SOURCE - src/io/orc/dict_enc.cu - src/io/orc/reader_impl.cu - src/io/orc/reader_impl_chunking.cu - src/io/orc/reader_impl_decode.cu - src/io/orc/stats_enc.cu - src/io/orc/stripe_data.cu - src/io/orc/stripe_enc.cu - src/io/orc/stripe_init.cu - src/io/orc/writer_impl.cu - APPEND - PROPERTY COMPILE_OPTIONS "-g;-G" -) +# set_property( +# SOURCE +# src/io/orc/dict_enc.cu +# src/io/orc/reader_impl.cu +# src/io/orc/reader_impl_chunking.cu +# src/io/orc/reader_impl_decode.cu +# src/io/orc/stats_enc.cu +# src/io/orc/stripe_data.cu +# src/io/orc/stripe_enc.cu +# src/io/orc/stripe_init.cu +# src/io/orc/writer_impl.cu +# APPEND +# PROPERTY COMPILE_OPTIONS "-g;-G" +# ) set_property( SOURCE src/io/parquet/writer_impl.cu diff --git a/cpp/examples/orc_io/debug/breakpoints.txt b/cpp/examples/orc_io/debug/breakpoints.txt index 77af24e4312..6675e943850 100644 --- a/cpp/examples/orc_io/debug/breakpoints.txt +++ b/cpp/examples/orc_io/debug/breakpoints.txt @@ -1,4 +1,4 @@ -break /home/coder/cudf/cpp/src/io/orc/stripe_data.cu:167 +break /home/coder/cudf/cpp/src/io/orc/stripe_data.cu:210 # break /home/coder/cudf/cpp/src/io/orc/stripe_data.cu:1398 # break /home/coder/cudf/cpp/src/io/orc/stripe_data.cu:646 diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index daff4b1dbf9..d8abc044119 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -171,8 +171,6 @@ class run_cache { __forceinline__ __device__ void write_to_cache(int64_t* src) { - // Block until the src data, generated by the 1st warp, for the thread block are ready. - __syncthreads(); const auto tid = threadIdx.x; // All threads in the block take a uniform code path. @@ -190,10 +188,33 @@ class run_cache { __syncthreads(); if (tid == 0) { _status = status::DISABLED; } } - __syncthreads(); } - __forceinline__ __device__ void read_from_cache([[maybe_unused]] uint64_t* dst) {} + __forceinline__ __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle) + { + const auto tid = threadIdx.x; + + // All threads in the block take a uniform code path. + // _reusable_length ranges between [0, 512] + if (_status == status::CAN_READ_FROM_CACHE and _reusable_length > 0) { + // First, shift the data up + const auto dst_idx = tid + _reusable_length; + const auto v = (dst_idx < rle->num_vals + _reusable_length) ? dst[tid] : 0; + __syncthreads(); + + if (dst_idx < rle->num_vals + _reusable_length) { dst[dst_idx] = v; } + __syncthreads(); + + // Second, insert the cached data + if (tid < _reusable_length) { dst[tid] = _buf[tid]; } + __syncthreads(); + + if (tid == 0) { + _status = status::DISABLED; + rle->num_vals += _reusable_length; + } + } + } private: status _status; @@ -940,9 +961,14 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, } __syncwarp(); } + __syncthreads(); if constexpr (cuda::std::is_same_v) { - if (run_cache_bs != nullptr) { run_cache_bs->write_to_cache(vals); } + if (run_cache_bs != nullptr) { + run_cache_bs->read_from_cache(vals, rle); + run_cache_bs->write_to_cache(vals); + } } + __syncthreads(); return rle->num_vals; }