diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7c0bd6d52e2..49ca5ca0fb9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -52,6 +52,7 @@ jobs: OTEL_SERVICE_NAME: 'pr-cudf' steps: - name: Telemetry setup + if: ${{ vars.TELEMETRY_ENABLED == 'true' }} uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: secrets: inherit @@ -329,7 +330,7 @@ jobs: telemetry-summarize: runs-on: ubuntu-latest needs: pr-builder - if: always() + if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} continue-on-error: true steps: - name: Load stashed telemetry env vars diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 3b972f31ca4..01dd2436beb 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02 with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 37b26949804..39869b67547 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace exclude: | @@ -17,11 +17,11 @@ repos: ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* ) - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.16.2 + rev: v0.16.6 hooks: - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v1.10.0' + rev: 'v1.13.0' hooks: - id: mypy additional_dependencies: [types-cachetools] @@ -33,7 +33,7 @@ repos: "python/dask_cudf/dask_cudf"] pass_filenames: false - repo: https://github.com/nbQA-dev/nbQA - rev: 1.8.5 + rev: 1.9.1 hooks: - id: nbqa-isort # Use the cudf_kafka isort orderings in notebooks so that dask @@ -52,7 +52,7 @@ repos: ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks - rev: 0.6.6 + rev: 0.6.7 hooks: - id: fix-smartquotes exclude: | @@ -133,7 +133,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 + rev: v2.3.0 hooks: - id: codespell additional_dependencies: [tomli] @@ -144,7 +144,7 @@ repos: ^CHANGELOG.md$ ) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.4.8 + rev: v0.8.0 hooks: - id: ruff args: ["--fix"] diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4290d013fe4..52d8f659611 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -35,6 +35,10 @@ rapids-mamba-retry install \ export RAPIDS_DOCS_DIR="$(mktemp -d)" +EXITCODE=0 +trap "EXITCODE=1" ERR +set +e + rapids-logger "Build CPP docs" pushd cpp/doxygen aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . 
|| echo "Failed to download rmm Doxygen tag" @@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html" popd RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs + +exit ${EXITCODE} diff --git a/ci/cpp_linters.sh b/ci/cpp_linters.sh index 4d5b62ba280..9702b055512 100755 --- a/ci/cpp_linters.sh +++ b/ci/cpp_linters.sh @@ -27,7 +27,7 @@ source rapids-configure-sccache # Run the build via CMake, which will run clang-tidy when CUDF_STATIC_LINTERS is enabled. iwyu_flag="" -if [[ "${RAPIDS_BUILD_TYPE}" == "nightly" ]]; then +if [[ "${RAPIDS_BUILD_TYPE:-}" == "nightly" ]]; then iwyu_flag="-DCUDF_IWYU=ON" fi cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON ${iwyu_flag} -DBUILD_TESTS=OFF -GNinja diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh index f8ddbaba0f3..30e3ffc9a43 100755 --- a/ci/cudf_pandas_scripts/third-party-integration/test.sh +++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh @@ -26,6 +26,8 @@ main() { LIBS=${LIBS#[} LIBS=${LIBS%]} + ANY_FAILURES=0 + for lib in ${LIBS//,/ }; do lib=$(echo "$lib" | tr -d '""') echo "Running tests for library $lib" @@ -56,10 +58,6 @@ main() { rapids-logger "Check GPU usage" nvidia-smi - EXITCODE=0 - trap "EXITCODE=1" ERR - set +e - rapids-logger "pytest ${lib}" NUM_PROCESSES=8 @@ -72,12 +70,20 @@ main() { fi done + EXITCODE=0 + trap "EXITCODE=1" ERR + set +e + TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib} + set -e rapids-logger "Test script exiting with value: ${EXITCODE}" + if [[ ${EXITCODE} != 0 ]]; then + ANY_FAILURES=1 + fi done - exit ${EXITCODE} + exit ${ANY_FAILURES} } main "$@" diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 97c72ec8042..33fc2f651c6 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0,<=11.8.3 +- cuda-python>=11.8.5,<12.0a0 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit @@ -80,7 +80,6 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -88,7 +87,6 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton @@ -97,8 +95,6 @@ dependencies: - sphinxcontrib-websupport - streamz - sysroot_linux-64==2.17 -- tokenizers==0.15.2 -- transformers==4.39.3 - typing_extensions>=4.0.0 - zlib>=1.2.13 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 84b58b6d7a4..c290a83a37f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0,<=12.6.0 +- cuda-python>=12.6.2,<13.0a0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 @@ -78,7 +78,7 @@ dependencies: - python-confluent-kafka>=2.5.0,<2.6.0a0 - python-xxhash - python>=3.10,<3.13 -- pytorch>=2.1.0 +- pytorch>=2.4.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - rapids-dask-dependency==25.2.*,>=0.0.0a0 - rich @@ -86,7 +86,6 @@ 
dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 04904e95630..2c16deeed82 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.8.5,<12.0a0 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.6.2,<13.0a0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index c78ca326005..00020fdf6b8 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -31,9 +31,6 @@ fmt_version: flatbuffers_version: - "=24.3.25" -spdlog_version: - - ">=1.14.1,<1.15" - nvcomp_version: - "=4.1.0.6" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 1c2e9e8dd98..b585aafc397 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -68,7 +68,6 @@ requirements: - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} - flatbuffers {{ flatbuffers_version }} - - spdlog {{ spdlog_version }} - zlib {{ zlib_version }} outputs: diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index ec3fcd59c62..08eab363af0 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0,<=11.8.3 + - cuda-python >=11.8.5,<12.0a0 {% else %} - - cuda-python >=12.0,<13.0a0,<=12.6.0 + - cuda-python >=12.6.2,<13.0a0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f25b46a52cd..78f529a44d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -93,6 +93,7 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_CLANG_TIDY "Enable clang-tidy during compilation" OFF) option(CUDF_IWYU "Enable IWYU during compilation" OFF) +option(CUDF_CLANG_TIDY_AUTOFIX "Enable clang-tidy autofixes" OFF) option( CUDF_KVIKIO_REMOTE_IO @@ -205,9 +206,16 @@ function(enable_static_checkers target) if(_LINT_CLANG_TIDY) # clang will complain about unused link libraries on the compile line unless we specify # -Qunused-arguments. - set_target_properties( - ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" - ) + if(CUDF_CLANG_TIDY_AUTOFIX) + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY + "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments;--fix" + ) + else() + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + endif() endif() if(_LINT_IWYU) # A few extra warnings pop up when building with IWYU. I'm not sure why, but they are not @@ -265,6 +273,14 @@ endif() # add third party dependencies using CPM rapids_cpm_init() + +# Not using rapids-cmake since we never want to find, always download. 
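With spdlog removed from the conda recipes and build configuration above, log filtering in libcudf moves to the rapids-logger package pulled in just below, gated by the CUDF_LOG_ACTIVE_LEVEL compile definition further down in this CMakeLists.txt change. A minimal, self-contained sketch of compile-time level filtering in that spirit; the EXAMPLE_* names are hypothetical and are not the rapids-logger API:

#include <cstdio>

// Hypothetical level constants; real logging libraries define their own.
#define EXAMPLE_LEVEL_DEBUG 1
#define EXAMPLE_LEVEL_INFO 2
#define EXAMPLE_LEVEL_WARN 3

// Normally injected on the compile line (compare the
// CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL} definition below).
#ifndef EXAMPLE_LOG_ACTIVE_LEVEL
#define EXAMPLE_LOG_ACTIVE_LEVEL EXAMPLE_LEVEL_INFO
#endif

// Calls below the active level are discarded at compile time (C++17),
// so disabled logging has zero runtime cost.
#define EXAMPLE_LOG(level, msg)                          \
  do {                                                   \
    if constexpr ((level) >= EXAMPLE_LOG_ACTIVE_LEVEL) { \
      std::puts(msg);                                    \
    }                                                    \
  } while (0)

int main()
{
  EXAMPLE_LOG(EXAMPLE_LEVEL_DEBUG, "compiled out when the active level is INFO");
  EXAMPLE_LOG(EXAMPLE_LEVEL_WARN, "emitted");
  return 0;
}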
+CPMAddPackage( + NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW TRUE GIT_TAG + c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 +) +rapids_make_logger(cudf EXPORT_SET cudf-exports) + # find jitify include(cmake/thirdparty/get_jitify.cmake) # find NVTX @@ -291,8 +307,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) include(cmake/thirdparty/get_kvikio.cmake) # find fmt include(cmake/thirdparty/get_fmt.cmake) -# find spdlog -include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) # find thread_pool @@ -764,7 +778,6 @@ add_library( src/utilities/default_stream.cpp src/utilities/host_memory.cpp src/utilities/linked_column.cpp - src/utilities/logger.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp @@ -902,11 +915,8 @@ if(CUDF_LARGE_STRINGS_DISABLED) target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) endif() -# Define RMM logging level -target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") - -# Define spdlog level -target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Define logging level +target_compile_definitions(cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=${LIBCUDF_LOGGING_LEVEL}") # Enable remote IO through KvikIO target_compile_definitions(cudf PRIVATE $<$:CUDF_KVIKIO_REMOTE_IO>) @@ -920,14 +930,17 @@ if(TARGET CUDA::cuFile${_cufile_suffix}) target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND) endif() +# Remove this after upgrading to a CCCL that has a proper CMake option. See +# https://github.com/NVIDIA/cccl/pull/2844 +target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ - spdlog::spdlog_header_only + PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $ cudf_logger PRIVATE $ cuco::cuco ZLIB::ZLIB @@ -936,6 +949,7 @@ target_link_libraries( $ nanoarrow rmm::rmm_logger_impl + cudf_logger_impl ) # Add Conda library, and include paths if specified @@ -1091,7 +1105,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl) if(CUDF_BUILD_STACKTRACE_DEBUG) target_link_libraries(${_tgt} PRIVATE cudf_backtrace) endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d3de9b39977..749e1b628ee 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) endfunction() # ################################################################################################## -# * column benchmarks ----------------------------------------------------------------------------- -ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp) +# * copying benchmarks +# ----------------------------------------------------------------------------- +ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp) # ################################################################################################## # * gather benchmark ------------------------------------------------------------------------------ @@ 
-351,17 +352,22 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/subword.cpp) - ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH + text/edit_distance.cpp + text/hash_ngrams.cpp + text/jaccard.cpp + text/minhash.cpp + text/ngrams.cpp + text/normalize.cpp + text/replace.cpp + text/subword.cpp + text/tokenize.cpp + text/vocab.cpp ) # ################################################################################################## # * strings benchmark ------------------------------------------------------------------- -ConfigureBench(STRINGS_BENCH string/factory.cu) - ConfigureNVBench( STRINGS_NVBENCH string/case.cpp @@ -377,6 +383,7 @@ ConfigureNVBench( string/copy_range.cpp string/count.cpp string/extract.cpp + string/factory.cpp string/filter.cpp string/find.cpp string/find_multiple.cpp diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp deleted file mode 100644 index 51106c72137..00000000000 --- a/cpp/benchmarks/column/concatenate.cpp +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include -#include - -#include -#include - -class Concatenate : public cudf::benchmark {}; - -template -static void BM_concatenate(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - - auto input = create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? 
std::optional{2.0 / 3.0} : std::nullopt); - auto input_columns = input->view(); - std::vector column_views(input_columns.begin(), input_columns.end()); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T)); -} - -#define CONCAT_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 6, 1 << 18}, {2, 1024}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_BENCHMARK_DEFINE(int64_t, false) -CONCAT_BENCHMARK_DEFINE(int64_t, true) - -template -static void BM_concatenate_tables(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const num_cols = state.range(1); - cudf::size_type const num_tables = state.range(2); - - std::vector> tables(num_tables); - std::generate_n(tables.begin(), num_tables, [&]() { - return create_sequence_table(cycle_dtypes({cudf::type_to_id()}, num_cols), - row_count{num_rows}, - Nullable ? std::optional{2.0 / 3.0} : std::nullopt); - }); - - // Generate table views - std::vector table_views(num_tables); - std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable { - return table->view(); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - auto result = cudf::concatenate(table_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T)); -} - -#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_tables(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false) -CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true) - -class ConcatenateStrings : public cudf::benchmark {}; - -template -static void BM_concatenate_strings(benchmark::State& state) -{ - using column_wrapper = cudf::test::strings_column_wrapper; - - auto const num_rows = state.range(0); - auto const num_chars = state.range(1); - auto const num_cols = state.range(2); - - std::string str(num_chars, 'a'); - - // Create owning columns - std::vector columns; - columns.reserve(num_cols); - std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() { - auto iter = thrust::make_constant_iterator(c_str); - if (Nullable) { - auto count_it = thrust::make_counting_iterator(0); - auto valid_iter = - thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; }); - return column_wrapper(iter, iter + num_rows, valid_iter); - } else { - return column_wrapper(iter, iter + num_rows); - } - }); - - // Generate column views - std::vector column_views; - column_views.reserve(columns.size()); - std::transform( - columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) { - return static_cast(col); - }); - - CUDF_CHECK_CUDA(0); - - for (auto _ : state) { - cuda_event_timer raii(state, true, 
cudf::get_default_stream()); - auto result = cudf::concatenate(column_views); - } - - state.SetBytesProcessed(state.iterations() * num_cols * num_rows * - (sizeof(int32_t) + num_chars)); // offset + chars -} - -#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable) \ - BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - (::benchmark::State & st) { BM_concatenate_strings(st); } \ - BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \ - ->RangeMultiplier(8) \ - ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -CONCAT_STRINGS_BENCHMARK_DEFINE(false) -CONCAT_STRINGS_BENCHMARK_DEFINE(true) diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp new file mode 100644 index 00000000000..586b479d0ad --- /dev/null +++ b/cpp/benchmarks/copying/concatenate.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include + +#include +#include +#include +#include + +#include + +#include + +static void bench_concatenate(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const nulls = static_cast(state.get_float64("nulls")); + + auto input = create_sequence_table( + cycle_dtypes({cudf::type_to_id()}, num_cols), row_count{num_rows}, nulls); + auto input_columns = input->view(); + auto column_views = std::vector(input_columns.begin(), input_columns.end()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.add_global_memory_reads(num_rows * num_cols); + state.add_global_memory_writes(num_rows * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate) + .set_name("concatenate") + .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144}) + .add_int64_axis("num_cols", {2, 8, 64, 512, 1024}) + .add_float64_axis("nulls", {0.0, 0.3}); + +static void bench_concatenate_strings(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const num_cols = static_cast(state.get_int64("num_cols")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const nulls = static_cast(state.get_float64("nulls")); + + data_profile const profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .null_probability(nulls); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const input = column->view(); + + auto column_views = std::vector(num_cols, input); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto const sv = cudf::strings_column_view(input); + 
state.add_global_memory_reads(sv.chars_size(stream) * num_cols); + state.add_global_memory_writes(sv.chars_size(stream) * num_cols); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); }); +} + +NVBENCH_BENCH(bench_concatenate_strings) + .set_name("concatenate_strings") + .add_int64_axis("num_rows", {256, 512, 4096, 16384}) + .add_int64_axis("num_cols", {2, 8, 64, 256}) + .add_int64_axis("row_width", {32, 128}) + .add_float64_axis("nulls", {0.0, 0.3}); diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45b46005c47..38a21961735 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp index fa017ca9e29..267aa3a93f3 100644 --- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp +++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp @@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list) cudf::size_type const num_rows = state.get_int64("NumRows"); auto const keep = get_keep(state.get_string("keep")); cudf::size_type const cardinality = state.get_int64("cardinality"); + auto const null_probability = state.get_float64("null_probability"); if (cardinality > num_rows) { state.skip("cardinality > num_rows"); @@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) data_profile profile = data_profile_builder() .cardinality(cardinality) - .null_probability(0.01) + .null_probability(null_probability) .distribution(cudf::type_to_id(), distribution_id::UNIFORM, static_cast(0), @@ -65,6 +66,7 @@ using data_type = nvbench::type_list; NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.01}) .add_string_axis("keep", {"any", "first", "last", "none"}) .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000}) .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000}); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index cd4d3ca964b..9750475a079 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -24,18 +24,14 @@ void bench_case(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const max_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const encoding = state.get_string("encoding"); - if (static_cast(n_rows) * static_cast(max_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); auto col_view = column->view(); @@ -74,6 +70,7 @@ void bench_case(nvbench::state& state) NVBENCH_BENCH(bench_case) .set_name("case") 
- .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("encoding", {"ascii", "utf8"}); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index eec9a5f54d7..abc5254392e 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -25,16 +25,12 @@ static void bench_char_types(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const api_type = state.get_string("api"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state) NVBENCH_BENCH(bench_char_types) .set_name("char_types") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("api", {"all", "filter"}); diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index a73017dda18..e3940cbc0c7 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43" static void bench_contains(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto col = create_string_column(n_rows, row_width, hit_rate); + auto col = create_string_column(num_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); auto pattern = patterns[pattern_index]; @@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state) NVBENCH_BENCH(bench_contains) .set_name("contains") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {50, 100}) // percentage .add_int64_axis("pattern", {0, 1, 2}); diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp index 
e06cca497c2..5a5743dfddf 100644 --- a/cpp/benchmarks/string/copy_if_else.cpp +++ b/cpp/benchmarks/string/copy_if_else.cpp @@ -25,15 +25,11 @@ static void bench_copy(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const str_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const source_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile); auto const target_table = @@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state) NVBENCH_BENCH(bench_copy) .set_name("copy_if_else") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp index af217a49195..7e7353a0e78 100644 --- a/cpp/benchmarks/string/copy_range.cpp +++ b/cpp/benchmarks/string/copy_range.cpp @@ -25,16 +25,12 @@ static void bench_copy_range(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const source_tables = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile); @@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state) NVBENCH_BENCH(bench_copy_range) .set_name("copy_range") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index f964bc5d224..cf90e316f71 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"}; static void bench_count(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const pattern_index = static_cast(state.get_int64("pattern")); - if 
(static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state) NVBENCH_BENCH(bench_count) .set_name("count") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("pattern", {0, 1}); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index af4fedb5799..d6866598ff4 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto groups = static_cast(state.get_int64("groups")); std::default_random_engine generator; @@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state) NVBENCH_BENCH(bench_extract) .set_name("extract") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("groups", {1, 2, 4}); diff --git a/cpp/benchmarks/string/factory.cpp b/cpp/benchmarks/string/factory.cpp new file mode 100644 index 00000000000..03870b0ae23 --- /dev/null +++ b/cpp/benchmarks/string/factory.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_factory(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); + + data_profile const profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + auto const sv = cudf::strings_column_view(column->view()); + + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + auto d_strings = cudf::strings::detail::create_string_vector_from_column(sv, stream, mr); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = sv.chars_size(stream); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::make_strings_column(d_strings, cudf::string_view{nullptr, 0}); + }); +} + +NVBENCH_BENCH(bench_factory) + .set_name("factory") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu deleted file mode 100644 index c4e74c4d97e..00000000000 --- a/cpp/benchmarks/string/factory.cu +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "string_bench_args.hpp" - -#include -#include -#include - -#include - -#include -#include -#include - -#include - -#include -#include -#include - -#include - -namespace { -using string_pair = thrust::pair; -struct string_view_to_pair { - __device__ string_pair operator()(thrust::pair const& p) - { - return (p.second) ? 
string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; - } -}; -} // namespace - -class StringsFactory : public cudf::benchmark {}; - -static void BM_factory(benchmark::State& state) -{ - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); - auto d_column = cudf::column_device_view::create(column->view()); - rmm::device_uvector pairs(d_column->size(), cudf::get_default_stream()); - thrust::transform(thrust::device, - d_column->pair_begin(), - d_column->pair_end(), - pairs.data(), - string_view_to_pair{}); - - for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::get_default_stream()); - cudf::make_strings_column(pairs, cudf::get_default_stream()); - } - - cudf::strings_column_view input(column->view()); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} - -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); -} - -#define STRINGS_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(StringsFactory, name) \ - (::benchmark::State & st) { BM_factory(st); } \ - BENCHMARK_REGISTER_F(StringsFactory, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 3ea3ff13a2f..2ba793e998e 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -28,21 +28,19 @@ static void bench_find_string(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); auto const api = state.get_string("api"); - - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const tgt_type = state.get_string("target"); auto const stream = cudf::get_default_stream(); - auto const col = create_string_column(n_rows, row_width, hit_rate); + auto const col = create_string_column(num_rows, max_width, hit_rate); auto const input = cudf::strings_column_view(col->view()); - cudf::string_scalar target("0987 5W43"); + auto target = cudf::string_scalar("0987 5W43"); + auto targets_col = cudf::make_column_from_scalar(target, num_rows); + auto const targets = cudf::strings_column_view(targets_col->view()); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); auto const chars_size = input.chars_size(stream); @@ -55,23 +53,44 @@ static void bench_find_string(nvbench::state& state) } if (api == "find") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + if (tgt_type == "scalar") { + 
state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::find(input, targets); }); + } } else if (api == "contains") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::contains(input, targets); }); + } } else if (api == "starts_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::starts_with(input, targets); }); + } } else if (api == "ends_with") { - state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + if (tgt_type == "scalar") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); }); + } else if (tgt_type == "column") { + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::ends_with(input, targets); }); + } } } NVBENCH_BENCH(bench_find_string) .set_name("find_string") + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("hit_rate", {20, 80}) // percentage .add_string_axis("api", {"find", "contains", "starts_with", "ends_with"}) - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216}) - .add_int64_axis("hit_rate", {20, 80}); // percentage + .add_string_axis("target", {"scalar", "column"}); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index 6dcf731ad3c..27652193b7b 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -25,15 +25,11 @@ static void bench_join(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state) NVBENCH_BENCH(bench_join) .set_name("strings_join") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + 
.add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index a19060ead3b..8156e19412b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -25,15 +25,11 @@ static void bench_lengths(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state) NVBENCH_BENCH(bench_lengths) .set_name("lengths") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 105ae65cbe8..f6410aaef30 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const hit_rate = static_cast(state.get_int64("hit_rate")); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - auto col = create_string_column(n_rows, row_width, hit_rate); auto input = cudf::strings_column_view(col->view()); @@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state) NVBENCH_BENCH(bench_like) .set_name("strings_like") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_int64_axis("hit_rate", {10, 25, 70, 100}); diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index 4dcf1314f83..69426a2d484 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -26,18 +26,14 @@ static void bench_replace(nvbench::state& state) { - auto const n_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const rtype = state.get_string("type"); - if (static_cast(n_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - 
cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto program = cudf::strings::regex_program::create("(\\d+)"); @@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512}) - .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"replace", "backref"}); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index a2676609a40..e2e914cb350 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -25,15 +25,11 @@ static void bench_reverse(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const table_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile); cudf::strings_column_view input(table->view().column(0)); @@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state) NVBENCH_BENCH(bench_reverse) .set_name("reverse") - .add_int64_axis("row_width", {8, 16, 32, 64, 128}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 1898f0340b6..c828a8ed0b0 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state) auto const row_width = static_cast(state.get_int64("row_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); @@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state) NVBENCH_BENCH(bench_slice) .set_name("slice") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {262144, 2097152, 16777216}) + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"position", "multi"}); diff --git a/cpp/benchmarks/string/split.cpp 
b/cpp/benchmarks/string/split.cpp index 9ef58daf0fc..9c7c27c4f07 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -28,16 +28,12 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const stype = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); cudf::string_scalar target("+"); @@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"split", "split_ws", "record", "record_ws"}); diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 1fdb6e67109..34a7aa96e84 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -28,17 +28,13 @@ static void bench_split(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto prog = cudf::strings::regex_program::create("\\d+"); data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state) NVBENCH_BENCH(bench_split) .set_name("split_re") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp deleted file mode 100644 index a34026281e8..00000000000 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -/** - * @brief Generate row count and row length argument ranges for a string benchmark. - * - * Generates a series of row count and row length arguments for string benchmarks. - * Combinations of row count and row length that would exceed the maximum string character - * column data length are not generated. - * - * @param b Benchmark to update with row count and row length arguments. - * @param min_rows Minimum row count argument to generate. - * @param max_rows Maximum row count argument to generate. - * @param rows_mult Row count multiplier to generate intermediate row count arguments. - * @param min_rowlen Minimum row length argument to generate. - * @param max_rowlen Maximum row length argument to generate. - * @param rowlen_mult Row length multiplier to generate intermediate row length arguments. - */ -inline void generate_string_bench_args(benchmark::internal::Benchmark* b, - int min_rows, - int max_rows, - int rows_mult, - int min_rowlen, - int max_rowlen, - int rowlen_mult) -{ - for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } -} diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 6ffa90edb8f..0ad1ae30f8c 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -27,15 +27,11 @@ static void bench_edit_distance(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input1(strings_table->view().column(0)); @@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state) NVBENCH_BENCH(bench_edit_distance) .set_name("edit_distance") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144}); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 4e5daf83a3c..7577cf00c0f 100644 
--- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -27,16 +27,12 @@ static void bench_hash_ngrams(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const ngrams = static_cast(state.get_int64("ngrams")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); @@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state) NVBENCH_BENCH(bench_hash_ngrams) .set_name("hash_ngrams") - .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144}) - .add_int64_axis("row_width", {128, 512, 2048}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 2048}) + .add_int64_axis("num_rows", {16384, 32768, 262144}) .add_int64_axis("ngrams", {5, 10}); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index d5b74da6773..5506501138b 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -28,17 +28,13 @@ static void bench_jaccard(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const substring_width = static_cast(state.get_int64("substring_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const input_table = create_random_table( {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile); @@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state) NVBENCH_BENCH(bench_jaccard) .set_name("jaccard") + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {128, 512, 1024, 2048}) .add_int64_axis("num_rows", {32768, 131072, 262144}) - .add_int64_axis("row_width", {128, 512, 1024, 2048}) .add_int64_axis("substring_width", {5, 10}); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index a80d0dcbdb8..8c86e8d4366 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 - ? 
nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) - : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); + auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width); }); } diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 71bccd80d39..594dc0de28a 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -28,16 +28,12 @@ static void bench_normalize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const normalize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 767ebab3eee..24ca4e5dfd7 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - std::vector words{" ", "one ", "two ", "three ", "four ", "five ", "six ", "sevén ", "eight ", "nine ", "ten ", "eleven ", "twelve ", "thirteen ", "fourteen ", @@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state) NVBENCH_BENCH(bench_replace) .set_name("replace") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}); + .add_int64_axis("row_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index dd8df695d3e..0b4e3bdefa5 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. 
*/ -#include -#include - #include #include @@ -24,6 +21,8 @@ #include +#include + #include #include #include @@ -54,40 +53,33 @@ static std::string create_hash_vocab_file() return hash_file; } -static void BM_subword_tokenizer(benchmark::State& state) +static void bench_subword_tokenizer(nvbench::state& state) { - auto const nrows = static_cast(state.range(0)); - std::vector h_strings(nrows, "This is a test "); + auto const num_rows = static_cast(state.get_int64("num_rows")); + + std::vector h_strings(num_rows, "This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; - uint32_t max_sequence_length = 64; - uint32_t stride = 48; - uint32_t do_truncate = 0; - uint32_t do_lower = 1; - // - auto vocab = nvtext::load_vocabulary_file(hash_file); - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings}, - *vocab, - max_sequence_length, - stride, - do_lower, - do_truncate); - } -} + uint32_t max_sequence = 64; + uint32_t stride = 48; + uint32_t do_truncate = 0; + uint32_t do_lower = 1; -class Subword : public cudf::benchmark {}; + auto input = cudf::strings_column_view{strings}; -#define SUBWORD_BM_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \ - BENCHMARK_REGISTER_F(Subword, name) \ - ->RangeMultiplier(2) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows * max_sequence); -SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer); + auto vocab = nvtext::load_vocabulary_file(hash_file); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = + nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate); + }); +} -// BENCHMARK_MAIN(); +NVBENCH_BENCH(bench_subword_tokenizer) + .set_name("subword_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index e83310e0343..b9590c5539f 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -31,17 +31,13 @@ static void bench_tokenize(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); auto const tokenize_type = state.get_string("type"); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const profile = data_profile_builder() - .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width) + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width) .no_validity(); auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); @@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_tokenize) .set_name("tokenize") - 
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216}) + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"}); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 523d277df18..0502f375d99 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state) { auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); + auto const min_width = static_cast(state.get_int64("min_width")); + auto const max_width = static_cast(state.get_int64("max_width")); - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - - auto const column = [num_rows, row_width] { + auto const column = [num_rows, min_width, max_width] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width); auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), @@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state) NVBENCH_BENCH(bench_vocab_tokenize) .set_name("vocab_tokenize") - .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) - .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); + .add_int64_axis("min_width", {0}) + .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("num_rows", {32768, 262144, 2097152}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp deleted file mode 100644 index adc3dddc59c..00000000000 --- a/cpp/benchmarks/text/word_minhash.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include - -#include - -#include - -static void bench_word_minhash(nvbench::state& state) -{ - auto const num_rows = static_cast(state.get_int64("num_rows")); - auto const row_width = static_cast(state.get_int64("row_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); - auto const base64 = state.get_int64("hash_type") == 64; - - data_profile const strings_profile = - data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); - auto strings_table = - create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); - - auto const num_offsets = (num_rows / row_width) + 1; - auto offsets = cudf::sequence(num_offsets, - cudf::numeric_scalar(0), - cudf::numeric_scalar(row_width)); - - auto source = cudf::make_lists_column(num_offsets - 1, - std::move(offsets), - std::move(strings_table->release().front()), - 0, - rmm::device_buffer{}); - - data_profile const seeds_profile = data_profile_builder().no_validity().distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, 256); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - - cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); - auto chars_size = input.chars_size(cudf::get_default_stream()); - state.add_global_memory_reads(chars_size); - state.add_global_memory_writes(num_rows); // output are hashes - - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view()) - : nvtext::word_minhash(source->view(), seeds.view()); - }); -} - -NVBENCH_BENCH(bench_word_minhash) - .set_name("word_minhash") - .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) - .add_int64_axis("row_width", {10, 100, 1000}) - .add_int64_axis("seed_count", {2, 25}) - .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake deleted file mode 100644 index 90b0f4d8a8e..00000000000 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -# Use CPM to find or clone speedlog -function(find_and_configure_spdlog) - - include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog( - FMT_OPTION "EXTERNAL_FMT_HO" - INSTALL_EXPORT_SET cudf-exports - BUILD_EXPORT_SET cudf-exports - ) - -endfunction() - -find_and_configure_spdlog() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index dcf9c1139f9..d5cadce40c2 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,16 +3,6 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "${current_json_dir}/cccl_symbol_visibility.diff", - "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]", - "fixed_in" : "2.6" - }, - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff deleted file mode 100644 index f745d5fa314..00000000000 --- a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff +++ /dev/null @@ -1,27 +0,0 @@ -diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -index e7c62c031b..5db861853a 100644 ---- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config -+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config -@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t; - # define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) - # endif - --# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS - # define _LIBCUDACXX_HIDDEN -@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, - # define _LIBCUDACXX_WEAK __attribute__((__weak__)) - # endif - --// Redefine some macros for internal use --# if defined(__cuda_std__) --# undef _LIBCUDACXX_FUNC_VIS --# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY --# undef _LIBCUDACXX_TYPE_VIS --# define _LIBCUDACXX_TYPE_VIS --# endif // __cuda_std__ -- - // Thread API - # ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL - # if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff deleted file mode 100644 index 6ae1e1c917b..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ /dev/null @@ -1,25 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 
64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index cb0cc55f4d2..5f1981e9806 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,20 +1,20 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 +index 29510db5e..cf57e5786 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh @@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: + const bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -374,7 +374,7 @@ public: // KeyT max_key = oob_default; - + -#pragma unroll +#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) @@ -27,7 +27,7 @@ index 7d9e8622f..da5627306 100644 @@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - + -#pragma unroll +#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 1c1052487f2..5032a073b58 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable, in between the two settings will be excluded from the written log. The available levels are the same as for the CMake variable. * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime. -For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that +For example, calling `cudf::default_logger().set_level(level_enum::err)`, will exclude any messages that are not errors or critical errors. This API should not be used within libcudf to manipulate logging, its purpose is to allow upstream users to configure libcudf logging to fit their application. By default, logging messages are output to stderr. Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the specified path (can be relative to the current directory). -Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to -standard output or even a custom spdlog sink. +Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to +standard output. 
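As an illustration of the runtime controls described in the guide text above, a minimal usage sketch follows. Only `cudf::default_logger().set_level(level_enum::err)` and `cudf::default_logger().sinks()` are taken from the guide; the include path and the `cudf::` scoping of `level_enum` are assumptions, not confirmed by this patch.

```cpp
// Hypothetical upstream-application snippet; the header path and the enum
// scoping are assumptions based on the guide excerpt above, not on this patch.
#include <cudf/logger.hpp>

void configure_libcudf_logging()
{
  // Runtime filter: per the guide, anything below error severity is excluded,
  // so only errors and critical errors are emitted.
  cudf::default_logger().set_level(cudf::level_enum::err);

  // The guide also notes that sinks can be added or the log diverted to
  // standard output via cudf::default_logger().sinks(); the concrete sink
  // types are not shown in this patch.
}
```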
# Data Types diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35a39ef9758..ea480b133dc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -33,11 +33,13 @@ #include #include +#include #include #include #include #include +#include /** * @file column_device_view.cuh @@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf { * */ struct nullate { - struct YES : std::bool_constant {}; - struct NO : std::bool_constant {}; + struct YES : cuda::std::bool_constant {}; + struct NO : cuda::std::bool_constant {}; /** * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than * compile time. The calling code is responsible for specifying whether or not nulls are @@ -80,7 +82,7 @@ struct nullate { * @return `true` if nulls are expected in the operation in which this object is applied, * otherwise false */ - constexpr operator bool() const noexcept { return value; } + CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; } bool value; ///< True if nulls are expected }; }; @@ -319,14 +321,14 @@ class alignas(16) column_device_view_base { } template - struct has_element_accessor_impl : std::false_type {}; + struct has_element_accessor_impl : cuda::std::false_type {}; template struct has_element_accessor_impl< C, T, - void_t().template element(std::declval()))>> - : std::true_type {}; + void_t().template element(cuda::std::declval()))>> + : cuda::std::true_type {}; }; // @cond // Forward declaration @@ -460,7 +462,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { */ struct index_element_fn { template () and std::is_unsigned_v)> + CUDF_ENABLE_IF(is_index_type() and std::is_signed_v)> __device__ size_type operator()(column_device_view const& indices, size_type index) { return static_cast(indices.element(index)); @@ -468,10 +470,10 @@ class alignas(16) column_device_view : public detail::column_device_view_base { template () and std::is_unsigned_v))> + CUDF_ENABLE_IF(not(is_index_type() and std::is_signed_v))> __device__ size_type operator()(Args&&... 
args) { - CUDF_UNREACHABLE("dictionary indices must be an unsigned integral type"); + CUDF_UNREACHABLE("dictionary indices must be a signed integral type"); } }; @@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @return `true` if `column_device_view::element()` has a valid overload, `false` otherwise */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } @@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view * @return `true` if `mutable_column_device_view::element()` has a valid overload, `false` */ template - static constexpr bool has_element_accessor() + CUDF_HOST_DEVICE static constexpr bool has_element_accessor() { return has_element_accessor_impl::value; } diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index de53e7586cd..c30c3d6f4bd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -36,7 +36,7 @@ namespace cudf { namespace detail { template -constexpr bool is_product_supported() +CUDF_HOST_DEVICE constexpr bool is_product_supported() { return is_numeric(); } diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 4159e324472..9226697a7f6 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -16,300 +16,25 @@ #pragma once -#include -#include #include -#include #include #include -#include -#include -#include -#include #include #include #include #include -#include #include -#include -#include #include -#include #include #include -#include -#include #include #include -#include - namespace cudf { namespace detail { -// Compute the count of elements that pass the mask within each block -template -CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - int count = 0; - - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - count += __syncthreads_count(mask_true); - tid += block_size; - } - - if (threadIdx.x == 0) block_counts[blockIdx.x] = count; -} - -// Compute the exclusive prefix sum of each thread's mask value within each block -template -__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum) -{ - int offset = 0; - - using BlockScan = cub::BlockScan; - __shared__ typename BlockScan::TempStorage temp_storage; - BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum); - - return offset; -} - -// This kernel scatters data and validity mask of a column based on the -// scan of the boolean mask. The block offsets for the scan are already computed. -// Just compute the scan of the mask in each block and add it to the block's -// output offset. This is the output index of each element. Scattering -// the valid mask is not as easy, because each thread is only responsible for -// one bit. Warp-level processing (ballot) makes this simpler. -// To make scattering efficient, we "coalesce" the block's scattered data and -// valids in shared memory, and then write from shared memory to global memory -// in a contiguous manner. 
-// The has_validity template parameter specializes this kernel for the -// non-nullable case for performance without writing another kernel. -// -// Note: `filter` is not run on indices larger than the input column size -template -__launch_bounds__(block_size) CUDF_KERNEL - void scatter_kernel(cudf::mutable_column_device_view output_view, - cudf::size_type* output_null_count, - cudf::column_device_view input_view, - cudf::size_type const* __restrict__ block_offsets, - cudf::size_type size, - cudf::size_type per_thread, - Filter filter) -{ - T* __restrict__ output_data = output_view.data(); - cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask(); - static_assert(block_size <= 1024, "Maximum thread block size exceeded"); - - int tid = threadIdx.x + per_thread * block_size * blockIdx.x; - cudf::size_type block_offset = block_offsets[blockIdx.x]; - - // one extra warp worth in case the block is not aligned - __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1]; - __shared__ T temp_data[block_size]; - - cudf::size_type warp_valid_counts{0}; // total valid sum over the `per_thread` loop below - cudf::size_type block_sum = 0; // count passing filter over the `per_thread` loop below - - // Note that since the maximum gridDim.x on all supported GPUs is as big as - // cudf::size_type, this loop is sufficient to cover our maximum column size - // regardless of the value of block_size and per_thread. - for (int i = 0; i < per_thread; i++) { - bool mask_true = (tid < size) && filter(tid); - - cudf::size_type tmp_block_sum = 0; - // get output location using a scan of the mask result - cudf::size_type const local_index = block_scan_mask(mask_true, tmp_block_sum); - block_sum += tmp_block_sum; - - if (has_validity) { - temp_valids[threadIdx.x] = false; // init shared memory - if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false; - __syncthreads(); // wait for init - } - - if (mask_true) { - temp_data[local_index] = input_view.data()[tid]; // scatter data to shared - - // scatter validity mask to shared memory - if (has_validity and input_view.is_valid(tid)) { - // determine aligned offset for this warp's output - cudf::size_type const aligned_offset = block_offset % cudf::detail::warp_size; - temp_valids[local_index + aligned_offset] = true; - } - } - - __syncthreads(); // wait for shared data and validity mask to be complete - - // Copy output data coalesced from shared to global - if (threadIdx.x < tmp_block_sum) - output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x]; - - if (has_validity) { - // Since the valid bools are contiguous in shared memory now, we can use - // __popc to combine them into a single mask element. - // Then, most mask elements can be directly copied from shared to global - // memory. Only the first and last 32-bit mask elements of each block must - // use an atomicOr, because these are where other blocks may overlap. 
- - constexpr int num_warps = block_size / cudf::detail::warp_size; - // account for partial blocks with non-warp-aligned offsets - int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1; - int const last_warp = min(num_warps, last_index / cudf::detail::warp_size); - int const wid = threadIdx.x / cudf::detail::warp_size; - int const lane = threadIdx.x % cudf::detail::warp_size; - - cudf::size_type tmp_warp_valid_counts{0}; - - if (tmp_block_sum > 0 && wid <= last_warp) { - int valid_index = (block_offset / cudf::detail::warp_size) + wid; - - // compute the valid mask for this warp - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]); - - // Note the atomicOr's below assume that output_valid has been set to - // all zero before the kernel - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts = __popc(valid_warp); - if (wid > 0 && wid < last_warp) - output_valid[valid_index] = valid_warp; - else { - cuda::atomic_ref ref{ - output_valid[valid_index]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - - // if the block is full and not aligned then we have one more warp to cover - if ((wid == 0) && (last_warp == num_warps)) { - uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); - if (lane == 0 && valid_warp != 0) { - tmp_warp_valid_counts += __popc(valid_warp); - cuda::atomic_ref ref{ - output_valid[valid_index + num_warps]}; - ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); - } - } - } - warp_valid_counts += tmp_warp_valid_counts; - } - - block_offset += tmp_block_sum; - tid += block_size; - } - // Compute total null_count for this block and add it to global count - constexpr cudf::size_type leader_lane{0}; - cudf::size_type block_valid_count = - cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); - - if (threadIdx.x == 0) { // one thread computes and adds to null count - cuda::atomic_ref ref{*output_null_count}; - ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); - } -} - -template -struct DeviceType { - using type = T; -}; - -template -struct DeviceType()>> { - using type = typename T::rep; -}; - -template -struct DeviceType()>> { - using type = typename cudf::device_storage_type_t; -}; - -// Dispatch functor which performs the scatter for fixed column types and gather for other -template -struct scatter_gather_functor { - template ()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const* block_offsets, - Filter filter, - cudf::size_type per_thread, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto output_column = - cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr); - auto output = output_column->mutable_view(); - - bool has_valid = input.nullable(); - - using Type = typename DeviceType::type; - - auto scatter = (has_valid) ? scatter_kernel - : scatter_kernel; - - cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - - cudf::detail::device_scalar null_count{0, stream}; - if (output.nullable()) { - // Have to initialize the output mask to all zeros because we may update - // it with atomicOr(). 
- CUDF_CUDA_TRY(cudaMemsetAsync(static_cast(output.null_mask()), - 0, - cudf::bitmask_allocation_size_bytes(output.size()), - stream.value())); - } - - auto output_device_view = cudf::mutable_column_device_view::create(output, stream); - auto input_device_view = cudf::column_device_view::create(input, stream); - scatter<<>>(*output_device_view, - null_count.data(), - *input_device_view, - block_offsets, - input.size(), - per_thread, - filter); - - if (has_valid) { output_column->set_null_count(null_count.value(stream)); } - return output_column; - } - - template () and !cudf::is_fixed_point()>* = nullptr> - std::unique_ptr operator()(cudf::column_view const& input, - cudf::size_type const& output_size, - cudf::size_type const*, - Filter filter, - cudf::size_type, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - rmm::device_uvector indices(output_size, stream); - - thrust::copy_if(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - indices.begin(), - filter); - - auto output_table = cudf::detail::gather(cudf::table_view{{input}}, - indices, - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - - // There will be only one column - return std::make_unique(std::move(output_table->get_column(0))); - } -}; - /** * @brief Filters `input` using a Filter function object * @@ -319,9 +44,11 @@ struct scatter_gather_functor { * false otherwise. * * @tparam Filter the filter functor type - * @param[in] input The table_view to filter - * @param[in] filter A function object that takes an index and returns a bool - * @return unique_ptr The table generated from filtered `input`. + * @param input The table_view to filter + * @param filter A function object that takes an index and returns a bool + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for allocating the returned memory + * @return The table generated from filtered `input` */ template std::unique_ptr
copy_if(table_view const& input,
@@ -333,76 +60,22 @@ std::unique_ptr
copy_if(table_view const& input, if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); } - constexpr int block_size = 256; - cudf::size_type per_thread = - elements_per_thread(compute_block_counts, input.num_rows(), block_size); - cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread}; - - // temp storage for block counts and offsets - rmm::device_uvector block_counts(grid.num_blocks, stream); - rmm::device_uvector block_offsets(grid.num_blocks + 1, stream); - - // 1. Find the count of elements in each block that "pass" the mask - compute_block_counts<<>>( - block_counts.begin(), input.num_rows(), per_thread, filter); - - // initialize just the first element of block_offsets to 0 since the InclusiveSum below - // starts at the second element. - CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value())); - - // 2. Find the offset for each block's output using a scan of block counts - if (grid.num_blocks > 1) { - // Determine and allocate temporary device storage - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(nullptr, - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - - // Run exclusive prefix sum - cub::DeviceScan::InclusiveSum(d_temp_storage.data(), - temp_storage_bytes, - block_counts.begin(), - block_offsets.begin() + 1, - grid.num_blocks, - stream.value()); - } - - // As it is InclusiveSum, last value in block_offsets will be output_size - // unless num_blocks == 1, in which case output_size is just block_counts[0] - cudf::size_type output_size{0}; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &output_size, - grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(), - sizeof(cudf::size_type), - cudaMemcpyDefault, - stream.value())); + auto indices = rmm::device_uvector(input.num_rows(), stream); + auto const begin = thrust::counting_iterator(0); + auto const end = begin + input.num_rows(); + auto const indices_end = + thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter); - stream.synchronize(); + auto const output_size = static_cast(thrust::distance(indices.begin(), indices_end)); - if (output_size == input.num_rows()) { - return std::make_unique
(input, stream, mr); - } else if (output_size > 0) { - std::vector> out_columns(input.num_columns()); - std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) { - return cudf::type_dispatcher(col_view.type(), - scatter_gather_functor{}, - col_view, - output_size, - block_offsets.begin(), - filter, - per_thread, - stream, - mr); - }); + // nothing selected + if (output_size == 0) { return empty_like(input); } + // everything selected + if (output_size == input.num_rows()) { return std::make_unique
(input, stream, mr); } - return std::make_unique
(std::move(out_columns)); - } else { - return empty_like(input); - } + auto const map = device_span(indices.data(), output_size); + return cudf::detail::gather( + input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr); } } // namespace detail diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index 5dc75b1a3fb..a7efb4e6e93 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL mutable_column_device_view out, size_type* __restrict__ const valid_count) { - auto tidx = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - int const warp_id = tidx / cudf::detail::warp_size; - size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size; + auto tidx = cudf::detail::grid_1d::global_thread_id(); + + auto const stride = cudf::detail::grid_1d::grid_stride(); + auto const warp_id = tidx / cudf::detail::warp_size; + auto const warps_per_grid = stride / cudf::detail::warp_size; // begin/end indices for the column data size_type const begin = 0; @@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // lane id within the current warp constexpr size_type leader_lane{0}; - int const lane_id = threadIdx.x % cudf::detail::warp_size; + auto const lane_id = threadIdx.x % cudf::detail::warp_size; size_type warp_valid_count{0}; diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index fcb80fe45f7..022c5c40ea0 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -56,15 +56,15 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, constexpr cudf::size_type leader_lane{0}; int const lane_id = threadIdx.x % warp_size; - cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - int const warp_id = tid / warp_size; + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const warp_id = tid / warp_size; cudf::size_type const offset = target.offset(); cudf::size_type const begin_mask_idx = cudf::word_index(offset + target_begin); cudf::size_type const end_mask_idx = cudf::word_index(offset + target_end); cudf::size_type mask_idx = begin_mask_idx + warp_id; - cudf::size_type const masks_per_grid = gridDim.x * blockDim.x / warp_size; + cudf::size_type const masks_per_grid = cudf::detail::grid_1d::grid_stride() / warp_size; cudf::size_type target_offset = begin_mask_idx * warp_size - (offset + target_begin); cudf::size_type source_idx = tid + target_offset; @@ -92,7 +92,7 @@ CUDF_KERNEL void copy_range_kernel(SourceValueIterator source_value_begin, } } - source_idx += blockDim.x * gridDim.x; + source_idx += cudf::detail::grid_1d::grid_stride(); mask_idx += masks_per_grid; } diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh index 5ea0d06039f..1bfb40e5916 100644 --- a/cpp/include/cudf/detail/get_value.cuh +++ b/cpp/include/cudf/detail/get_value.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre CUDF_EXPECTS(data_type(type_to_id()) == col_view.type(), "get_value data type mismatch"); CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(), "invalid element_index value"); - T result; - CUDF_CUDA_TRY(cudaMemcpyAsync( - &result, col_view.data() + element_index, sizeof(T), cudaMemcpyDefault, stream.value())); - stream.synchronize(); - return result; + return cudf::detail::make_host_vector_sync( + device_span{col_view.data() + element_index, 1}, stream) + .front(); } } // namespace detail diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 025e2ccc3ec..17ecc0f5539 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -67,7 +67,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type source_size_bits, size_type* count_ptr) { - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const last_bit_index = source_size_bits - 1; auto const last_word_index = cudf::word_index(last_bit_index); @@ -75,7 +75,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op, size_type thread_count = 0; for (size_type destination_word_index = tid; destination_word_index < destination.size(); - destination_word_index += blockDim.x * gridDim.x) { + destination_word_index += cudf::detail::grid_1d::grid_stride()) { bitmask_type destination_word = detail::get_mask_offset_word(source[0], destination_word_index, @@ -214,8 +214,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; - size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - size_type range_id = tid; + auto range_id = cudf::detail::grid_1d::global_thread_id(); while (range_id < num_ranges) { size_type const first_bit_index = *(first_bit_indices + range_id); @@ -243,7 +242,7 @@ CUDF_KERNEL void subtract_set_bits_range_boundaries_kernel(bitmask_type const* b // Update the null count with the computed delta. 
size_type updated_null_count = *(null_counts + range_id) + delta; *(null_counts + range_id) = updated_null_count; - range_id += blockDim.x * gridDim.x; + range_id += cudf::detail::grid_1d::grid_stride(); } } diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 61a8e9f7ec3..72cdc3d8067 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -74,9 +74,10 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type global_thread_id(thread_index_type thread_id, - thread_index_type block_id, - thread_index_type num_threads_per_block) + __device__ static constexpr thread_index_type global_thread_id( + thread_index_type thread_id, + thread_index_type block_id, + thread_index_type num_threads_per_block) { return thread_id + block_id * num_threads_per_block; } @@ -114,8 +115,8 @@ class grid_1d { * @param num_threads_per_block The number of threads per block * @return thread_index_type The global thread index */ - static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, - thread_index_type num_blocks_per_grid) + __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block, + thread_index_type num_blocks_per_grid) { return num_threads_per_block * num_blocks_per_grid; } diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 46f424e051b..923cd04479d 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,8 @@ #include #include +#include + #include namespace cudf { @@ -42,7 +44,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs) { - return std::min(lhs, rhs); + return cuda::std::min(lhs, rhs); } /** @@ -53,7 +55,7 @@ template ()>* = nullptr> CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs) { - return std::max(lhs, rhs); + return cuda::std::max(lhs, rhs); } } // namespace detail @@ -68,22 +70,26 @@ struct DeviceSum { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{typename T::duration{0}}; } template () && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{0}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support device operator identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support device operator identity"); +#endif return T{}; } }; @@ -105,7 +111,7 @@ struct DeviceCount { } template - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{}; } @@ -125,7 +131,7 @@ struct DeviceMin { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::max() // https://eel.is/c++draft/numeric.limits.general#6 @@ -139,9 +145,13 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMin identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity"); +#endif return cuda::std::numeric_limits::max(); } @@ -153,7 +163,7 @@ struct DeviceMin { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::max_value()); } @@ -173,7 +183,7 @@ struct DeviceMax { template && !cudf::is_dictionary() && !cudf::is_fixed_point()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { // chrono types do not have std::numeric_limits specializations and should use T::min() // https://eel.is/c++draft/numeric.limits.general#6 @@ -187,9 +197,13 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceMax identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity"); +#endif return cuda::std::numeric_limits::lowest(); } @@ -200,7 +214,7 @@ struct DeviceMax { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return static_cast(T::lowest_value()); } @@ -217,15 +231,19 @@ struct DeviceProduct { } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { return T{1}; } template ()>* = nullptr> - static constexpr T identity() + CUDF_HOST_DEVICE static constexpr T identity() { +#ifndef __CUDA_ARCH__ CUDF_FAIL("fixed_point does not yet support DeviceProduct identity"); +#else + CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity"); +#endif return T{1, numeric::scale_type{0}}; } 
}; diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 8b709f2a8f8..2e3d71815c0 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept * `modulus` is positive and does not check for overflow. */ template -constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } @@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, } // namespace detail /** - * Divides the left-hand-side by the right-hand-side, rounding up + * @brief Divides the left-hand-side by the right-hand-side, rounding up * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3. * - * @param dividend the number to divide - * @param divisor the number of by which to divide - * @return The least integer multiple of {@link divisor} which is greater than or equal to - * the non-integral division dividend/divisor. + * The result is undefined if `divisor == 0` or + * if `divisor == -1` and `dividend == min()`. + * + * Will not overflow, and may _or may not_ be slower than the intuitive + * approach of using `(dividend + divisor - 1) / divisor`. * - * @note will not overflow, and may _or may not_ be slower than the intuitive - * approach of using (dividend + divisor - 1) / divisor + * @tparam I Integer type for `dividend`, `divisor`, and the return type + * @param dividend The number to divide + * @param divisor The number by which to divide + * @return The least integer multiple of `divisor` which is greater than or equal to + * the non-integral division `dividend/divisor` */ template constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept @@ -183,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept * @return Absolute value if value type is signed. */ template -constexpr auto absolute_value(T value) -> T +CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T { if constexpr (cuda::std::is_signed()) return numeric::detail::abs(value); return value; diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp deleted file mode 100644 index e7643eb44bd..00000000000 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -// Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index dc81fd74992..ced6bd2afa4 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -41,7 +41,7 @@ namespace dictionary { * * The null mask and null count are copied from the input column to the output column. * - * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if indices type is not a signed integer type * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} @@ -58,7 +58,7 @@ namespace dictionary { */ std::unique_ptr encode( column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, + data_type indices_type = data_type{type_id::INT32}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp index fce08b4a5c4..9e68bafb09a 100644 --- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct floating_converter { * @param integer_rep The bit-casted floating value to extract the exponent from * @return The stored base-2 exponent and significand, shifted for denormals */ - CUDF_HOST_DEVICE inline static std::pair get_significand_and_pow2( + CUDF_HOST_DEVICE inline static cuda::std::pair get_significand_and_pow2( IntegralType integer_rep) { // Extract the significand @@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** @@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int } // Our shifting_rep is now the integer mantissa, return it and the powers of 2 - return std::pair{shifting_rep, pow2}; + return cuda::std::pair{shifting_rep, pow2}; } /** diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh index 0ec41a20ef1..fd3455e761d 100644 --- a/cpp/include/cudf/hashing/detail/hash_functions.cuh +++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh @@ -18,7 +18,8 @@ #include -#include +#include +#include namespace cudf::hashing::detail { @@ -29,7 +30,7 @@ template T __device__ inline normalize_nans(T const& key) { if constexpr (cudf::is_floating_point()) { - if (std::isnan(key)) { return std::numeric_limits::quiet_NaN(); } + if (cuda::std::isnan(key)) { return cuda::std::numeric_limits::quiet_NaN(); } } return 
key; } diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index a978e54a1b9..7cb80081a95 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -82,7 +82,7 @@ std::unique_ptr xxhash_64(table_view const& input, * @param rhs The second hash value * @return Combined hash value */ -constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) +CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs) { return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2)); } diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh index 5e88b905023..31390aa3edf 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x64_128.cuh @@ -15,177 +15,63 @@ */ #pragma once +#include +#include #include #include -#include +#include +#include +#include namespace cudf::hashing::detail { -// MurmurHash3_x64_128 implementation from -// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. template struct MurmurHash3_x64_128 { - using result_type = thrust::pair; + using result_type = cuda::std::array; - constexpr MurmurHash3_x64_128() = default; - constexpr MurmurHash3_x64_128(uint64_t seed) : m_seed(seed) {} - - __device__ inline uint32_t getblock32(std::byte const* data, cudf::size_type offset) const + CUDF_HOST_DEVICE constexpr MurmurHash3_x64_128(uint64_t seed = cudf::DEFAULT_HASH_SEED) + : _impl{seed} { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); } - __device__ inline uint64_t getblock64(std::byte const* data, cudf::size_type offset) const - { - uint64_t result = getblock32(data, offset + 4); - result = result << 32; - return result | getblock32(data, offset); - } + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } - __device__ inline uint64_t fmix64(uint64_t k) const + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const { - k ^= k >> 33; - k *= 0xff51afd7ed558ccdUL; - k ^= k >> 33; - k *= 0xc4ceb9fe1a85ec53UL; - k ^= k >> 33; - return k; + return this->_impl.compute_hash(bytes, size); } - result_type __device__ inline operator()(Key const& key) const { return compute(key); } - + private: template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a 8-byte chunk. 
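hash_combine above is now CUDF_HOST_DEVICE, so the same Boost-style mixing step can fold per-column hashes on both host and device. A host-only sketch of that folding is below; the zero seed and the fold order are illustrative, not cuDF's row-hashing policy.

#include <cstdint>
#include <iostream>
#include <vector>

// Same mixing step as the hash_combine above (Boost-style) for 32-bit hash values.
constexpr std::uint32_t hash_combine(std::uint32_t lhs, std::uint32_t rhs)
{
  return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2));
}

int main()
{
  // Fold per-column hash values into a single row hash (seed of 0 is illustrative).
  std::vector<std::uint32_t> column_hashes{0x1234u, 0xabcdu, 0x42u};
  std::uint32_t row_hash = 0;
  for (auto h : column_hashes) { row_hash = hash_combine(row_hash, h); }
  std::cout << std::hex << row_hash << '\n';
  return 0;
}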
- uint64_t k1 = 0; - uint64_t k2 = 0; - auto const tail = reinterpret_cast(data) + tail_offset; - switch (len & (BLOCK_SIZE - 1)) { - case 15: k2 ^= static_cast(tail[14]) << 48; - case 14: k2 ^= static_cast(tail[13]) << 40; - case 13: k2 ^= static_cast(tail[12]) << 32; - case 12: k2 ^= static_cast(tail[11]) << 24; - case 11: k2 ^= static_cast(tail[10]) << 16; - case 10: k2 ^= static_cast(tail[9]) << 8; - case 9: - k2 ^= static_cast(tail[8]) << 0; - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - h.second ^= k2; - - case 8: k1 ^= static_cast(tail[7]) << 56; - case 7: k1 ^= static_cast(tail[6]) << 48; - case 6: k1 ^= static_cast(tail[5]) << 40; - case 5: k1 ^= static_cast(tail[4]) << 32; - case 4: k1 ^= static_cast(tail[3]) << 24; - case 3: k1 ^= static_cast(tail[2]) << 16; - case 2: k1 ^= static_cast(tail[1]) << 8; - case 1: - k1 ^= static_cast(tail[0]) << 0; - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - h.first ^= k1; - }; - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const + __device__ constexpr result_type compute(T const& key) const { - auto const nblocks = len / BLOCK_SIZE; - uint64_t h1 = m_seed; - uint64_t h2 = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint64_t k1 = getblock64(data, (i * BLOCK_SIZE)); // 1st 8 bytes - uint64_t k2 = getblock64(data, (i * BLOCK_SIZE) + (BLOCK_SIZE / 2)); // 2nd 8 bytes - - k1 *= c1; - k1 = rotate_bits_left(k1, 31); - k1 *= c2; - - h1 ^= k1; - h1 = rotate_bits_left(h1, 27); - h1 += h2; - h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = rotate_bits_left(k2, 33); - k2 *= c1; - - h2 ^= k2; - h2 = rotate_bits_left(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; - } - - thrust::tie(h1, h2) = compute_remaining_bytes(data, len, nblocks * BLOCK_SIZE, {h1, h2}); - - // Finalize hash. 
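The MurmurHash3_x64_128 rewrite above delegates to cuco::murmurhash3_x64_128 and, together with the cuda::std::isnan change earlier, keeps NaNs normalized before hashing. The host-only sketch below illustrates why that normalization matters for a byte-wise hash; byte_hash (64-bit FNV-1a) and hash_double are illustrative helpers, not cuDF APIs.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <limits>

// Illustrative byte-wise hash (64-bit FNV-1a); stands in for a Murmur-style hash.
std::uint64_t byte_hash(void const* data, std::size_t size)
{
  auto const* bytes = static_cast<unsigned char const*>(data);
  std::uint64_t h   = 0xcbf29ce484222325ULL;
  for (std::size_t i = 0; i < size; ++i) {
    h ^= bytes[i];
    h *= 0x100000001b3ULL;
  }
  return h;
}

// Same idea as normalize_nans() above: collapse every NaN payload to one canonical
// NaN so that all NaNs hash to the same value.
double normalize(double key)
{
  return std::isnan(key) ? std::numeric_limits<double>::quiet_NaN() : key;
}

std::uint64_t hash_double(double key)
{
  key = normalize(key);
  std::uint64_t bits;
  std::memcpy(&bits, &key, sizeof(bits));
  return byte_hash(&bits, sizeof(bits));
}

int main()
{
  double const a = std::nan("1");  // NaNs with potentially different payload bits
  double const b = std::nan("2");
  std::cout << (hash_double(a) == hash_double(b)) << '\n';  // prints 1
  return 0;
}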
- h1 ^= len; - h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return {h1, h2}; + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); } - private: - uint64_t m_seed{}; - static constexpr uint32_t BLOCK_SIZE = 16; // 2 x 64-bit = 16 bytes - - static constexpr uint64_t c1 = 0x87c37b91114253d5UL; - static constexpr uint64_t c2 = 0x4cf5ad432745937fUL; + cuco::murmurhash3_x64_128 _impl; }; template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( bool const& key) const { - return compute(key); + return this->compute(key); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( float const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( double const& key) const { - return compute(normalize_nans(key)); + return this->compute(normalize_nans(key)); } template <> @@ -193,9 +79,8 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( cudf::string_view const& key) const { - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); } template <> @@ -203,7 +88,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal32 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -211,7 +96,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal64 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } template <> @@ -219,7 +104,7 @@ MurmurHash3_x64_128::result_type __device__ inline MurmurHash3_x64_128::operator()( numeric::decimal128 const& key) const { - return compute(key.value()); + return this->compute(key.value()); } } // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh index 38a7d927b9c..e0c7ce840d7 100644 --- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh +++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh @@ -33,7 +33,7 @@ template struct MurmurHash3_x86_32 { using result_type = hash_value_type; - __host__ __device__ constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) + CUDF_HOST_DEVICE constexpr MurmurHash3_x86_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} { } diff --git a/cpp/include/cudf/hashing/detail/xxhash_64.cuh b/cpp/include/cudf/hashing/detail/xxhash_64.cuh index 7d72349e340..d77d040b365 100644 --- a/cpp/include/cudf/hashing/detail/xxhash_64.cuh +++ b/cpp/include/cudf/hashing/detail/xxhash_64.cuh @@ -31,7 +31,7 @@ template struct XXHash_64 { using result_type = std::uint64_t; - __host__ __device__ constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + CUDF_HOST_DEVICE constexpr XXHash_64(uint64_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index bfe76d5690c..b561d0989e9 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -410,6 +410,7 @@ class 
parquet_reader_options_builder { * * @param val Boolean value whether to read matching projected and filter columns from mismatched * Parquet sources. + * * @return this for chaining. */ parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val) diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 85349a421b1..84957ab9f1d 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -31,7 +31,7 @@ namespace strings::detail { * @param chr Any single byte from a valid UTF-8 character * @return true if this is not the first byte of the character */ -constexpr bool is_utf8_continuation_char(unsigned char chr) +CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr) { // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character. return (chr & 0xC0) == 0x80; @@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr) * @param chr Any single byte from a valid UTF-8 character * @return true if this the first byte of the character */ -constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); } +CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr) +{ + return not is_utf8_continuation_char(chr); +} /** * @brief This will return true if the passed in byte could be the start of @@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin * @param byte The byte to be tested * @return true if this can be the first byte of a character */ -constexpr bool is_valid_begin_utf8_char(uint8_t byte) +CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte) { // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of: // 0b0vvvvvvv a 1 byte character @@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte) * @param character Single character * @return Number of bytes */ -constexpr size_type bytes_in_char_utf8(char_utf8 character) +CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character) * @param byte Byte from an encoded character. * @return Number of bytes. */ -constexpr size_type bytes_in_utf8_byte(uint8_t byte) +CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte) { return 1 + static_cast((byte & 0xF0) == 0xF0) // 4-byte character prefix + static_cast((byte & 0xE0) == 0xE0) // 3-byte character prefix @@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte) * @param[out] character Single char_utf8 value. * @return The number of bytes in the character */ -constexpr size_type to_char_utf8(char const* str, char_utf8& character) +CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character) { size_type const chr_width = bytes_in_utf8_byte(static_cast(*str)); @@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character) * @param[out] str Output array. 
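The utf8.hpp helpers above become CUDF_HOST_DEVICE so the same byte tests run in kernels and on the host. Below is a host-only sketch of how the continuation-byte test ((chr & 0xC0) == 0x80) is typically used to count code points; count_characters is an illustrative helper, not the cuDF API.

#include <cassert>
#include <cstddef>
#include <string>

// A UTF-8 continuation byte has the bit pattern 10xxxxxx, i.e. (chr & 0xC0) == 0x80,
// the same test as is_utf8_continuation_char above.
constexpr bool is_continuation(unsigned char chr) { return (chr & 0xC0) == 0x80; }

// Count code points by counting the bytes that begin a character.
std::size_t count_characters(std::string const& str)
{
  std::size_t count = 0;
  for (unsigned char chr : str) {
    if (!is_continuation(chr)) { ++count; }
  }
  return count;
}

int main()
{
  assert(count_characters("abc") == 3);
  assert(count_characters("caf\xC3\xA9") == 4);    // 2-byte e-acute counts once
  assert(count_characters("\xE2\x82\xAC") == 1);   // 3-byte U+20AC counts once
  return 0;
}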
* @return The number of bytes in the character */ -constexpr inline size_type from_char_utf8(char_utf8 character, char* str) +CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str) { size_type const chr_width = bytes_in_char_utf8(character); for (size_type idx = 0; idx < chr_width; ++idx) { @@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str) * @param utf8_char Single UTF-8 character to convert. * @return Code-point for the UTF-8 character. */ -constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) +CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) { uint32_t unchr = 0; if (utf8_char < 0x0000'0080) // single-byte pass thru @@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char) * @param unchr Character code-point to convert. * @return Single UTF-8 character. */ -constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) +CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) // single byte utf8 diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 34ed3c5618e..f0040e069d8 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -31,6 +31,8 @@ #include #endif +#include + #include // This file should only include device code logic. @@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte * @param pos Character position to count to * @return The number of bytes and the left over non-counted position value */ -__device__ inline std::pair bytes_to_character_position(string_view d_str, - size_type pos) +__device__ inline cuda::std::pair bytes_to_character_position( + string_view d_str, size_type pos) { size_type bytes = 0; auto ptr = d_str.data(); @@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const __device__ inline size_type string_view::byte_offset(size_type pos) const { if (length() == size_bytes()) return pos; - return std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); + return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos)); } __device__ inline int string_view::compare(string_view const& in) const @@ -373,24 +375,23 @@ __device__ inline size_type string_view::find_impl(char const* str, size_type pos, size_type count) const { - auto const nchars = length(); - if (!str || pos < 0 || pos > nchars) return npos; - if (count < 0) count = nchars; + if (!str || pos < 0) { return npos; } + if (pos > 0 && pos > length()) { return npos; } // use iterator to help reduce character/byte counting - auto itr = begin() + pos; + auto const itr = begin() + pos; auto const spos = itr.byte_offset(); - auto const epos = ((pos + count) < nchars) ? (itr + count).byte_offset() : size_bytes(); + auto const epos = + (count >= 0) && ((pos + count) < length()) ? (itr + count).byte_offset() : size_bytes(); auto const find_length = (epos - spos) - bytes + 1; + auto const d_target = string_view{str, bytes}; auto ptr = data() + (forward ? spos : (epos - bytes)); for (size_type idx = 0; idx < find_length; ++idx) { - bool match = true; - for (size_type jdx = 0; match && (jdx < bytes); ++jdx) { - match = (ptr[jdx] == str[jdx]); + if (d_target.compare(ptr, bytes) == 0) { + return forward ? pos : character_offset(epos - bytes - idx); } - if (match) { return forward ? 
pos : character_offset(epos - bytes - idx); } // use pos to record the current find position pos += strings::detail::is_begin_utf8_char(*ptr); forward ? ++ptr : --ptr; diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 3f33c70c29a..8214ea6e83b 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -33,6 +33,8 @@ #include #include +#include +#include #include #include #include @@ -48,11 +50,8 @@ #include #include -#include #include -#include #include -#include namespace CUDF_EXPORT cudf { @@ -287,15 +286,16 @@ class device_row_comparator { * `null_order::BEFORE` for all columns. * @param comparator Physical element relational comparison functor. */ - device_row_comparator(Nullate check_nulls, - table_device_view lhs, - table_device_view rhs, - device_span l_dremel_device_views, - device_span r_dremel_device_views, - std::optional> depth = std::nullopt, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + device_span l_dremel_device_views, + device_span r_dremel_device_views, + cuda::std::optional> depth = cuda::std::nullopt, + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel(l_dremel_device_views), @@ -331,9 +331,9 @@ class device_row_comparator { Nullate check_nulls, table_device_view lhs, table_device_view rhs, - std::optional> column_order = std::nullopt, - std::optional> null_precedence = std::nullopt, - PhysicalElementComparator comparator = {}) noexcept + cuda::std::optional> column_order = cuda::std::nullopt, + cuda::std::optional> null_precedence = cuda::std::nullopt, + PhysicalElementComparator comparator = {}) noexcept : _lhs{lhs}, _rhs{rhs}, _l_dremel{}, @@ -410,7 +410,7 @@ class device_row_comparator { return cuda::std::pair(_comparator(_lhs.element(lhs_element_index), _rhs.element(rhs_element_index)), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); } /** @@ -455,7 +455,7 @@ class device_row_comparator { } if (lcol.num_child_columns() == 0) { - return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits::max()); + return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits::max()); } // Non-empty structs have been modified to only have 1 child when using this. @@ -607,7 +607,7 @@ class device_row_comparator { __device__ constexpr weak_ordering operator()(size_type const lhs_index, size_type const rhs_index) const noexcept { - int last_null_depth = std::numeric_limits::max(); + int last_null_depth = cuda::std::numeric_limits::max(); size_type list_column_index{-1}; for (size_type i = 0; i < _lhs.num_columns(); ++i) { if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; } @@ -626,9 +626,9 @@ class device_row_comparator { // here, otherwise the current code would be failing. auto const [l_dremel_i, r_dremel_i] = _lhs.column(i).type().id() == type_id::LIST - ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), - optional_dremel_view(_r_dremel[list_column_index])) - : std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); + ? 
cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]), + optional_dremel_view(_r_dremel[list_column_index])) + : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{}); auto element_comp = element_comparator{_check_nulls, _lhs.column(i), @@ -658,9 +658,9 @@ class device_row_comparator { device_span const _l_dremel; device_span const _r_dremel; Nullate const _check_nulls; - std::optional> const _depth; - std::optional> const _column_order; - std::optional> const _null_precedence; + cuda::std::optional> const _depth; + cuda::std::optional> const _column_order; + cuda::std::optional> const _null_precedence; PhysicalElementComparator const _comparator; }; // class device_row_comparator @@ -882,10 +882,10 @@ struct preprocessed_table { * @return Device array containing respective column orders. If no explicit column orders were * specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> column_order() const + [[nodiscard]] cuda::std::optional> column_order() const { - return _column_order.size() ? std::optional>(_column_order) - : std::nullopt; + return _column_order.size() ? cuda::std::optional>(_column_order) + : cuda::std::nullopt; } /** @@ -895,10 +895,11 @@ struct preprocessed_table { * @return Device array containing respective column null precedence. If no explicit column null * precedences were specified during the creation of this object then this will be `nullopt`. */ - [[nodiscard]] std::optional> null_precedence() const + [[nodiscard]] cuda::std::optional> null_precedence() const { - return _null_precedence.size() ? std::optional>(_null_precedence) - : std::nullopt; + return _null_precedence.size() + ? cuda::std::optional>(_null_precedence) + : cuda::std::nullopt; } /** @@ -909,9 +910,10 @@ struct preprocessed_table { * @return std::optional> Device array containing respective column depths. * If there are no nested columns in the table then this will be `nullopt`. */ - [[nodiscard]] std::optional> depths() const + [[nodiscard]] cuda::std::optional> depths() const { - return _depths.size() ? std::optional>(_depths) : std::nullopt; + return _depths.size() ? cuda::std::optional>(_depths) + : cuda::std::nullopt; } [[nodiscard]] device_span dremel_device_views() const @@ -940,8 +942,8 @@ struct preprocessed_table { rmm::device_uvector const _depths; // Dremel encoding of list columns used for the comparison algorithm - std::optional> _dremel_data; - std::optional> _dremel_device_views; + cuda::std::optional> _dremel_data; + cuda::std::optional> _dremel_device_views; // Intermediate columns generated from transforming nested children columns into // integers columns using `cudf::rank()`, need to be kept alive. @@ -1808,7 +1810,7 @@ class element_hasher { __device__ element_hasher( Nullate nulls, uint32_t seed = DEFAULT_HASH_SEED, - hash_value_type null_hash = std::numeric_limits::max()) noexcept + hash_value_type null_hash = cuda::std::numeric_limits::max()) noexcept : _check_nulls(nulls), _seed(seed), _null_hash(null_hash) { } @@ -1892,7 +1894,7 @@ class device_row_hasher { */ template
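The row_operators.cuh hunks above replace std::optional, std::pair, and std::numeric_limits with their cuda::std counterparts because the std versions cannot be used in device code. A minimal nvcc-compilable sketch of the same substitution follows; clamp_or_default is an illustrative function, not part of cuDF.

#include <cstdio>

#include <cuda/std/limits>
#include <cuda/std/optional>

// cuda::std types are annotated for host and device, unlike std::optional and
// std::numeric_limits, so this function compiles for both sides.
__host__ __device__ int clamp_or_default(cuda::std::optional<int> value)
{
  if (!value.has_value()) { return cuda::std::numeric_limits<int>::max(); }
  return *value < 0 ? 0 : *value;
}

int main()
{
  // Host call shown here; the same body is equally valid inside a __global__ kernel.
  std::printf("%d %d\n", clamp_or_default(cuda::std::nullopt), clamp_or_default(-5));
  return 0;
}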
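Relatedly, the string_view::find_impl rewrite earlier in this diff matches a byte pattern while tracking the character position with the begin-byte test. A simplified host-only sketch of that idea is below; find_char_pos and the local begin-byte helper are illustrative, not the cuDF implementation.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>

// Local reimplementation of the test above: any byte that is not a continuation
// byte begins a character.
constexpr bool begins_char(unsigned char chr) { return (chr & 0xC0) != 0x80; }

// Return the character (not byte) index of the first occurrence of target in str,
// or -1 if absent.
long find_char_pos(std::string const& str, std::string const& target)
{
  if (target.size() > str.size()) { return -1; }
  long pos = 0;  // character position of the byte window currently being tested
  for (std::size_t byte = 0; byte + target.size() <= str.size(); ++byte) {
    if (std::memcmp(str.data() + byte, target.data(), target.size()) == 0) { return pos; }
    if (begins_char(static_cast<unsigned char>(str[byte]))) { ++pos; }
  }
  return -1;
}

int main()
{
  assert(find_char_pos("caf\xC3\xA9 latte", "latte") == 5);  // 2-byte e-acute is one character
  assert(find_char_pos("abc", "zzz") == -1);
  return 0;
}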