Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add XXHash_32 hasher #17533

Open
wants to merge 27 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
bff159d
Add xxhash_32
PointKernel Dec 5, 2024
8d5c2dd
Change default hash to xxhash_32
PointKernel Dec 5, 2024
24c5a76
Add missing headers
PointKernel Dec 5, 2024
1597e1f
Merge branch 'branch-25.02' into add-xxhash32
PointKernel Dec 6, 2024
d2635a9
Apply suggestions from code review
PointKernel Dec 6, 2024
e25e0e1
Merge branch 'branch-25.02' into add-xxhash32
PointKernel Dec 6, 2024
e6f204c
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 9, 2024
200d557
Add xxhash_32 column hash API
PointKernel Dec 9, 2024
50ac47f
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 10, 2024
4d69dc9
Add xxhash32 tests
PointKernel Dec 10, 2024
03c3c81
Add xxhash in cmake
PointKernel Dec 10, 2024
c660740
Update reference values
PointKernel Dec 10, 2024
4277feb
Minor
PointKernel Dec 10, 2024
8070750
Update expected hash results
PointKernel Dec 11, 2024
7ca8b2b
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 11, 2024
fe88247
Merge remote-tracking branch 'origin/add-xxhash32' into add-xxhash32
PointKernel Dec 11, 2024
1b486ba
Merge branch 'branch-25.02' into add-xxhash32
bdice Dec 11, 2024
f93bde9
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 16, 2024
71df59e
Merge remote-tracking branch 'origin/add-xxhash32' into add-xxhash32
PointKernel Dec 16, 2024
93e6af0
Revert default hash changes
PointKernel Dec 16, 2024
b26f0c7
Add Python bindings for xxhash32.
bdice Dec 16, 2024
67d9157
Remove unused helper function.
bdice Dec 16, 2024
71bf77b
Update tests.
bdice Dec 17, 2024
1508aad
Make tests pass.
bdice Dec 17, 2024
1010775
Fix xxhash32 implementation to avoid hash_combine steps.
bdice Dec 17, 2024
6fbd870
Fix typo in result value.
bdice Dec 17, 2024
fc35cc5
Merge branch 'branch-25.02' into add-xxhash32
bdice Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,7 @@ add_library(
src/hash/sha256_hash.cu
src/hash/sha384_hash.cu
src/hash/sha512_hash.cu
src/hash/xxhash_32.cu
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/arrow_utilities.cpp
Expand Down
20 changes: 20 additions & 0 deletions cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

// NOTE(review): the implementation hands multi-column combining to the shared row hasher's
// `device_hasher`; confirm that the seed-chaining description below matches what it does.
/**
* @brief Computes the XXHash_32 hash value of each row in the given table
*
* This function computes the hash of each column using the `seed` for the first column
* and the resulting hash as a seed for the next column and so on.
* The result is a uint32 value for each row.
*
* The returned column has `input.num_rows()` rows and is created without a null mask.
* If `input` has no columns or no rows, the column is returned without any hashing being done.
*
* @param input The table of columns to hash
* @param seed Optional seed value to use for the hash function
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> xxhash_32(
table_view const& input,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Computes the XXHash_64 hash value of each row in the given table
*
Expand Down
5 changes: 5 additions & 0 deletions cpp/include/cudf/hashing/detail/hashing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

/**
 * @brief Detail-level XXHash_32 row hasher (no defaulted arguments).
 *
 * The seed is 32 bits wide so that this declaration matches both the public
 * `cudf::hashing::xxhash_32` API and the definition in `src/hash/xxhash_32.cu`.
 * Declaring it with a 64-bit seed would introduce a second overload that is never defined.
 *
 * @param input The table of columns to hash
 * @param seed Seed value to use for the hash function
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 *
 * @return A column where each row is the hash of a row from the input
 */
std::unique_ptr<column> xxhash_32(table_view const& input,
                                  uint32_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr);

std::unique_ptr<column> xxhash_64(table_view const& input,
uint64_t seed,
rmm::cuda_stream_view,
Expand Down
118 changes: 118 additions & 0 deletions cpp/include/cudf/hashing/detail/xxhash_32.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/hashing.hpp>
#include <cudf/hashing/detail/hash_functions.cuh>
#include <cudf/lists/list_view.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/structs/struct_view.hpp>
#include <cudf/types.hpp>

#include <cuco/hash_functions.cuh>
#include <cuda/std/cstddef>

namespace cudf::hashing::detail {

/**
 * @brief Hash functor producing 32-bit values by delegating to `cuco::xxhash_32<Key>`.
 *
 * @tparam Key Type of the value being hashed
 */
template <typename Key>
struct XXHash_32 {
  using result_type = std::uint32_t;

  /// Constructs the functor, forwarding `seed` to the wrapped cuco hasher.
  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}

  /// Hashes `key` by invoking the wrapped cuco hasher on it directly.
  __device__ constexpr result_type operator()(Key const& key) const { return _impl(key); }

  /// Hashes the `size` bytes starting at `bytes` through the wrapped hasher's `compute_hash`.
  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
                                                 std::uint64_t size) const
  {
    return _impl.compute_hash(bytes, size);
  }

 private:
  /// Hashes the object representation of `key`, i.e. exactly `sizeof(Value)` bytes.
  template <typename Value>
  __device__ constexpr result_type compute(Value const& key) const
  {
    auto const* raw = reinterpret_cast<cuda::std::byte const*>(&key);
    return compute_bytes(raw, sizeof(Value));
  }

  cuco::xxhash_32<Key> _impl;  ///< Underlying cuco hasher holding the seed
};

// bool specialization: the value is converted to a uint8_t first, so the hash is taken over
// that single byte rather than over the bool object itself.
template <>
XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
{
  auto const as_byte = static_cast<uint8_t>(key);
  return compute(as_byte);
}

// float specialization: the key goes through normalize_nans_and_zeros() before its bytes are
// hashed (presumably so equivalent NaN / signed-zero encodings hash alike — confirm against
// the helper's definition).
template <>
XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
{
  auto const normalized = normalize_nans_and_zeros(key);
  return compute(normalized);
}

// double specialization: the key goes through normalize_nans_and_zeros() before its bytes are
// hashed (presumably so equivalent NaN / signed-zero encodings hash alike — confirm against
// the helper's definition).
template <>
XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
  double const& key) const
{
  auto const normalized = normalize_nans_and_zeros(key);
  return compute(normalized);
}

// string_view specialization: hashes the `key.size_bytes()` bytes starting at `key.data()`.
template <>
XXHash_32<cudf::string_view>::result_type
  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
{
  auto const* chars = reinterpret_cast<cuda::std::byte const*>(key.data());
  return compute_bytes(chars, key.size_bytes());
}

// decimal32 specialization: only the result of `key.value()` is fed to the hasher.
template <>
XXHash_32<numeric::decimal32>::result_type
  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
{
  auto const stored = key.value();
  return compute(stored);
}

// decimal64 specialization: only the result of `key.value()` is fed to the hasher.
template <>
XXHash_32<numeric::decimal64>::result_type
  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
{
  auto const stored = key.value();
  return compute(stored);
}

// decimal128 specialization: only the result of `key.value()` is fed to the hasher.
template <>
XXHash_32<numeric::decimal128>::result_type
  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
{
  auto const stored = key.value();
  return compute(stored);
}

template <>
hash_value_type __device__ inline XXHash_32<cudf::list_view>::operator()(
cudf::list_view const& key) const
{
CUDF_UNREACHABLE("List column hashing is not supported");
}

template <>
hash_value_type __device__ inline XXHash_32<cudf::struct_view>::operator()(
cudf::struct_view const& key) const
{
CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
}

} // namespace cudf::hashing::detail
71 changes: 71 additions & 0 deletions cpp/src/hash/xxhash_32.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/hashing/detail/xxhash_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/tabulate.h>

namespace cudf::hashing {
namespace detail {

/**
 * @brief Builds a column of XXHash_32 row hashes for `input`.
 *
 * The result column always has `input.num_rows()` rows of `hash_value_type` and no null mask.
 * When the table has no columns or no rows, that column is returned as allocated and no
 * hashing work is launched.
 *
 * @param input The table of columns to hash
 * @param seed Seed value passed to the row hasher
 * @param stream CUDA stream used for the allocation and the hashing launch
 * @param mr Device memory resource used to allocate the returned column
 *
 * @return Column holding one hash value per input row
 */
std::unique_ptr<column> xxhash_32(table_view const& input,
                                  uint32_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
{
  auto result = make_numeric_column(
    data_type(type_to_id<hash_value_type>()), input.num_rows(), mask_state::UNALLOCATED, stream, mr);

  bool const nothing_to_hash = (input.num_columns() == 0) || (input.num_rows() == 0);
  if (!nothing_to_hash) {
    bool const input_has_nulls = has_nulls(input);
    auto const hasher          = cudf::experimental::row::hash::row_hasher(input, stream);
    auto hashes                = result->mutable_view();

    // Fill every output slot with the hash of the row at the same index
    thrust::tabulate(rmm::exec_policy(stream),
                     hashes.begin<hash_value_type>(),
                     hashes.end<hash_value_type>(),
                     hasher.device_hasher<XXHash_32>(input_has_nulls, seed));
  }

  return result;
}

} // namespace detail

// Public entry point: opens the function's profiling range and forwards all arguments
// unchanged to the detail implementation.
std::unique_ptr<column> xxhash_32(table_view const& input,
                                  uint32_t seed,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto hashed = detail::xxhash_32(input, seed, stream, mr);
  return hashed;
}

} // namespace cudf::hashing
1 change: 1 addition & 0 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/io/orc_types.hpp>
#include <cudf/table/experimental/row_operators.cuh>

Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>

#include <rmm/exec_policy.hpp>
Expand Down
1 change: 1 addition & 0 deletions cpp/src/join/join_common_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/utilities/memory_resource.hpp>

Expand Down
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ ConfigureTest(
hashing/sha256_test.cpp
hashing/sha384_test.cpp
hashing/sha512_test.cpp
hashing/xxhash_32_test.cpp
hashing/xxhash_64_test.cpp
)

Expand Down
67 changes: 67 additions & 0 deletions cpp/tests/hashing/xxhash_32_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/type_lists.hpp>

#include <cudf/hashing.hpp>

// Fixture shared by all XXHash_32 tests; adds nothing beyond the cudf base fixture.
struct XXHash_32_Test : public cudf::test::BaseFixture {};

// Hashes a single int32 column with seed 0 and compares against fixed reference values.
TEST_F(XXHash_32_Test, TestInteger)
{
  auto constexpr hash_seed = 0u;
  auto values              = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};

  auto const hashed = cudf::hashing::xxhash_32(cudf::table_view({values}), hash_seed);

  // Expected results were generated with the reference implementation:
  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
  auto reference =
    cudf::test::fixed_width_column_wrapper<uint32_t>({2802733858u, 3816402826u, 3721130582u});

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed->view(), reference);
}

// Hashes a single double column with seed 42 and compares against fixed reference values.
TEST_F(XXHash_32_Test, TestDouble)
{
  auto constexpr hash_seed = 42u;
  auto values              = cudf::test::fixed_width_column_wrapper<double>{{-8., 25., 90.}};

  auto const hashed = cudf::hashing::xxhash_32(cudf::table_view({values}), hash_seed);

  // Expected results were generated with the reference implementation:
  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
  auto reference =
    cudf::test::fixed_width_column_wrapper<uint32_t>({635906976u, 1479683640u, 1813668619u});

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed->view(), reference);
}

// Hashes a single strings column with seed 825 and compares against fixed reference values.
TEST_F(XXHash_32_Test, StringType)
{
  auto constexpr hash_seed = 825u;
  auto values              = cudf::test::strings_column_wrapper({"I", "am", "AI"});

  auto hashed = cudf::hashing::xxhash_32(cudf::table_view({values}), hash_seed);

  // Expected results were generated with the reference implementation:
  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
  auto reference =
    cudf::test::fixed_width_column_wrapper<uint32_t>({2975112264u, 4267142293u, 4063988593u});

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed->view(), reference);
}
16 changes: 12 additions & 4 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2837,16 +2837,22 @@ def hash_values(

Parameters
----------
method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3'
Hash function to use:

* murmur3: MurmurHash3 hash function
* md5: MD5 hash function
* xxhash32: xxHash32 hash function
* xxhash64: xxHash64 hash function
* md5: MD5 hash function
* sha1: SHA-1 hash function
* sha224: SHA-224 hash function
* sha256: SHA-256 hash function
* sha384: SHA-384 hash function
* sha512: SHA-512 hash function

seed : int, optional
Seed value to use for the hash function. This parameter is only
supported for 'murmur3' and 'xxhash64'.
supported for 'murmur3', 'xxhash32', and 'xxhash64'.


Returns
Expand Down Expand Up @@ -2901,7 +2907,7 @@ def hash_values(
2 fe061786ea286a515b772d91b0dfcd70
dtype: object
"""
seed_hash_methods = {"murmur3", "xxhash64"}
seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"}
if seed is None:
seed = 0
elif method not in seed_hash_methods:
Expand All @@ -2915,6 +2921,8 @@ def hash_values(
)
if method == "murmur3":
plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed)
elif method == "xxhash32":
plc_column = plc.hashing.xxhash_32(plc_table, seed)
elif method == "xxhash64":
plc_column = plc.hashing.xxhash_64(plc_table, seed)
elif method == "md5":
Expand Down
Loading
Loading