Skip to content

Commit

Permalink
Large strings support in cudf::concatenate (#15195)
Browse files Browse the repository at this point in the history
Enables `cudf::concatenate` to create and return a large strings column (offsets are INT64).

This also introduces the `LIBCUDF_LARGE_STRINGS_ENABLED` environment variable and utilities around it.
One internal utility checks the value so appropriate logic can either throw an overflow exception or build INT64 offsets as appropriate. 

The `cudf::test::large_strings_enabler` is introduced to set/unset the env var for individual tests are needed.
A follow on PR will attempt to consolidate these kinds of tests with a specialized test fixture using this utility class.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #15195
  • Loading branch information
davidwendt authored Apr 4, 2024
1 parent d7b8fc4 commit 4e44d5d
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 38 deletions.
27 changes: 27 additions & 0 deletions cpp/include/cudf/strings/detail/utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,24 @@ namespace cudf {
namespace strings {
namespace detail {

/**
* @brief Create an offsets column to be a child of a strings column
*
* This will return the properly typed column to be filled in by the caller
* given the number of bytes to address.
*
* @param chars_bytes Number of bytes for the chars in the strings column
* @param count Number of elements for the offsets column.
* This is the number of rows in the parent strings column +1.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return The offsets child column for a strings column
*/
std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
size_type count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Creates a string_view vector from a strings column.
*
Expand All @@ -52,6 +70,15 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
*/
int64_t get_offset64_threshold();

/**
* @brief Checks if large strings is enabled
*
* This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
*
* @return true if large strings are supported
*/
bool is_large_strings_enabled();

/**
* @brief Return a normalized offset value from a strings offsets column
*
Expand Down
25 changes: 25 additions & 0 deletions cpp/include/cudf_test/column_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ template <>
std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c);
//! @endcond

/**
* @brief For enabling large strings testing in specific tests
*/
struct large_strings_enabler {
/**
* @brief Create large strings enable object
*
* @param default_enable Default enables large strings support
*/
large_strings_enabler(bool default_enable = true);
~large_strings_enabler();

/**
* @brief Enable large strings support
*/
void enable();

/**
* @brief Disable large strings support
*/
void disable();
};

} // namespace cudf::test

// Macros for showing line of failure.
Expand Down Expand Up @@ -242,3 +265,5 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
SCOPED_TRACE(" <-- line of failure\n"); \
cudf::test::detail::expect_equal_buffers(lhs, rhs, size_bytes); \
} while (0)

#define CUDF_TEST_ENABLE_LARGE_STRINGS() cudf::test::large_strings_enabler ls___
6 changes: 1 addition & 5 deletions cpp/src/strings/copying/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,6 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total number of strings exceeds the column size limit",
std::overflow_error);
CUDF_EXPECTS(total_bytes <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings exceeds the column size limit",
std::overflow_error);

bool const has_nulls =
std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
Expand All @@ -232,8 +229,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
auto d_new_chars = output_chars.data();

// create output offsets column
auto offsets_column = make_numeric_column(
data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr);
auto offsets_column = create_offsets_child_column(total_bytes, offsets_count, stream, mr);
auto itr_new_offsets =
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());

Expand Down
35 changes: 33 additions & 2 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/detail/get_value.cuh>
#include <cudf/strings/detail/char_tables.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand All @@ -31,6 +32,9 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

#include <cstdlib>
#include <string>

namespace cudf {
namespace strings {
namespace detail {
Expand Down Expand Up @@ -65,6 +69,27 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
return strings_vector;
}

/**
* @copydoc cudf::strings::detail::create_offsets_child_column
*/
std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
size_type count,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto const threshold = get_offset64_threshold();
if (!is_large_strings_enabled()) {
CUDF_EXPECTS(
chars_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error);
}
return make_numeric_column(
chars_bytes < threshold ? data_type{type_id::INT32} : data_type{type_id::INT64},
count,
mask_state::UNALLOCATED,
stream,
mr);
}

namespace {
// The device variables are created here to avoid using a singleton that may cause issues
// with RMM initialize/finalize. See PR #3159 for details on this approach.
Expand Down Expand Up @@ -123,13 +148,19 @@ special_case_mapping const* get_special_case_mapping_table()

int64_t get_offset64_threshold()
{
auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0;
auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
int64_t const rtn = threshold != nullptr ? std::atol(threshold) : 0L;
return (rtn > 0 && rtn < std::numeric_limits<int32_t>::max())
? rtn
: std::numeric_limits<int32_t>::max();
}

bool is_large_strings_enabled()
{
auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
return env != nullptr && std::string(env) == "1";
}

int64_t get_offset_value(cudf::column_view const& offsets,
size_type index,
rmm::cuda_stream_view stream)
Expand Down
76 changes: 45 additions & 31 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <thrust/iterator/constant_iterator.h>

#include <numeric>
#include <stdexcept>
#include <string>
Expand Down Expand Up @@ -164,37 +166,6 @@ TEST_F(StringColumnTest, ConcatenateColumnView)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateColumnViewLarge)
{
// Test large concatenate, causes out of bound device memory errors if kernel
// indexing is not int64_t.
// 1.5GB bytes, 5k columns
constexpr size_t num_strings = 10000;
constexpr size_t string_length = 150000;
constexpr size_t strings_per_column = 2;
constexpr size_t num_columns = num_strings / strings_per_column;

std::vector<std::string> strings;
std::vector<char const*> h_strings;
std::vector<cudf::test::strings_column_wrapper> strings_column_wrappers;
std::vector<cudf::column_view> strings_columns;

std::string s(string_length, 'a');
for (size_t i = 0; i < num_strings; ++i)
h_strings.push_back(s.data());

for (size_t i = 0; i < num_columns; ++i)
strings_column_wrappers.push_back(cudf::test::strings_column_wrapper(
h_strings.data() + i * strings_per_column, h_strings.data() + (i + 1) * strings_per_column));
for (auto& wrapper : strings_column_wrappers)
strings_columns.push_back(wrapper);

auto results = cudf::concatenate(strings_columns);

cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end());
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(StringColumnTest, ConcatenateManyColumns)
{
std::vector<char const*> h_strings{
Expand Down Expand Up @@ -226,6 +197,49 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

TEST_F(StringColumnTest, ConcatenateLargeStrings)
{
CUDF_TEST_ENABLE_LARGE_STRINGS();
auto itr = thrust::constant_iterator<std::string_view>(
"abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes
auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB
auto view = cudf::column_view(input);
std::vector<cudf::column_view> input_cols;
std::vector<cudf::size_type> splits;
int const multiplier = 10;
for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB
input_cols.push_back(view);
splits.push_back(view.size() * (i + 1));
}
splits.pop_back(); // remove last entry
auto result = cudf::concatenate(input_cols);
auto sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), view.size() * multiplier);
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

// verify results in sections
auto sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}

// also test with large strings column as input
{
input_cols.clear();
input_cols.push_back(input); // regular column
input_cols.push_back(result->view()); // large column
result = cudf::concatenate(input_cols);
sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
splits.push_back(view.size() * multiplier);
sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}
}
}

struct TableTest : public cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down
11 changes: 11 additions & 0 deletions cpp/tests/utilities/column_utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1011,5 +1011,16 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
return {std::move(host_data), bitmask_to_host(c)};
}

large_strings_enabler::large_strings_enabler(bool default_enable)
{
default_enable ? enable() : disable();
}

large_strings_enabler::~large_strings_enabler() { disable(); }

void large_strings_enabler::enable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); }

void large_strings_enabler::disable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "0", 1); }

} // namespace test
} // namespace cudf

0 comments on commit 4e44d5d

Please sign in to comment.