Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into exact_stats
Browse files Browse the repository at this point in the history
  • Loading branch information
ttnghia authored Apr 26, 2024
2 parents 299a29d + c62c5f6 commit 4e980bb
Show file tree
Hide file tree
Showing 19 changed files with 712 additions and 345 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
}

// longer pattern lengths demand more working memory per string
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};

static void bench_contains(nvbench::state& state)
{
Expand Down Expand Up @@ -114,4 +114,4 @@ NVBENCH_BENCH(bench_contains)
.add_int64_axis("row_width", {32, 64, 128, 256, 512})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("hit_rate", {50, 100}) // percentage
.add_int64_axis("pattern", {0, 1});
.add_int64_axis("pattern", {0, 1, 2});
12 changes: 8 additions & 4 deletions cpp/benchmarks/string/count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@

#include <nvbench/nvbench.cuh>

static std::string patterns[] = {"\\d+", "a"};

static void bench_count(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
Expand All @@ -41,7 +44,7 @@ static void bench_count(nvbench::state& state)
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));

std::string pattern = "\\d+";
auto const pattern = patterns[pattern_index];

auto prog = cudf::strings::regex_program::create(pattern);

Expand All @@ -59,4 +62,5 @@ static void bench_count(nvbench::state& state)
NVBENCH_BENCH(bench_count)
.set_name("count")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("pattern", {0, 1});
37 changes: 27 additions & 10 deletions cpp/include/cudf_test/testing_main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
}
}

/**
* @brief Creates the memory resource selected by the `rmm_mode` option and
* installs it as the current device resource
*
* Reads the `rmm_mode` command line option, passes it to
* cudf::test::create_memory_resource, and unconditionally sets the result as
* the current device resource.
*
* The caller must keep the return object alive for the life of the test runs.
*
* @param cmd_opts Command line options returned by parse_cudf_test_opts
* @return The object returned by cudf::test::create_memory_resource
*/
inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
{
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
auto resource = cudf::test::create_memory_resource(rmm_mode);
// Only the raw pointer from get() is handed to RMM here, which is why the
// returned object has to outlive every use of the current device resource.
rmm::mr::set_current_device_resource(resource.get());
return resource;
}

/**
* @brief Sets up stream mode memory resource adaptor
*
Expand Down Expand Up @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
* function parses the command line to customize test behavior, like the
* allocation mode used for creating the default memory resource.
*/
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); \
auto resource = cudf::test::create_memory_resource(rmm_mode); \
rmm::mr::set_current_device_resource(resource.get()); \
auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
[[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \
[[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
}
19 changes: 14 additions & 5 deletions cpp/src/strings/regex/regex.inl
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,15 @@ __device__ __forceinline__ reprog_device reprog_device::load(reprog_device const
: reinterpret_cast<reprog_device*>(buffer)[0];
}

/**
 * @brief Advances an iterator to the next occurrence of a character.
 *
 * Starting at `itr`, steps forward one character at a time until the
 * character under the iterator equals `chr` or the iterator's byte offset
 * reaches the byte size of `d_str`.
 *
 * @param chr Character to search for
 * @param d_str String being searched; supplies the end-of-data byte bound
 * @param itr Iterator marking where the search begins
 * @return Iterator at the first match, or one whose byte offset is not less
 *         than `d_str.size_bytes()` when no match was found
 */
__device__ __forceinline__ static string_view::const_iterator find_char(
  cudf::char_utf8 chr, string_view const d_str, string_view::const_iterator itr)
{
  for (; itr.byte_offset() < d_str.size_bytes(); ++itr) {
    if (*itr == chr) { break; }
  }
  return itr;
}

/**
* @brief Evaluate a specific string against regex pattern compiled to this instance.
*
Expand Down Expand Up @@ -253,16 +262,16 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
case BOL:
if (pos == 0) break;
if (jnk.startchar != '^') { return thrust::nullopt; }
--pos;
--itr;
startchar = static_cast<char_utf8>('\n');
case CHAR: {
auto const fidx = dstr.find(startchar, pos);
if (fidx == string_view::npos) { return thrust::nullopt; }
pos = fidx + (jnk.starttype == BOL);
auto const find_itr = find_char(startchar, dstr, itr);
if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; }
itr = find_itr + (jnk.starttype == BOL);
pos = itr.position();
break;
}
}
itr += (pos - itr.position()); // faster to increment position
}

if (((eos < 0) || (pos < eos)) && match == 0) {
Expand Down
9 changes: 9 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,15 @@ ConfigureTest(
strings/urls_tests.cpp
)

# ##################################################################################################
# * large strings test ----------------------------------------------------------------------------
ConfigureTest(
LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp
large_strings/concatenate_tests.cpp
GPUS 1
PERCENT 100
)

# ##################################################################################################
# * json path test --------------------------------------------------------------------------------
ConfigureTest(JSON_PATH_TEST json/json_tests.cpp)
Expand Down
43 changes: 0 additions & 43 deletions cpp/tests/copying/concatenate_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
}

// Concatenates ten views of the same 250MB strings column (about 2.5GB in
// total) and checks the row count and that the result's offsets column is
// INT64. It then concatenates the regular input with that large result and
// repeats the same checks.
TEST_F(StringColumnTest, ConcatenateLargeStrings)
{
// NOTE(review): macro is defined elsewhere; presumably turns on large-strings
// support for the duration of this test -- confirm against its definition.
CUDF_TEST_ENABLE_LARGE_STRINGS();
auto itr = thrust::constant_iterator<std::string_view>(
"abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY"); // 50 bytes
auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000); // 250MB
auto view = cudf::column_view(input);
std::vector<cudf::column_view> input_cols;
std::vector<cudf::size_type> splits;
int const multiplier = 10;
// Each iteration adds one more copy of the input and records the row index
// at which that copy ends.
for (int i = 0; i < multiplier; ++i) { // 2500MB > 2GB
input_cols.push_back(view);
splits.push_back(view.size() * (i + 1));
}
// The last recorded value equals the total row count, so it is not an
// interior split point.
splits.pop_back(); // remove last entry
auto result = cudf::concatenate(input_cols);
auto sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), view.size() * multiplier);
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});

// verify results in sections
// Every slice between consecutive split points should match the original input.
auto sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}

// also test with large strings column as input
{
input_cols.clear();
input_cols.push_back(input); // regular column
input_cols.push_back(result->view()); // large column
result = cudf::concatenate(input_cols);
sv = cudf::strings_column_view(result->view());
EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
// The result now holds multiplier + 1 copies of the input, so one more
// split point is needed to cut it into input-sized sections.
splits.push_back(view.size() * multiplier);
sliced = cudf::split(result->view(), splits);
for (auto c : sliced) {
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
}
}
}

struct TableTest : public cudf::test::BaseFixture {};

TEST_F(TableTest, ConcatenateTables)
Expand Down
Loading

0 comments on commit 4e980bb

Please sign in to comment.