Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Reduction] Fix identityless reductions with unwritten reducers #8709

872 changes: 549 additions & 323 deletions sycl/include/sycl/reduction.hpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions sycl/test-e2e/Reduction/reduction_big_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,9 @@ int test(queue &Q, T Identity) {

// Initialize.
BinaryOperation BOp;
T CorrectOut;
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
T CorrectOut = *CorrectOutOpt;

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/Reduction/reduction_ctor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void test_reducer(Reduction &Redu, T A, T B) {

typename Reduction::binary_operation BOp;
T ExpectedValue = BOp(A, B);
assert(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0) &&
assert(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0) &&
"Wrong result of binary operation.");
assert(
toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
Expand All @@ -40,7 +40,7 @@ void test_reducer(Reduction &Redu, T Identity, BinaryOperation BOp, T A, T B) {

T ExpectedValue = BOp(A, B);
assert(
toBool(ExpectedValue == detail::ReducerAccess{Reducer}.getElement(0)) &&
toBool(ExpectedValue == *detail::ReducerAccess{Reducer}.getElement(0)) &&
"Wrong result of binary operation.");
assert(
toBool(Reducer.identity() == Redu.getIdentityContainer().getIdentity()) &&
Expand Down
4 changes: 3 additions & 1 deletion sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ struct Red {
}

void init() {
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
CorrectOut = *CorrectOutOpt;
if (!PropList.template has_property<
property::reduction::initialize_to_identity>())
CorrectOut = BOp(CorrectOut, InitVal);
Expand Down
66 changes: 66 additions & 0 deletions sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out
//
// Group algorithms are not supported on Nvidia.
// XFAIL: hip_nvidia

// This test performs basic checks of parallel_for(nd_range, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t WGSize,
size_t NWItems) {
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
NumErrors += test<SkipEvenName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(Q, Identity, Init, BOp, NDRange,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);

// Check some non power-of-two work-group sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1, 7);
tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 49, 49 * 5);

// Try some power-of-two work-group sizes.
tests<class B1, int>(Q, 0, 99, std::plus<>{}, 1, 32);
tests<class B2, int>(Q, 1, 99, std::multiplies<>{}, 4, 32);
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, 8, 128);
tests<class B4, int>(Q, 0, 99, std::bit_xor<>{}, 16, 256);
tests<class B5, int>(Q, ~0, 99, std::bit_and<>{}, 32, 256);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, 64, 256);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 128, 256);
tests<class B8, int>(Q, 0, 99, std::plus<>{}, 256, 256);

// Check with various types.
tests<class C1, float>(Q, 1, 99, std::multiplies<>{}, 8, 24);
tests<class C2, short>(Q, 0x7fff, -99, ext::oneapi::minimum<>{}, 8, 256);
tests<class C3, unsigned char>(Q, 0, 99, ext::oneapi::maximum<>{}, 8, 256);

// Check with CUSTOM type.
using CV = CustomVec<long long>;
tests<class D1, CV>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 8, 256);

printFinalStatus(NumErrors);
return NumErrors;
}
65 changes: 65 additions & 0 deletions sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

constexpr access::mode RW = access::mode::read_write;
// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, range<1>(1));
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<1>(2));
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<1>(7));
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<1>(64));
tests<class A5, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2));
tests<class A6, int>(Q, 0, 99, std::plus<>{}, range<1>(MaxWGSize * 2 + 5));

// Check with CUSTOM type.
tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<1>(256));
tests<class B2, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<1>(MaxWGSize * 3));
tests<class B3, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<1>(72));

// Check with identityless operations.
tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(1));
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(2));
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(7));
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{}, range<1>(64));
tests<class C5, int>(Q, 99, PlusWithoutIdentity<int>{},
range<1>(MaxWGSize * 2));
tests<class C6, int>(Q, 99, PlusWithoutIdentity<int>{},
range<1>(MaxWGSize * 2 + 5));

printFinalStatus(NumErrors);
return NumErrors;
}
69 changes: 69 additions & 0 deletions sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not suport atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<2>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});

tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<2>{33, MaxWGSize});
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<2>{33, MaxWGSize});

tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{1, 1});
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 2});
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<2>{2, 3});
tests<class C4, int>(Q, 99, PlusWithoutIdentity<int>{},
range<2>{MaxWGSize, 1});
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{1, MaxWGSize});
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{2, MaxWGSize * 2});
tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{MaxWGSize * 3, 7});
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<2>{3, MaxWGSize * 3});

printFinalStatus(NumErrors);
return NumErrors;
}
23 changes: 21 additions & 2 deletions sycl/test-e2e/Reduction/reduction_range_3d_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,12 @@ int main() {
tests<class D2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
tests<class D3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});

/* Temporarily disabled
tests<class D4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class D5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class D6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});
*/

tests<class D7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
Expand All @@ -83,6 +81,27 @@ int main() {
tests<class D9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

tests<class E1, int>(Q, 99, MultipliesWithoutIdentity<int>{},
range<3>{1, 1, 1});
tests<class E2, int>(Q, 99, MultipliesWithoutIdentity<int>{},
range<3>{2, 2, 2});
tests<class E3, int>(Q, 99, MultipliesWithoutIdentity<int>{},
range<3>{2, 3, 4});

tests<class E4, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class E5, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class E6, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class E7, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class E8, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class E9, int64_t>(Q, 99, MultipliesWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

printFinalStatus(NumErrors);
return NumErrors;
}
82 changes: 82 additions & 0 deletions sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not suport atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<3>, reduction, func)
// with reductions initialized with a one element buffer. Additionally, some
// reducers will not be written to.

#include "reduction_utils.hpp"

using namespace sycl;

int NumErrors = 0;

template <typename T> class SkipEvenName;
template <typename T> class SkipOddName;
template <typename T> class SkipAllName;

template <typename Name, typename T, typename... ArgTys>
void tests(ArgTys &&...Args) {
NumErrors += test<SkipEvenName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipEvenOp{});
NumErrors += test<SkipOddName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipOddOp{});
NumErrors += test<SkipAllName<Name>, T>(std::forward<ArgTys>(Args)...,
property_list{}, SkipAllOp{});
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<3>{1, 1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<3>{2, 3, 4});

tests<class A4, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class A7, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class A9, int64_t>(Q, 0, 99, std::plus<>{},
range<3>{MaxWGSize * 3, 8, 4});

tests<class B1, CustomVec<long long>>(Q, 0, 99, CustomVecPlus<long long>{},
range<3>{2, 33, MaxWGSize});
tests<class B2, CustomVec<long long>>(Q, 99, CustomVecPlus<long long>{},
range<3>{2, 33, MaxWGSize});

tests<class C1, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{1, 1, 1});
tests<class C2, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 2, 2});
tests<class C3, int>(Q, 99, PlusWithoutIdentity<int>{}, range<3>{2, 3, 4});

tests<class C4, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, 1, MaxWGSize + 1});
tests<class C5, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{1, MaxWGSize + 1, 1});
tests<class C6, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize + 1, 1, 1});

tests<class C7, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{2, 5, MaxWGSize * 2});
tests<class C8, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{3, MaxWGSize * 3, 2});
tests<class C9, int64_t>(Q, 99, PlusWithoutIdentity<int64_t>{},
range<3>{MaxWGSize * 3, 8, 4});

printFinalStatus(NumErrors);
return NumErrors;
}
4 changes: 3 additions & 1 deletion sycl/test-e2e/Reduction/reduction_range_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ struct Red {
}

void init() {
initInputData(InBuf, CorrectOut, BOp, NWorkItems);
std::optional<T> CorrectOutOpt;
initInputData(InBuf, CorrectOutOpt, BOp, NWorkItems);
CorrectOut = *CorrectOutOpt;
if (!PropList.template has_property<
property::reduction::initialize_to_identity>())
CorrectOut = BOp(CorrectOut, InitVal);
Expand Down
6 changes: 3 additions & 3 deletions sycl/test-e2e/Reduction/reduction_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ int test(queue &Q, OptionalIdentity<T, HasIdentity> Identity, T Init,
}

// Initialize.
T CorrectOut;
std::optional<T> CorrectOutOpt;
BinaryOperation BOp;

buffer<T, 1> InBuf(NWItems);
initInputData(InBuf, CorrectOut, BOp, NWItems);
CorrectOut = BOp(CorrectOut, Init);
initInputData(InBuf, CorrectOutOpt, BOp, NWItems);
T CorrectOut = BOp(*CorrectOutOpt, Init);

// Compute.
Q.submit([&](handler &CGH) {
Expand Down
Loading