Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add test cases for SYCL-2020 reductions used in read_write mode #170

Merged
merged 5 commits into from
Mar 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 101 additions & 54 deletions SYCL/Reduction/reduction_nd_N_vars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,14 @@
#include <numeric>
#include <string>

template <typename... Ts> class KernelNameGroup;

using namespace cl::sycl;

template <typename... Ts> class KNameGroup;
template <typename T, bool B> class KName;

constexpr access::mode RW = access::mode::read_write;
constexpr access::mode DW = access::mode::discard_write;

template <typename T>
bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
bool Success;
Expand All @@ -46,11 +50,12 @@ bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
return Success;
}

template <class ReductionExample, typename T1, access::mode Mode1, typename T2,
access::mode Mode2, typename T3, access::mode Mode3, typename T4,
class BinaryOperation1, class BinaryOperation2,
// Returns 0 if the test case passed. Otherwise, some non-zero value.
template <class Name, bool IsSYCL2020Mode, typename T1, access::mode Mode1,
typename T2, access::mode Mode2, typename T3, access::mode Mode3,
typename T4, class BinaryOperation1, class BinaryOperation2,
class BinaryOperation3, class BinaryOperation4>
int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
int testOne(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
Expand All @@ -67,16 +72,16 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
auto Dev = Q.get_device();
if (AllocType4 == usm::alloc::shared &&
!Dev.get_info<info::device::usm_shared_allocations>())
return 4;
return 0;
if (AllocType4 == usm::alloc::host &&
!Dev.get_info<info::device::usm_host_allocations>())
return 4;
return 0;
if (AllocType4 == usm::alloc::device &&
!Dev.get_info<info::device::usm_device_allocations>())
return 4;
return 0;
T4 *Out4 = (T4 *)malloc(sizeof(T4), Dev, Q.get_context(), AllocType4);
if (Out4 == nullptr)
return 4;
return 1;

// Initialize the arrays with sentinel values
// and pre-compute the expected result 'CorrectOut'.
Expand Down Expand Up @@ -109,48 +114,65 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,

if (AllocType4 == usm::alloc::device) {
Q.submit([&](handler &CGH) {
CGH.single_task<
KernelNameGroup<ReductionExample, class KernelNameUSM4>>(
CGH.single_task<KNameGroup<Name, class KernelNameUSM4>>(
[=]() { *Out4 = InitVal4; });
}).wait();
} else {
*Out4 = InitVal4;
}
}

// The main code to be tested.
Q.submit([&](handler &CGH) {
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);

auto Out1 = OutBuf1.template get_access<Mode1>(CGH);
auto Out2 = OutBuf2.template get_access<Mode2>(CGH);
accessor<T3, 0, Mode3, access::target::global_buffer> Out3(OutBuf3, CGH);

auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
auto &Sum4) {
size_t I = NDIt.get_global_id(0);
Sum1.combine(In1[I]);
Sum2.combine(In2[I]);
Sum3.combine(In3[I]);
Sum4.combine(In4[I]);
};

auto Redu1 =
ONEAPI::reduction<T1, BinaryOperation1>(Out1, IdentityVal1, BOp1);
auto Redu2 =
ONEAPI::reduction<T2, BinaryOperation2>(Out2, IdentityVal2, BOp2);
auto Redu3 =
ONEAPI::reduction<T3, BinaryOperation3>(Out3, IdentityVal3, BOp3);
auto Redu4 =
ONEAPI::reduction<T4, BinaryOperation4>(Out4, IdentityVal4, BOp4);

auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}};
CGH.parallel_for<ReductionExample>(NDR, Redu1, Redu2, Redu3, Redu4,
Lambda);
}).wait();
auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}};
if constexpr (IsSYCL2020Mode) {
Q.submit([&](handler &CGH) {
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);

auto Redu1 = sycl::reduction(OutBuf1, CGH, IdentityVal1, BOp1);
auto Redu2 = sycl::reduction(OutBuf2, CGH, IdentityVal2, BOp2);
auto Redu3 = sycl::reduction(OutBuf3, CGH, IdentityVal3, BOp3);
auto Redu4 = sycl::reduction(Out4, IdentityVal4, BOp4);

auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
auto &Sum4) {
size_t I = NDIt.get_global_id(0);
Sum1.combine(In1[I]);
Sum2.combine(In2[I]);
Sum3.combine(In3[I]);
Sum4.combine(In4[I]);
};
CGH.parallel_for<Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
}).wait();
} else {
// Test ONEAPI reductions
Q.submit([&](handler &CGH) {
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);

auto Out1 = OutBuf1.template get_access<Mode1>(CGH);
auto Out2 = OutBuf2.template get_access<Mode2>(CGH);
accessor<T3, 0, Mode3, access::target::global_buffer> Out3(OutBuf3, CGH);

auto Redu1 = ONEAPI::reduction(Out1, IdentityVal1, BOp1);
auto Redu2 = ONEAPI::reduction(Out2, IdentityVal2, BOp2);
auto Redu3 = ONEAPI::reduction(Out3, IdentityVal3, BOp3);
auto Redu4 = ONEAPI::reduction(Out4, IdentityVal4, BOp4);

auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
auto &Sum4) {
size_t I = NDIt.get_global_id(0);
Sum1.combine(In1[I]);
Sum2.combine(In2[I]);
Sum3.combine(In3[I]);
Sum4.combine(In4[I]);
};
CGH.parallel_for<Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
}).wait();
}

// Check the results and free memory.
int Error = 0;
Expand Down Expand Up @@ -185,18 +207,43 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
return Error;
}

int main() {
// Tests both implementations of reduction:
// sycl::reduction and sycl::ONEAPI::reduction
template <class Name, typename T1, access::mode Mode1, typename T2,
access::mode Mode2, typename T3, access::mode Mode3, typename T4,
class BinaryOperation1, class BinaryOperation2,
class BinaryOperation3, class BinaryOperation4>
int testBoth(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) {
int Error =
runTest<class ReduFloatPlus16x1, float, access::mode::discard_write, int,
access::mode::read_write, short, access::mode::read_write, int>(
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16,
16);
testOne<KName<Name, false>, false, T1, Mode1, T2, Mode2, T3, Mode3, T4>(
IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
AllocType4, NWorkItems, WGSize);

// TODO: property::reduction::initialize_to_identity is not supported yet.
// Thus only read_write mode is tested now.
constexpr access::mode _Mode1 = (Mode1 == DW) ? RW : Mode1;
constexpr access::mode _Mode2 = (Mode2 == DW) ? RW : Mode2;
constexpr access::mode _Mode3 = (Mode3 == DW) ? RW : Mode3;
Error +=
testOne<KName<Name, true>, true, T1, _Mode1, T2, _Mode2, T3, _Mode3, T4>(
IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
AllocType4, NWorkItems, WGSize);
return Error;
}

int main() {
int Error = testBoth<class FP32Plus16x16, float, DW, int, RW, short, RW, int>(
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16, 16);

auto Add = [](auto x, auto y) { return (x + y); };
Error += runTest<class ReduFloatPlus5x257, float, access::mode::read_write,
int, access::mode::read_write, short,
access::mode::discard_write, int>(
Error += testBoth<class FP32Plus5x257, float, RW, int, RW, short, DW, int>(
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0,
8000, std::bit_xor<int>{}, usm::alloc::device, 5 * (256 + 1), 5);

Expand Down
81 changes: 49 additions & 32 deletions SYCL/Reduction/reduction_nd_ext_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@

using namespace cl::sycl;

template <typename SpecializationKernelName, typename T, int Dim,
template <typename T, bool B> class KName;
constexpr access::mode RW = access::mode::read_write;
constexpr access::mode DW = access::mode::discard_write;

template <typename Name, bool IsSYCL2020Mode, typename T, int Dim,
access::mode Mode, class BinaryOperation>
void test(T Identity, size_t WGSize, size_t NWItems) {
buffer<T, 1> InBuf(NWItems);
Expand All @@ -24,19 +28,27 @@ void test(T Identity, size_t WGSize, size_t NWItems) {

// Compute.
queue Q;
Q.submit([&](handler &CGH) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
accessor<T, Dim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
auto Redu = ONEAPI::reduction(Out, Identity, BOp);

range<1> GlobalRange(NWItems);
range<1> LocalRange(WGSize);
nd_range<1> NDRange(GlobalRange, LocalRange);
CGH.parallel_for<SpecializationKernelName>(
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
Sum.combine(In[NDIt.get_global_linear_id()]);
});
});
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
if constexpr (IsSYCL2020Mode) {
Q.submit([&](handler &CGH) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp);

CGH.parallel_for<Name>(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
Sum.combine(In[NDIt.get_global_linear_id()]);
});
});
} else {
Q.submit([&](handler &CGH) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
accessor<T, Dim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
auto Redu = ONEAPI::reduction(Out, Identity, BOp);

CGH.parallel_for<Name>(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
Sum.combine(In[NDIt.get_global_linear_id()]);
});
});
}

// Check correctness.
auto Out = OutBuf.template get_access<access::mode::read>();
Expand All @@ -52,6 +64,19 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
}
}

template <typename Name, typename T, int Dim, access::mode Mode,
class BinaryOperation>
void testBoth(T Identity, size_t WGSize, size_t NWItems) {
test<KName<Name, false>, false, T, Dim, Mode, BinaryOperation>(
Identity, WGSize, NWItems);

// TODO: property::reduction::initialize_to_identity is not supported yet.
// Thus only read_write mode is tested now.
constexpr access::mode _Mode = (Mode == DW) ? RW : Mode;
test<KName<Name, true>, true, T, Dim, _Mode, BinaryOperation>(
Identity, WGSize, NWItems);
}

template <typename T> int runTests(const string_class &ExtensionName) {
device D = default_selector().select_device();
if (!D.is_host() && !D.has_extension(ExtensionName)) {
Expand All @@ -60,24 +85,16 @@ template <typename T> int runTests(const string_class &ExtensionName) {
}

// Check some less standards WG sizes and corner cases first.
test<class KernelName_oTh, T, 1, access::mode::read_write,
std::multiplies<T>>(0, 4, 4);
test<class KernelName_QUQnMARQT, T, 0, access::mode::discard_write,
ONEAPI::plus<T>>(0, 4, 64);

test<class KernelName_xGixNo, T, 0, access::mode::read_write,
ONEAPI::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
test<class KernelName_qXNFw, T, 1, access::mode::discard_write,
ONEAPI::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);

#if __cplusplus >= 201402L
test<class KernelName_lXdWtzANdDcvm, T, 1, access::mode::read_write,
ONEAPI::plus<>>(1, 3, 3 * 5);
test<class KernelName_FDQalsDxmbi, T, 1, access::mode::discard_write,
ONEAPI::minimum<>>(getMaximumFPValue<T>(), 3, 3);
test<class KernelName_TaNRRxDRXbzYrFImPYC, T, 0, access::mode::discard_write,
ONEAPI::maximum<>>(getMinimumFPValue<T>(), 3, 3);
#endif // __cplusplus >= 201402L
testBoth<class A, T, 1, RW, std::multiplies<T>>(0, 4, 4);
testBoth<class B, T, 0, DW, ONEAPI::plus<T>>(0, 4, 64);

testBoth<class C, T, 0, RW, ONEAPI::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
testBoth<class D, T, 1, access::mode::discard_write, ONEAPI::maximum<T>>(
getMinimumFPValue<T>(), 7, 7 * 5);

testBoth<class E, T, 1, RW, ONEAPI::plus<>>(1, 3, 3 * 5);
testBoth<class F, T, 1, DW, ONEAPI::minimum<>>(getMaximumFPValue<T>(), 3, 3);
testBoth<class G, T, 0, DW, ONEAPI::maximum<>>(getMinimumFPValue<T>(), 3, 3);

std::cout << "Test passed\n";
return 0;
Expand Down
Loading