Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit bcc75c7

Browse files
authored
[SYCL] Add test cases for SYCL-2020 reductions used in read_write mode (#170)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent f7036ed commit bcc75c7

8 files changed

+449
-278
lines changed

SYCL/Reduction/reduction_nd_N_vars.cpp

Lines changed: 101 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@
2727
#include <numeric>
2828
#include <string>
2929

30-
template <typename... Ts> class KernelNameGroup;
31-
3230
using namespace cl::sycl;
3331

32+
template <typename... Ts> class KNameGroup;
33+
template <typename T, bool B> class KName;
34+
35+
constexpr access::mode RW = access::mode::read_write;
36+
constexpr access::mode DW = access::mode::discard_write;
37+
3438
template <typename T>
3539
bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
3640
bool Success;
@@ -46,11 +50,12 @@ bool cherkResultIsExpected(int TestCaseNum, T Expected, T Computed) {
4650
return Success;
4751
}
4852

49-
template <class ReductionExample, typename T1, access::mode Mode1, typename T2,
50-
access::mode Mode2, typename T3, access::mode Mode3, typename T4,
51-
class BinaryOperation1, class BinaryOperation2,
53+
// Returns 0 if the test case passed. Otherwise, some non-zero value.
54+
template <class Name, bool IsSYCL2020Mode, typename T1, access::mode Mode1,
55+
typename T2, access::mode Mode2, typename T3, access::mode Mode3,
56+
typename T4, class BinaryOperation1, class BinaryOperation2,
5257
class BinaryOperation3, class BinaryOperation4>
53-
int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
58+
int testOne(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
5459
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
5560
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
5661
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
@@ -67,16 +72,16 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
6772
auto Dev = Q.get_device();
6873
if (AllocType4 == usm::alloc::shared &&
6974
!Dev.get_info<info::device::usm_shared_allocations>())
70-
return 4;
75+
return 0;
7176
if (AllocType4 == usm::alloc::host &&
7277
!Dev.get_info<info::device::usm_host_allocations>())
73-
return 4;
78+
return 0;
7479
if (AllocType4 == usm::alloc::device &&
7580
!Dev.get_info<info::device::usm_device_allocations>())
76-
return 4;
81+
return 0;
7782
T4 *Out4 = (T4 *)malloc(sizeof(T4), Dev, Q.get_context(), AllocType4);
7883
if (Out4 == nullptr)
79-
return 4;
84+
return 1;
8085

8186
// Initialize the arrays with sentinel values
8287
// and pre-compute the expected result 'CorrectOut'.
@@ -109,48 +114,65 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
109114

110115
if (AllocType4 == usm::alloc::device) {
111116
Q.submit([&](handler &CGH) {
112-
CGH.single_task<
113-
KernelNameGroup<ReductionExample, class KernelNameUSM4>>(
117+
CGH.single_task<KNameGroup<Name, class KernelNameUSM4>>(
114118
[=]() { *Out4 = InitVal4; });
115119
}).wait();
116120
} else {
117121
*Out4 = InitVal4;
118122
}
119123
}
120124

121-
// The main code to be tested.
122-
Q.submit([&](handler &CGH) {
123-
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
124-
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
125-
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
126-
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);
127-
128-
auto Out1 = OutBuf1.template get_access<Mode1>(CGH);
129-
auto Out2 = OutBuf2.template get_access<Mode2>(CGH);
130-
accessor<T3, 0, Mode3, access::target::global_buffer> Out3(OutBuf3, CGH);
131-
132-
auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
133-
auto &Sum4) {
134-
size_t I = NDIt.get_global_id(0);
135-
Sum1.combine(In1[I]);
136-
Sum2.combine(In2[I]);
137-
Sum3.combine(In3[I]);
138-
Sum4.combine(In4[I]);
139-
};
140-
141-
auto Redu1 =
142-
ONEAPI::reduction<T1, BinaryOperation1>(Out1, IdentityVal1, BOp1);
143-
auto Redu2 =
144-
ONEAPI::reduction<T2, BinaryOperation2>(Out2, IdentityVal2, BOp2);
145-
auto Redu3 =
146-
ONEAPI::reduction<T3, BinaryOperation3>(Out3, IdentityVal3, BOp3);
147-
auto Redu4 =
148-
ONEAPI::reduction<T4, BinaryOperation4>(Out4, IdentityVal4, BOp4);
149-
150-
auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}};
151-
CGH.parallel_for<ReductionExample>(NDR, Redu1, Redu2, Redu3, Redu4,
152-
Lambda);
153-
}).wait();
125+
auto NDR = nd_range<1>{range<1>(NWorkItems), range<1>{WGSize}};
126+
if constexpr (IsSYCL2020Mode) {
127+
Q.submit([&](handler &CGH) {
128+
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
129+
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
130+
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
131+
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);
132+
133+
auto Redu1 = sycl::reduction(OutBuf1, CGH, IdentityVal1, BOp1);
134+
auto Redu2 = sycl::reduction(OutBuf2, CGH, IdentityVal2, BOp2);
135+
auto Redu3 = sycl::reduction(OutBuf3, CGH, IdentityVal3, BOp3);
136+
auto Redu4 = sycl::reduction(Out4, IdentityVal4, BOp4);
137+
138+
auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
139+
auto &Sum4) {
140+
size_t I = NDIt.get_global_id(0);
141+
Sum1.combine(In1[I]);
142+
Sum2.combine(In2[I]);
143+
Sum3.combine(In3[I]);
144+
Sum4.combine(In4[I]);
145+
};
146+
CGH.parallel_for<Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
147+
}).wait();
148+
} else {
149+
// Test ONEAPI reductions
150+
Q.submit([&](handler &CGH) {
151+
auto In1 = InBuf1.template get_access<access::mode::read>(CGH);
152+
auto In2 = InBuf2.template get_access<access::mode::read>(CGH);
153+
auto In3 = InBuf3.template get_access<access::mode::read>(CGH);
154+
auto In4 = InBuf4.template get_access<access::mode::read>(CGH);
155+
156+
auto Out1 = OutBuf1.template get_access<Mode1>(CGH);
157+
auto Out2 = OutBuf2.template get_access<Mode2>(CGH);
158+
accessor<T3, 0, Mode3, access::target::global_buffer> Out3(OutBuf3, CGH);
159+
160+
auto Redu1 = ONEAPI::reduction(Out1, IdentityVal1, BOp1);
161+
auto Redu2 = ONEAPI::reduction(Out2, IdentityVal2, BOp2);
162+
auto Redu3 = ONEAPI::reduction(Out3, IdentityVal3, BOp3);
163+
auto Redu4 = ONEAPI::reduction(Out4, IdentityVal4, BOp4);
164+
165+
auto Lambda = [=](nd_item<1> NDIt, auto &Sum1, auto &Sum2, auto &Sum3,
166+
auto &Sum4) {
167+
size_t I = NDIt.get_global_id(0);
168+
Sum1.combine(In1[I]);
169+
Sum2.combine(In2[I]);
170+
Sum3.combine(In3[I]);
171+
Sum4.combine(In4[I]);
172+
};
173+
CGH.parallel_for<Name>(NDR, Redu1, Redu2, Redu3, Redu4, Lambda);
174+
}).wait();
175+
}
154176

155177
// Check the results and free memory.
156178
int Error = 0;
@@ -185,18 +207,43 @@ int runTest(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
185207
return Error;
186208
}
187209

188-
int main() {
210+
// Tests both implementations of reduction:
211+
// sycl::reduction and sycl::ONEAPI::reduction
212+
template <class Name, typename T1, access::mode Mode1, typename T2,
213+
access::mode Mode2, typename T3, access::mode Mode3, typename T4,
214+
class BinaryOperation1, class BinaryOperation2,
215+
class BinaryOperation3, class BinaryOperation4>
216+
int testBoth(T1 IdentityVal1, T1 InitVal1, BinaryOperation1 BOp1,
217+
T2 IdentityVal2, T2 InitVal2, BinaryOperation2 BOp2,
218+
T3 IdentityVal3, T3 InitVal3, BinaryOperation3 BOp3,
219+
T4 IdentityVal4, T3 InitVal4, BinaryOperation4 BOp4,
220+
usm::alloc AllocType4, size_t NWorkItems, size_t WGSize) {
189221
int Error =
190-
runTest<class ReduFloatPlus16x1, float, access::mode::discard_write, int,
191-
access::mode::read_write, short, access::mode::read_write, int>(
192-
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
193-
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16,
194-
16);
222+
testOne<KName<Name, false>, false, T1, Mode1, T2, Mode2, T3, Mode3, T4>(
223+
IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
224+
IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
225+
AllocType4, NWorkItems, WGSize);
226+
227+
// TODO: property::reduction::initialize_to_identity is not supported yet.
228+
// Thus only read_write mode is tested now.
229+
constexpr access::mode _Mode1 = (Mode1 == DW) ? RW : Mode1;
230+
constexpr access::mode _Mode2 = (Mode2 == DW) ? RW : Mode2;
231+
constexpr access::mode _Mode3 = (Mode3 == DW) ? RW : Mode3;
232+
Error +=
233+
testOne<KName<Name, true>, true, T1, _Mode1, T2, _Mode2, T3, _Mode3, T4>(
234+
IdentityVal1, InitVal1, BOp1, IdentityVal2, InitVal2, BOp2,
235+
IdentityVal3, InitVal3, BOp3, IdentityVal4, InitVal4, BOp4,
236+
AllocType4, NWorkItems, WGSize);
237+
return Error;
238+
}
239+
240+
int main() {
241+
int Error = testBoth<class FP32Plus16x16, float, DW, int, RW, short, RW, int>(
242+
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000,
243+
std::bit_or<>{}, 0, 8000, std::bit_xor<>{}, usm::alloc::shared, 16, 16);
195244

196245
auto Add = [](auto x, auto y) { return (x + y); };
197-
Error += runTest<class ReduFloatPlus5x257, float, access::mode::read_write,
198-
int, access::mode::read_write, short,
199-
access::mode::discard_write, int>(
246+
Error += testBoth<class FP32Plus5x257, float, RW, int, RW, short, DW, int>(
200247
0, 1000, std::plus<float>{}, 0, 2000, std::plus<>{}, 0, 4000, Add, 0,
201248
8000, std::bit_xor<int>{}, usm::alloc::device, 5 * (256 + 1), 5);
202249

SYCL/Reduction/reduction_nd_ext_type.hpp

Lines changed: 49 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88

99
using namespace cl::sycl;
1010

11-
template <typename SpecializationKernelName, typename T, int Dim,
11+
template <typename T, bool B> class KName;
12+
constexpr access::mode RW = access::mode::read_write;
13+
constexpr access::mode DW = access::mode::discard_write;
14+
15+
template <typename Name, bool IsSYCL2020Mode, typename T, int Dim,
1216
access::mode Mode, class BinaryOperation>
1317
void test(T Identity, size_t WGSize, size_t NWItems) {
1418
buffer<T, 1> InBuf(NWItems);
@@ -24,19 +28,27 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
2428

2529
// Compute.
2630
queue Q;
27-
Q.submit([&](handler &CGH) {
28-
auto In = InBuf.template get_access<access::mode::read>(CGH);
29-
accessor<T, Dim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
30-
auto Redu = ONEAPI::reduction(Out, Identity, BOp);
31-
32-
range<1> GlobalRange(NWItems);
33-
range<1> LocalRange(WGSize);
34-
nd_range<1> NDRange(GlobalRange, LocalRange);
35-
CGH.parallel_for<SpecializationKernelName>(
36-
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
37-
Sum.combine(In[NDIt.get_global_linear_id()]);
38-
});
39-
});
31+
nd_range<1> NDRange(range<1>{NWItems}, range<1>{WGSize});
32+
if constexpr (IsSYCL2020Mode) {
33+
Q.submit([&](handler &CGH) {
34+
auto In = InBuf.template get_access<access::mode::read>(CGH);
35+
auto Redu = sycl::reduction(OutBuf, CGH, Identity, BOp);
36+
37+
CGH.parallel_for<Name>(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
38+
Sum.combine(In[NDIt.get_global_linear_id()]);
39+
});
40+
});
41+
} else {
42+
Q.submit([&](handler &CGH) {
43+
auto In = InBuf.template get_access<access::mode::read>(CGH);
44+
accessor<T, Dim, Mode, access::target::global_buffer> Out(OutBuf, CGH);
45+
auto Redu = ONEAPI::reduction(Out, Identity, BOp);
46+
47+
CGH.parallel_for<Name>(NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
48+
Sum.combine(In[NDIt.get_global_linear_id()]);
49+
});
50+
});
51+
}
4052

4153
// Check correctness.
4254
auto Out = OutBuf.template get_access<access::mode::read>();
@@ -52,6 +64,19 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
5264
}
5365
}
5466

67+
template <typename Name, typename T, int Dim, access::mode Mode,
68+
class BinaryOperation>
69+
void testBoth(T Identity, size_t WGSize, size_t NWItems) {
70+
test<KName<Name, false>, false, T, Dim, Mode, BinaryOperation>(
71+
Identity, WGSize, NWItems);
72+
73+
// TODO: property::reduction::initialize_to_identity is not supported yet.
74+
// Thus only read_write mode is tested now.
75+
constexpr access::mode _Mode = (Mode == DW) ? RW : Mode;
76+
test<KName<Name, true>, true, T, Dim, _Mode, BinaryOperation>(
77+
Identity, WGSize, NWItems);
78+
}
79+
5580
template <typename T> int runTests(const string_class &ExtensionName) {
5681
device D = default_selector().select_device();
5782
if (!D.is_host() && !D.has_extension(ExtensionName)) {
@@ -60,24 +85,16 @@ template <typename T> int runTests(const string_class &ExtensionName) {
6085
}
6186

6287
// Check some less standards WG sizes and corner cases first.
63-
test<class KernelName_oTh, T, 1, access::mode::read_write,
64-
std::multiplies<T>>(0, 4, 4);
65-
test<class KernelName_QUQnMARQT, T, 0, access::mode::discard_write,
66-
ONEAPI::plus<T>>(0, 4, 64);
67-
68-
test<class KernelName_xGixNo, T, 0, access::mode::read_write,
69-
ONEAPI::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
70-
test<class KernelName_qXNFw, T, 1, access::mode::discard_write,
71-
ONEAPI::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
72-
73-
#if __cplusplus >= 201402L
74-
test<class KernelName_lXdWtzANdDcvm, T, 1, access::mode::read_write,
75-
ONEAPI::plus<>>(1, 3, 3 * 5);
76-
test<class KernelName_FDQalsDxmbi, T, 1, access::mode::discard_write,
77-
ONEAPI::minimum<>>(getMaximumFPValue<T>(), 3, 3);
78-
test<class KernelName_TaNRRxDRXbzYrFImPYC, T, 0, access::mode::discard_write,
79-
ONEAPI::maximum<>>(getMinimumFPValue<T>(), 3, 3);
80-
#endif // __cplusplus >= 201402L
88+
testBoth<class A, T, 1, RW, std::multiplies<T>>(0, 4, 4);
89+
testBoth<class B, T, 0, DW, ONEAPI::plus<T>>(0, 4, 64);
90+
91+
testBoth<class C, T, 0, RW, ONEAPI::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
92+
testBoth<class D, T, 1, access::mode::discard_write, ONEAPI::maximum<T>>(
93+
getMinimumFPValue<T>(), 7, 7 * 5);
94+
95+
testBoth<class E, T, 1, RW, ONEAPI::plus<>>(1, 3, 3 * 5);
96+
testBoth<class F, T, 1, DW, ONEAPI::minimum<>>(getMaximumFPValue<T>(), 3, 3);
97+
testBoth<class G, T, 0, DW, ONEAPI::maximum<>>(getMinimumFPValue<T>(), 3, 3);
8198

8299
std::cout << "Test passed\n";
83100
return 0;

0 commit comments

Comments
 (0)